Monday, December 5, 2016

many-to-many SQL join using DATA step

As we know, Data step and SQL will get different results as to many-to-many join. SQL join is powerful but may be time-consuming when processing huge data, e.g. billion records. Here is one simple tip to implement SQL join using DATA step. If we take advantage of sorted datasets, it will get very good performance benefit.
data a;
do i=1,1,2,2,3,3;
    a=1;
    output;
end;
run;

data b;
    do i=1,1,1,2,2,2;
        b = 0;
        output;
    end;
run;

/* SQL step */
proc sql;
    create table c_sql as
    select a.i, a.a, b.b
    from a, b
    where a.i = b.i;
quit;

/* Data step - which can match SQL result */
data c_data;
    set a;

    drop i_b;

    do ai=1 to n;
        set b(rename=(i=i_b)) nobs=n point=ai;
        if i=i_b then do; 
            output;
        end;
    end;
run;