/********************************************************************************************************************* Author: Harry Droogendyk harry@stratia.ca Date: 2008-11-28 SAS macro to provide analysis of table contents, SAS data or RDBMS. See the macro document for more information. To display macro documentation, invoke the macro with a single positional parameter: %qc_db_data(?) ---------------------------------------------------------------------------------------------------------------------- Modifications: ---------------------------------------------------------------------------------------------------------------------- Date Who What ---------------------------------------------------------------------------------------------------------------------- 2010-04-21 Droogendyk Ensure &max_length has a value when no column formats found 2013-06-05 Droogendyk Merge in dataset labels if available /*********************************************************************************************************************/ %macro qc_db_data(help, lib =, table =, drop_columns =, keep_columns =, by_vars =, where =, freq_limit = 100 ); %if %length(&help) > 0 %then %do; %put %nrstr(); %put %nrstr(==================================================================================================); %put %nrstr(%qc_db_data( help, lib=, table=, drop_columns=, keep_columns=, by_vars=, where=, freq_limit = 100 ) ); %put %nrstr(); %put %nrstr(QC / analyze the SAS or RDBMS table specified, creating frequency distributions or min, max, mean, ); %put %nrstr(stddev and sum depending on the column type and granularity of the data in the table. ); %put %nrstr(); %put %nrstr(Parms:); %put %nrstr( help any value in the sole positional parameter provides this help text ); %put %nrstr( lib SAS libref that contains &table ( including RDBMS tables if approp engine is used for schema ) ); %put %nrstr( table table to be analyzed, MUST be sorted by &by_vars ( if specified ) ); %put %nrstr( drop_columns comma-delimited, single-quoted column names to be IGNORED in analysis,); %put %nrstr( - must use %str('col1','col2') when specifying multiple column names); %put %nrstr( - always specify 'acct_id', 'cust_id' type fields in this parm); %put %nrstr( keep_columns comma-delimited, single-quoted column names to be considered for analysis,); %put %nrstr( - must use %str('col1','col2') when specifying multiple column names); %put %nrstr( by_vars comma-delimited, single-quoted column names for BY groups); %put %nrstr( - must use %str('col1','col2') when specifying multiple column names); %put %nrstr( where WHERE clause to apply to input &schema..&table to focus analysis); %put %nrstr( freq_limit upper limit of number of distinct values used to decide which vars generate ); %put %nrstr( frequence distributions, default is 100 distinct values); %put %nrstr( - all columns with <= &freq_limit distinct values will generate freq dist); %put %nrstr( - num columns with > &freq_limit distinct values will generate num analysis); %put %nrstr(); %put %nrstr(Macro logic outlined below:); %put %nrstr(); %put %nrstr( 1. Derive table columns using PROC CONTENTS data=&lib..&table, incorporate &drop_column ); %put %nrstr( and &keep_column criteria); %put %nrstr( 2. count distinct values for all selected fields); %put %nrstr( 3. numeric fields where count of distinct values > &freq_limit, create min/max/stddev/sum stats); %put %nrstr( 4. run frequency distribution on any fields that have <= &freq_limit distinct values); %put %nrstr( 5. if &by_vars are specified, all stats will be created with the BY groups specified); %put %nrstr( 6. create datasets of final results in remwork._qc_continuous_data and ); %put %nrstr( remwork._qc_categorical_data); %put %nrstr(); %put %nrstr(Sample Invocation:); %put %nrstr(); %put %nrstr(libname libref '/some/path/'); %put %nrstr(libname libref ;); %put %nrstr(); %put %nrstr(%qc_db_data%(lib = libref,); %put %nrstr( table = qc_test,); %put %nrstr( drop_columns = %str('acct_id'),); %put %nrstr( by_vars = %str('acct_type_na'),); %put %nrstr( where = %str(acct_type_na like 'SAV%'),); %put %nrstr( freq_limit = 50); %put %nrstr( )); %put %nrstr(==================================================================================================); %put %nrstr(); %*put if by_vars specified, must be sorted by BY vars, character BY vars must use hex(char_var) ; %return; %end; %local by_vars_stmt sample ; /* Clean up results datasets before we begin */ proc datasets lib = work nodetails nolist; delete _qc: / mtype = data; run; quit; /* If BY vars have been specified, clean up the quotes so we can use the variable names in a BY statement */ %if &by_vars ne %then %do; %if &drop_columns ne %then %let drop_columns = &drop_columns, &by_vars; %else %let drop_columns = &by_vars; %let by_vars_stmt = %sysfunc(compress(&by_vars,%str(%'))); /* 'For use in BY statements */ %end; /* Identify character / numeric fields in the table. If &keep_columns / &drop_columns have been specified, we'll use that to define the columns we care deeply about. Creating a SAS table of the RDBMS table columns we're interested in. We're using the format as a proxy for length since SAS will return 8 for all numeric fields, we want the actual format. */ /* harry - should we compress out _ rather than truncating ? could lessen name collisions */ proc contents data = &lib..&table out = _qc_db_columns_all ( keep = name type formatl label rename = ( name = colname ) ) noprint; run; data _qc_db_columns; set _qc_db_columns_all; %if &drop_columns > %then %do; if colname not in ( %upcase(&drop_columns) ); %end; %if &keep_columns > %then %do; if colname in ( %upcase(&keep_columns) ); %end; if type = 1 then coltype = 'N'; else coltype = 'C'; drop type; run; proc sort data = _qc_db_columns; by colname; run; /* Need the maximum variable length for a later step */ proc sql; select max(case when formatl > 0 then formatl else 32 end) into :max_length from _qc_db_columns ; %let max_length = &max_length; /* Create the count(distinct x) as x phrases. The results of these will determine whether we do freq dist on the variables */ select 'count (distinct(' || trim(colname) || ')) as ' || trim(colname) into :_qc_count_distinct separated by ',' from _qc_db_columns ; /* Count distinct values of each variable, these counts used to decide if min/max/etc.. or freqs to be done */ create table _qc_count_distinct as select &_qc_count_distinct from &lib..&table %if &where ne %then %do; where &where %end; ; quit; proc transpose data = _qc_count_distinct out = _qc_count_distinct_xpose ( rename = ( _name_ = colname col1 = cnt )) ; var _numeric_; run; /* If the count distinct has found < &freq_limit distinct values, treat the variable as a categorical variable, even if it is numeric */ %let numeric_fld_cnt = 0; %let char_fld_cnt = 0; proc sql; /* Numeric columns will be run through proc summary */ select d.colname into :numeric_cols separated by ' ' from _qc_db_columns d, _qc_count_distinct_xpose c where d.colname = c.colname and d.coltype = 'N' and c.cnt > &freq_limit ; %let numeric_fld_cnt = &sqlobs; /* Any column with < &freq_limit distinct values is freqqed. This means that some character columns will have no analysis performed on them, eg. name fields. Harry - we do need distinct values in the final dataset some how !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! */ select d.colname, d.colname into :char_col1 - :char_col&sysmaxlong , :char_cols separated by ' ' from _qc_db_columns d, _qc_count_distinct_xpose c where d.colname = c.colname and c.cnt <= &freq_limit ; %let char_fld_cnt = &sqlobs; quit; %if &numeric_fld_cnt = 0 and &char_fld_cnt = 0 %then %do; %put; %put No numeric or character fields on the table, that IS a mystery!! Aborting; %put; %return; %end; /* Generate numeric analysis. If the BY vvars are in play, the value of COUNT will be equal to the number of rows for each particular BY slice. */ %if &numeric_fld_cnt ne 0 %then %do; proc summary data = &lib..&table ( keep = &numeric_cols &by_vars_stmt ) nway missing ; %if &where ne %then %do; where &where; %end; var &numeric_cols; %if &by_vars_stmt ne %then %do; by &by_vars_stmt notsorted; * DB2 does return rows in correct order for mixed-case character columns; %end; output out = _qc_metrics_num_n ( drop = _: ) n= ; output out = _qc_metrics_num_min ( drop = _: ) min= ; output out = _qc_metrics_num_max ( drop = _: ) max= ; output out = _qc_metrics_num_mean ( drop = _: ) mean= ; output out = _qc_metrics_num_stddev ( drop = _: ) stddev=; output out = _qc_metrics_num_sum ( drop = _: ) sum= ; run; /* Keeping _type_ around until now since I was unsure whether we'd want NWAY or individual summary points for each BY variable */ proc transpose data = _qc_metrics_num_n out = _qc_metrics_num_n_xpose ( drop = _label_ rename = ( _name_ = colname col1 = n )) ; var _numeric_; %if &by_vars_stmt ne %then %do; by &by_vars_stmt notsorted; %end; run; proc transpose data = _qc_metrics_num_min out = _qc_metrics_num_min_xpose ( drop = _label_ rename = ( _name_ = colname col1 = min )) ; var _numeric_; %if &by_vars_stmt ne %then %do; by &by_vars_stmt notsorted; %end; run; proc transpose data = _qc_metrics_num_max out = _qc_metrics_num_max_xpose ( drop = _label_ rename = ( _name_ = colname col1 = max )) ; var _numeric_; %if &by_vars_stmt ne %then %do; by &by_vars_stmt notsorted; %end; run; proc transpose data = _qc_metrics_num_mean out = _qc_metrics_num_mean_xpose ( drop = _label_ rename = ( _name_ = colname col1 = mean )) ; var _numeric_; %if &by_vars_stmt ne %then %do; by &by_vars_stmt notsorted; %end; run; proc transpose data = _qc_metrics_num_stddev out = _qc_metrics_num_stddev_xpose ( drop = _label_ rename = ( _name_ = colname col1 = stddev )) ; var _numeric_; %if &by_vars_stmt ne %then %do; by &by_vars_stmt notsorted; %end; run; proc transpose data = _qc_metrics_num_sum out = _qc_metrics_num_sum_xpose ( drop = _label_ rename = ( _name_ = colname col1 = sum )) ; var _numeric_; %if &by_vars_stmt ne %then %do; by &by_vars_stmt notsorted; %end; run; proc sort data = _qc_metrics_num_n_xpose; by &by_vars_stmt colname; run; proc sort data = _qc_metrics_num_min_xpose; by &by_vars_stmt colname; run; proc sort data = _qc_metrics_num_max_xpose; by &by_vars_stmt colname; run; proc sort data = _qc_metrics_num_mean_xpose; by &by_vars_stmt colname; run; proc sort data = _qc_metrics_num_stddev_xpose; by &by_vars_stmt colname; run; proc sort data = _qc_metrics_num_sum_xpose; by &by_vars_stmt colname; run; data _qc_continuous_data_i; merge _qc_metrics_num_n_xpose _qc_metrics_num_min_xpose _qc_metrics_num_max_xpose _qc_metrics_num_mean_xpose _qc_metrics_num_stddev_xpose _qc_metrics_num_sum_xpose; by %if &by_vars_stmt ne %then %do; &by_vars_stmt %end; colname ; format min max mean stddev sum comma24.2 n comma15. ; label colname = 'Column Name' ; run; %end; /* Loop over char fields ( or numeric vars with < &freq_limit granularity ) running a FREQ on each */ %if &char_fld_cnt ne 0 %then %do; proc freq data = &lib..&table ( keep = &char_cols &by_vars_stmt ); %if &where ne %then %do; where &where; %end; %if &by_vars_stmt ne %then %do; by &by_vars_stmt notsorted; * DB2 does not return rows in correct order ; %end; %do i = 1 %to &char_fld_cnt; tables &&char_col&i / missing out = _qc_freq_&i ( rename = ( &&char_col&i = value )) ; %end; run; %if &max_length < 32 %then %let best = best&max_length; %else %let best = best32; %do i = 1 %to &char_fld_cnt; data _qc / view = _qc; length colname $32 value $&max_length ; retain colname "&&char_col&i"; set _qc_freq_&i ( rename = ( value = _val )); /* We can't mix numeric / character fields, so convert all numeric to character. We have to use PUTC/PUTN to "hide" the format from the compiler otherwise it kaks when it sees what it thinks is a numeric format for character fields, ie. at compile time it doesn't consider the conditional stmt that would prevent such a thing from happening. */ if vtype (_val) = 'N' then do; *put "&&char_col&i - in num"; _fmt = "&best"; * if max_length is less than 16, we take the chance of losing digits !!!! ; if missing(_val) then value = 'null'; else value = putn(_val,_fmt); end; else do; *put "&&char_col&i - in char"; _fmt = "$&max_length"; value = putc(_val,_fmt); end; drop _: ; run; proc append base = _qc_categorical_data_i data = _qc force; run; %end; %end; /* 2013-06-05 - Merge in dataset labels if available */ proc sql; %if %sysfunc(exist(_qc_categorical_data_i)) %then %do; create table _qc_categorical_data as select c.label length=256, d.* from _qc_categorical_data_i d left join _qc_db_columns c on d.colname = c.colname ; %end; %if %sysfunc(exist(_qc_continuous_data_i)) %then %do; create table _qc_continuous_data as select c.label as label length=256, d.* from _qc_continuous_data_i d left join _qc_db_columns c on d.colname = c.colname ; %end; quit; %let source = %sysfunc(getoption(source)); options nosource; %put; %put --------------------------------------------------------------------------------------------; %put %nrstr(%qc_db_data has completed, please check the log for any errors.) ; %put %nrstr(Successful completition will result in the creation of two datasets:) ; %put %nrstr( _qc_categorical_data - categorical variable value distributions) ; %put %nrstr( _qc_continuous_data - continuous data analysis, eg. min, max etc...) ; %put; %put %nrstr(Data from these two tables can be viewed from the SAS Explorer, exported to Excel) ; %put %nrstr(or printed ( perhaps with appropriate ODS wrapper statements ) to create output); %put --------------------------------------------------------------------------------------------; %put ; options &source; %mend qc_db_data; /* Sample invocations %*qc_db2_data(schema = droogh2, table = cap_customer_20081209, keep_columns = %str( 'contry_region_cd','paygo_relationship','behavr_sc', 'chq_in','chq_slct_serv_in','chq_infinity_in','chq_val_in','chq_val_plus_in','chq_plan60_in','primary_chq', 'sav_in','sav_gia_in','sav_youth_in','sav_tiered_in','sav_comp_in','primary_sav', 'usd_in','borderless_in','cust_rec_keep_opt' ), by_vars = %str('efectv_dt'), where = %str(), freq_limit = 100 ); libname db2 db2 database=prd1 schema=droogh2; %qc_db_data(lib = db2, table = qc_test, drop_columns = %str('acct_id'), by_vars = %str('acct_type_na'), where = %str(acct_type_na like '%Visa%'), freq_limit = 100 ); */