Appendix B: SAS Code Examples for Statistical Foundations

Quick Navigation:

1. Data Import and Summary Statistics

Most SAS procedures require your dataset to be sorted by group for grouped analysis or plotting. You can either import data from a file or create a dataset manually for small examples.

1A. Import or Create a Dataset

Code

/* OPTION 1: Import a CSV from your SAS Studio home directory */

%web_drop_table(WORK.mydata);

filename reffile '/home/your-username/path-to-your-data.csv';

proc import datafile=reffile
    dbms=csv
    out=WORK.mydata
    replace;
    getnames=yes;
run;

/* View structure and preview the dataset */
proc contents data=WORK.mydata;
run;

%web_open_table(WORK.mydata);
    
    
/* OPTION 2: Manually create a small dataset column-wise (each row = one observation) */

data mydata;
    input variable1 variable2;
    datalines;
    1 A
    2 B
    3 C
    4 D
    5 E
    ;
run;

proc print data = mydata;
run;


/* OPTION 3: Create a row-wise dataset (data entered horizontally using @@) */
data mydata;
    input variable @@;
    datalines;
    1 2 3 4 5 6
    ;
run;
    
proc print data = mydata;
run;
    

1B. Summary View and Sorting

Code

/* Preview the data */
proc print data = dataset;
run;

/* Sort data by explanatory variable (required for some procedures) */
proc sort data = dataset;
    by explanatory;
run;

1C. Summary Statistics

Code

/* Summary stats: n, mean, sd, min, max */
proc means data = dataset;
    var response;
run;

/* Medians by group */
proc means data = dataset median;
    class explanatory;
    var response;
run;

/* Manual p-value from t-statistic (two-sided) */
data mypval;
    pv = 2 * (1 - cdf("t", tstat, df));
run;
proc print data = mypval;
run;

1D. Log Transformation and Filtering

Code

/* Create a log-transformed version of response */
data ldataset;
    set dataset;
    logResponse = log(response);
run;

/* Remove outlier (e.g., filter response < 1200) */
data datasetNoOutlier;
    set dataset;
    where response < 1200;
run;

/* Scatter plot without outlier */
proc sgplot data = datasetNoOutlier;
    scatter x = explanatory y = response;
run;

1E. Categorical Recoding and Viewing Subsets

Code

/* Create ordinal version of categorical variable */

data educMultiGrp;
    set educMultiGrp;
    if Educ = '<12' then EO = 1;
    else if Educ = '12' then EO = 2;
    else if Educ = '13-15' then EO = 3;
    else if Educ = '16' then EO = 4;
    else if Educ = '>16' then EO = 5;
run;

/* Create a column to collapse levels */
data educMultiGrp;
    set educMultiGrp;
    if Educ = "<12" then Combine = "<12";
    else if Educ = "12" then Combine = "12";
    else if Educ = "13-15" then Combine = "13-15";
    else if Educ in ("16", ">16") then Combine = "16+";
run;

/* View first 50 rows of a dataset */
proc print data = lIncome5Grps(obs=50);
run;

2. Visualization and Assumption Checks

Use proc univariate, proc sgplot, and proc ttest to visualize distributions and assess assumptions of normality and variance.
Many visualizations require sorting your dataset by the grouping variable.

2A. Univariate Plots (No Grouping)

Code

/* Histogram and QQ plot (ungrouped) */
proc univariate data = dataset;
    var response;
    histogram response;
    qqplot response;
run;

/* Boxplot (ungrouped) */
proc sgplot data = dataset;
    vbox response;
run;

/* Scatterplot of response vs explanatory */
proc sgplot data = dataset;
    scatter x = explanatory y = response;
    * title "Scatterplot of ";
run;

/* Turn off the title */
title;
run;

2B. Histograms and QQ Plots by Group

Code

/* Sort by grouping variable (required for BY statement) */
proc sort data = dataset;
    by explanatory;
run;
proc print data = dataset;
run;

/* Histograms grouped by explanatory */
proc univariate data = dataset;
    by explanatory;
    histogram response;
run;

/* QQ plots grouped by explanatory */
proc univariate data = dataset;
    * title "QQ Plot by ";
    by explanatory;
    qqplot response / normal(mu=est sigma=est);
run;


proc sgplot data = dataset;
    * title "Histogram by ";
    by explanatory;
    histogram response / transparency=0.5;
    density response; * with a normal distribution curve;
run;

2C. Boxplots and Density Plots by Group

Code

/* Boxplot by group */
proc sgplot data=dataset;
    * title "Boxplot of";
    vbox response / category=explanatory;
run;

/* Histogram with overlaid normal curve by group */
proc sgplot data = dataset;
    by explanatory;
    histogram response / transparency = 0.5;
    density response;  /* Normal curve */
run;

2D. Visualizations from proc ttest

Code

/* `proc ttest` also generates histogram, QQ plot, and boxplot (grouped) */
proc ttest data = dataset;
    class explanatory;
    var response;
run;

2E. Custom Histogram with Set Bin Width

Code

/* Adjust histogram range and bin width */
proc univariate data = dataset noprint;
    by explanatory;
    histogram response / endpoints = 0 to 1000 by 50 normal; /* Adjust as needed */
    inset n mean std / pos = ne;
run;

3. Permutation Test

3A. Observed Difference in Means

Code

/* Use proc ttest to compute observed difference between groups */
proc ttest data = dataset;
    class group;
    var response;
run;

3B. Generate Permutations with proc iml

Code

/* Shuffle response values across group labels using IML */

/* Borrowed code from internet: randomizes observations and creates a matrix with one row per permutation */

/* NOTE: Replace `dataset`, `group` and `response` with your actual variable names */

proc iml;
    use dataset;
    read all var {group response} into x;  /* Make sure to specify class variable first */
    
      nPerms = 10000;                 /* Number of permutations */
    permuted = t(ranperm(x[,2], nPerms));  /* Create permuted response matrix */
    combined = x[,1] || permuted;   /* Combine original group column with each permuted column */

    create newds from combined;
    append from combined;
quit;

3C. Build Permutation Distribution

Code

/* Use proc ttest to calculate mean differences for each permutation */

ods output off;
ods exclude all;   /* Suppress output */
  
/* The ods output line below is optional if you want to capture confidence intervals */
/* ods output conflimits=diff; */

proc ttest data = newds plots = none;
    class col1;
    var col2 - col10001;
run;

ods output on;
ods exclude none;

3D. Plot Permutation Histogram and Calculate p-value

Code
                                                                    
/* Plot the distribution of permuted mean differences (Pooled method only) */
proc univariate data = diff;
    where method = "Pooled";
    var mean;
    histogram mean;
run;

/* Count number of permuted differences as or more extreme than observed */
/* Replace `obsDiff` with your observed test statistic from 3A */
/* Adjust for one-tailed or two-tailed test accordingly */
data numdiffs;
    set diff;
    where method = "Pooled";
    if abs(mean) >= abs(obsDiff);
run;

/* Print matching rows (just for visual check) */
proc print data = numdiffs;
    where method = "Pooled";
run;    

/* Manual Step:
   - Check the number of rows in `numdiffs` (from the log or output)
   - Divide that by the total number of permutations (e.g., 10000)
   - This gives your p-value */

4. t-Tests and Power Analysis

This section includes basic t-test syntax, F-tests for equal variances, and power analysis using proc power.

4A. Basic t-Tests and Critical Values

Code

/* Compute critical t-value for two-sided test at α = 0.05 */
data criticalvalue;
    critval = quantile("T", 0.975, df);
run;
proc print data = criticalvalue;
run;

/* One-sample t-test (H0: mean = muUnderNull) */
/* * proc ttest outputs histogram, boxplot and qqplot */
proc ttest data = dataset h0 = muUnderNull sides = 2 alpha = 0.05;
    var response;
run;

/* Paired t-test (H0: mean difference = 0, computed as Group2 - Group1) */
proc ttest data = dataset;
    paired explanatory2*explanatory1;
run;

/* Two-sample t-test (equal variance and Welch's handled together) */
/* Pooled uses pooled st.dev. and Satterthwaite uses Welch's */
proc ttest data = dataset h0 = 0 sides = 2 alpha = 0.05;
    class explanatory;
    var response;
run;

4B. Equality of Variances Tests

Code

/* F-test for equality of variances is run automatically as part of proc ttest */
/* The F-test (default in proc ttest) assumes normal distributions. It's useful as secondary evidence for variance equality if visuals are inconclusive. Null: population variances are equal. */

/* Brown-Forsythe test does not require normality assumptions */
proc glm data = dataset;
    class explanatory;
    model response = explanatory;
    means explanatory / hovtest = bf;  /* Brown-Forsythe test */
run;  

4C. Power Analysis (One- and Two-Sample)

Code

/* Power for one-sample t-test */
proc power;
    onesamplemeans
        sides = 1
        alpha = 0.05
        nullmean = 0
        mean = xbar
        stddev = s
        ntotal = n
        power = .;
run;

/* Power for two-sample t-test */
proc power;
    twosamplemeans
        sides = 1
        alpha = 0.05
        meandiff = effectSize
        stddev = s
        npergroup = n1
        power = .; /* specify a dot or omit this line to solve for the unknown parameter */

run;

/* Example: specify total sample size and mean for each using `nullmean and meandiff` */
proc power;
    twosamplemeans
        sides = 1
        alpha = 0.05
        nulldiff = 0
        meandiff = 3
        stddev = 4.5
        ntotal = 47;  /* e.g. 24 in group 1, 23 in group 2 */
run;

4D. Power Curve Plots

Code

/* Power vs. sample size (one-sample) */
ods graphics on;

proc power;
    onesamplemeans
        sides = 1
        alpha = 0.05
        nullmean = 0
        ntotal = 60 80
        mean = 0.07
        stddev = 0.2
        power = .;
    plot x = n min = 60 max = 80;
run;

ods graphics off;

/* Power vs. effect size (two-sample) */
ods graphics on;

proc power;
    twosamplemeans
        sides = 1
        alpha = 0.05
        nulldiff = 0
        meandiff = 3 to 5 by 0.1
        ntotal = 47
        stddev = 4.5
        power = .;
    plot x = effect min = 3 max = 5;
run;

ods graphics off;
        

5. ANOVA and Extra Sum of Squares

This section includes one-way ANOVA, multiple comparisons, linear contrasts, and extra sum of squares tests for nested or grouped comparisons.

5A. One-Way ANOVA with Variance Checks

Code

/* One-way ANOVA: for EMM and SMM (full), tests if any group means differ (i.e. any pair has difference of means) */
/* Brown-Forsythe test is included to check equal variance assumption (does not require normality) */
proc glm data = dataset;
    class explanatory;
    model response = explanatory;
    means explanatory / hovtest = bf;  /* Brown-Forsythe test (homogeneity of variance) */
run;

/* Manually compute F critical value */
data critical_value;
    set calculations;
    critical_value = finv(1 - alpha, dfn, dfd);  /* right-tailed (1-alpha) */
run;

5B. Multiple Comparisons (Tukey + Pairwise Tests)

Code

/* Example: pairwise t-tests with Tukey adjustment and confidence intervals */
proc glm data = playersHeight;
    class Sport;
    model Height = Sport;
    means Sport / hovtest = bf;
    lsmeans Sport / pdiff adjust = tukey cl;
run;

/* Optionally show differences both directions (A–B and B–A) */
proc glm data = playersHeight;
    class Sport;
    model Height = Sport;
    means Sport / hovtest = bf tukey cldiff;
run;

/* The cl option provides CIs for the means, 
 while cldiff provides CIs for the differences between means. */

5C. Linear Contrasts

Code

/* Test linear contrasts using CONTRAST and ESTIMATE statements.
   CONTRAST is used to test hypotheses (produces sum of squares and F-statistic).
   ESTIMATE provides a point estimate and standard error (used for CIs).
   These allow comparisons of one group mean to a linear combination of others. */

/* Note: SAS orders categorical levels alphabetically by default.
   To retain original data order (e.g., to match planned contrast coefficients),
   use 'order = data' in PROC GLM:
   proc glm data = Handicap order = data; */  
  
proc glm data = dataset;
    class explanatory;
    model response = explanatory;
    means explanatory;
   
    /* Grp1 vs sum or average of others (assuming 5 groups) */
    contrast 'Grp1 vs Sum of Other 4 Groups' explanatory 4 -1 -1 -1 -1;
    estimate 'Grp1 vs Sum of Other 4 Groups' explanatory 4 -1 -1 -1 -1;
    estimate 'Grp1 vs Avg of Other 4 Groups' explanatory 4 -1 -1 -1 -1 / divisor = 4;

    /* Grp1 vs Grp2 */
    contrast 'Grp1 vs Grp2' explanatory 1 -1 0 0 0;
    estimate 'Estimate Grp1 vs Grp2' explanatory 1 -1 0 0 0;
run;  
   

/* Manually compute the critical t-value for a 95% confidence interval.
   This is used in: point estimate ± (multiplier × standard error)
*/
data quantile;
    quant = quantile("t", 0.975, df);  * where df = n − number of groups;
run;
proc print data = quantile;
run;   

5D. Bonferroni Adjustment for Selected Comparisons

Code

/* Simultaneous CIs for selected differences (e.g. μ2–μ3, μ2–μ5, μ3–μ5) */
/* k=3 (comparing 3 pairs) */

proc glm data = dataset order = data;
    class explanatory;
    model response = explanatory;
    means explanatory;
    lsmeans explanatory / pdiff cl;

    /* ESTIMATE used to extract estimate and SEs for CI construction */
    estimate 'mu2 - mu3' explanatory 0 1 -1 0 0;
    estimate 'mu2 - mu5' explanatory 0 1 0 0 -1;
    estimate 'mu3 - mu5' explanatory 0 0 1 0 -1;
run;


/* to get 95% CI for the difference in averages of above
point estimate +- multiplier * standard error */
/* Bonferroni multiplier: 1 - α/(2k) for two-tailed test with k comparisons; df=n-totalgrps */
data quantile;
    quant = quantile("t", 1 - 0.05/6, df);  /* adjust denominator for # comparisons and tails */
run;
proc print data = quantile;
run;

5E. Summary: Choosing a Multiple Comparison Adjustment

  • When choosing a multiple comparison correction:
    • Were the comparisons planned before looking at the data or unplanned? If planned, no need to make multiple comparison correction. Can use LSD.
    • Are the groups being compared to a single control or reference group? If so, use Dunnett’s.
    • Are the groups normally distributed with equal standard deviation? If so, use Tukey-Kramer.
    • If we are doing some or all unplanned pairwise comparisons, without a reference/control group and without the assumption of normality or equal variance, use Bonferroni.
Code

/* Commands for various multiple comparison methods */

* LSD (Least Significant Difference, use when comparisons are planned) with CI half-width;
proc glm data = dataset order = data;
    class explanatory;
    model response = explanatory;
    means explanatory / lsd;  * means returns the half-width;
    lsmeans explanatory / pdiff cl;
run;

* Dunnett (compare each group to a control/reference group);
proc glm data = dataset order = data;
    class explanatory;
    model response = explanatory;
    means explanatory / dunnett('Control');
    lsmeans explanatory / pdiff = control('Control') adjust = dunnett cl;
run;  
/* Calculate the half-widths: HalfWidth = (UpperCL - LowerCL) / 2 */

* Tukey-Kramer (assumes normality + equal variance; unplanned all-pairwise) with CI half-width;
proc glm data = dataset order = data;
    class explanatory;
    model response = explanatory;
    means explanatory / tukey;
    lsmeans explanatory / pdiff adjust = tukey cl;
run;

* Bonferroni (safest if assumptions are not met or comparisons are partially unplanned) with CI half-width;
proc glm data = dataset order = data;
    class explanatory;
    model response = explanatory;
    means explanatory / bon;
    lsmeans explanatory / pdiff adjust = bon cl;
run;

* View all options together (Jaren’s code);
proc glm data = dataset;
    class explanatory;
    model response = explanatory;
    means explanatory / lsd tukey dunnett bon scheffe;
run;

5F. Extra Sum of Squares F-Test

Code

/* Conduct an anova (explanatory=group) */
proc glm data = data;
class explanatory;
model response = explanatory;
run;

/* Extra-sum-of-squares */
/* Compare full and reduced models using BYOA-style calculations */
data calculations;
    alpha = 0.05;  
    dftotal = 2580;
    dfd = 2579;
    dfn = dftotal - dfd;
    ssred = 2234.10;
    ssfull = 2232.12;
    ssmodel = ssred - ssfull;
    mse = ssfull / dfd;
    msmodel = ssmodel / dfn;
    fstat = msmodel / mse;
run;

/* Compute critical F and p-value for the above hypothesis test. */
data critical_value;
    set calculations;
    critical_value = finv(1 - alpha, dfn, dfd); *1-alpha for right-tail;
run;

data f_test;
    set critical_value;
    p_value = 1 - probf(fstat, dfn, dfd); *1-probf for right-tail;
run;
proc print data = f_test;
run;

/* another way to get p-value */
data quantile;
    myquant = 1-CDF('F', fstat, dfn, dfd)
run;
proc print data = quantile;
run;

/* You can check against a contrast if relevant */
proc glm data = dataset;
    class explanatory;
    model response = explanatory;
    means explanatory;
    contrast 'Contrast Grp1 vs. Grp2' explanatory 0 0 0 1 -1 0;
    estimate 'Estimate Grp1 vs. Grp2' explanatory 0 0 0 1 -1 0;
run;

5G. Welch’s ANOVA (Unequal Variances)

Code

/* Appropriate for normal distributions with unequal variance */
proc glm data = dataset;
    class explanatory;
    model response = explanatory;
    means explanatory / welch;
run;

/* Critical F-value calculation */
data critical_value;
    critical_value = finv(1-0.05, 4, 706.2); /* 1-alpha for right-tail */
run;
proc print data = critical_value;
run;

6. Non-Parametric Tests

Note: The permutation test (Section 3) is also a non-parametric method.

6A. Wilcoxon Rank-Sum Test (Mann–Whitney U Test)

Used for inference on the median of two independent samples.

Code

/* Exact Wilcoxon Rank-Sum Test (with Hodges-Lehmann CI) */
/* Best for small samples. Computationally intensive for large samples. */
proc npar1way data = dataset wilcoxon;
    class explanatory;
    var response;
    exact HL wilcoxon;
run;

/* For one-sided alpha = 0.05, specify alpha = 0.10 to match CI to one-sided test */
/* HL = Hodges-Lehmann estimator of the median difference for confidence intervals. */
proc npar1way data = dataset wilcoxon alpha = 0.10;
    class explanatory;
    var response;
    exact HL wilcoxon;
run;


/* Rank-Sum Test: NORMAL Approximation
the larger the sample size, you can use the z approximation
the smaller, the more conservative, choose the t approximation */
/* Normal approximation for large samples */
/* z-approximation used when sample size is large. Choose t-approximation for smaller samples or to be more conservative. */
proc npar1way data = dataset wilcoxon;
    class explanatory;
    var response;
run;

/* CI using normal approximation with HL estimator (asymptotic CI version) */
/* To get the CI to match a one-sided alpha = 0.5, set alpha = 0.1 */
proc npar1way data = dataset wilcoxon HL alpha = 0.10;
    class explanatory;
    var response;
run;

/* One-sided critical value from normal (Z) distribution */
data critval;
    cv = quantile("normal", 0.95); alpha for left, 1-alpha for right;
run;
proc print data = critval;
run;

6B. Wilcoxon Signed-Rank Test (Paired Samples)

Code

/* Create difference column for paired data */
data dataset;
    set dataset;
    diff = before - after;
run;

proc print data = dataset;
run;

/* Run Wilcoxon Signed-Rank test using diff */
/* This seems to give an incorrect output for S, but close for 2-sided p-value */
proc univariate data = dataset;
    var diff;
run;

/* Critical value for normal approximation (one-sided z-distribution) */
data critval;
    cv = quantile("normal", 0.95);  /* alpha for left, 1-alpha for right */
run;
proc print data = critval;
run;

* This is extra code for the by hand calculations;
* Calculate the absolute differences and rank them;
data ranked;
    set dataset;
    abs_diff = abs(diff);
run;

6C. Manual Z-Approximation for Signed-Rank

Code

/* Step 1: Calculate absolute differences */
data ranked;
    set dataset;
    abs_diff = abs(diff);
run;

/* Step 2: Rank the absolute differences */
proc rank data=ranked out=ranked ties=mean;
    var abs_diff;
    ranks rank_abs_diff;
run;

/* Step 3: Identify and sum ranks for positive differences */
data stats;
    set ranked;
    if diff > 0 then S = rank_abs_diff;
    else S = 0;
run;

proc means data = stats sum noprint;
    var S;
    output out=sums sum(S)=sum_S n=n_obs;
run;

/* Step 4: Calculate expected mean, standard deviation, and Z-statistic */
data final;
    set sums;
    mean_S = n_obs * (n_obs + 1) / 4;
    sd_S = sqrt(n_obs * (n_obs + 1) * (2 * n_obs + 1) / 24);
    Z_statistic = (sum_S - 0.5 - mean_S) / sd_S;
run;

proc print data = final;
    var sum_S mean_S sd_S Z_statistic;
run;

/* Step 5: Compute p-value for one-sided test */
data pvalue;
    set final; /* not sure if this line is needed */
    pv = 1 - probnorm(Z_statistic);
run;

proc print data = pvalue;
run;

6D. Kruskal–Wallis Test (For Inference on the Median of 3+ Groups)

Code

/* Appropriate when normality or equal variance assumptions are violated,
   or when sample sizes are small (CLT not reliable). 
   Works by ranking all data and running ANOVA on the ranks.
   If following up with group comparisons, use rank-sum tests. */
/* Test statistic: χ² (chi-square). For very small n, request exact p-values with the EXACT option 
   (avoids large-sample χ² approximation).*/
proc npar1way data = dataset wilcoxon;
    class explanatory;
    var response;
run;

7. Correlation and Simple Regression

This section includes Pearson correlation, simple linear regression, and options for confidence and prediction intervals.

7A. Pearson Correlation

Code

/* Pearson's R Correlation Coefficient */

/* Scatterplot of response vs. explanatory */
proc sgscatter data = dataset;
    plot response * explanatory / markerattrs = (symbol = circlefilled);
    title "Scatterplot: Response vs. Explanatory";
run;

/* T-Critical value (two-sided t-test) */
data criticalvalue;
    critval = quantile("T", 0.975, df = n - 2);
run;
proc print data = criticalvalue;
run;

/* Correlation coefficient and p-value */
proc corr data = dataset;
    var response explanatory; /* not sure if this line is needed */
run;

7B. Simple Linear Regression: Coefficients and Fit

Code

/* Linear regression: parameter estimates and summary stats */
proc reg data = dataset;
    model response = explanatory;
run;

/* Optional: set alpha to change significance level and CI width (e.g., 99%) */
proc reg data = dataset alpha = 0.01;
    model response = explanatory / clb;  /* clb = confidence limits for betas */
run;

/* Alternate method to get the parameter estimate table using proc glm */
proc glm data = dataset;
    model response = explanatory / solution;  /* shows estimate table */
run;

/* Add confidence intervals for coefficients (alternative way) */
proc glm data = dataset alpha = 0.01;
    model response = explanatory / solution clparm;
run;

7C. Confidence and Prediction Intervals for New Observations

Code

/* Example dataset with missing response values for prediction */
/* Add new data */
data dataset;
    input explanatory response;
    datalines;
    62 65
    90 64
    50 48
    35 57
    200 601
    100 146
    90 47
    95 .
    200 .
    ;
run;

/* CI for mean response (CLM = confidence limits for mean) */
proc reg data = dataset;
    model response = explanatory / clm;
run;

/* PI for individual response (CLI = confidence limits for individual) */
proc reg data = dataset;
    model response = explanatory / cli;
run;    

/* Optionally using proc glm */
proc glm data = dataset;
    model response = explanatory / solution clm;  /* confidence interval for mean */
run;

proc glm data = dataset;
    model response = explanatory / solution cli;  /* prediction interval for individual */
run;

8. Residual Analysis

This section focuses on residual diagnostics to evaluate model assumptions such as linearity, homoscedasticity, and outliers.

8A. Residual Plots and Diagnostics

SAS automatically produces residual plots (and other diagnostics) with proc reg if ODS graphics are enabled.

Code

ods graphics on;

proc reg data = dataset;
    model response = explanatory;
run;

ods graphics off;

/* If you need customized residual plots (e.g. residuals vs. predicted), use proc sgplot after outputting residuals: */

/* Save residuals and predicted values */
proc reg data = dataset;
    model response = explanatory;
    output out = diagnostics r = residual p = predicted;
run;

/* Residuals vs. Fitted Values */
proc sgplot data = diagnostics;
    scatter x = predicted y = residual;
    refline 0 / axis = y;
run;

/* Residuals vs. Explanatory Variable */
proc sgplot data = diagnostics;
    scatter x = explanatory y = residual;
    refline 0 / axis = y;
run;

8B. Studentized Residuals

Studentized residuals help detect outliers by standardizing residuals using their estimated standard errors.

Code

/* Use PROC REG to compute studentized residuals */
proc reg data = dataset;
    model response = explanatory;
    output out = diagnostics
        rstudent = studentized_resid  /* Studentized residuals */
        h = leverage;                 /* Leverage values (optional) */
run;

proc print data = diagnostics;
    var response explanatory studentized_resid leverage;
run;

/* Optional: plot studentized residuals */
proc sgplot data = diagnostics;
    scatter x = explanatory y = studentized_resid;
    refline 0 / axis = y;
run;

8C. Influence Diagnostics

Influence diagnostics help identify observations that have an unusually large effect on the model. These include leverage, Cook’s distance, and DFFITS.

Code

/* Influence statistics from PROC REG */
ods graphics on;

proc reg data = dataset;
    model response = explanatory;
    output out = influence_out
        rstudent = studentized_resid
        cookd = cooks_d
        dffits = dffits_val
        h = leverage;
run;

ods graphics off;

proc print data = influence_out;
    var response explanatory studentized_resid cooks_d dffits_val leverage;
run;

/* Visualize Cook’s Distance for all observations */
proc sgplot data = influence_out;
    scatter x = _N_ y = cooks_d;
    refline 4 / axis = y;  /* Optional threshold line (e.g. rule of thumb for large values) */
run;