Appendix A: R Code Examples for Statistical Foundations

Quick Navigation:

1. Data Import & Summary Statistics

1A. Import and Explore the Dataset

Code
# Load the tidyverse (for read, dplyr, ggplot2, etc.)
library(tidyverse)

# Import the dataset using a file dialog
dataset = read.csv(file.choose(), header = TRUE, stringsAsFactors = TRUE)

# Explore the structure and contents
str(dataset)
head(dataset)

1B. Summary Statistics by Group

Code
# Summarize response by levels of the explanatory variable
dataset %>% group_by(explanatory) %>% 
  summarize(n = n(), mean = mean(response), sd = sd(response))

2. Visualization & Assumption Checks

2A. Base R Plots (One Variable or Group Subsets)

Code
# Visual checks for distributional assumptions (normality and spread)

# Histogram, QQ plot, and Boxplot of a numerical variable
hist(dataset$numericalColumn, main = "Histogram", xlab = "Numerical Variable")
qqnorm(dataset$numericalColumn, main = "QQ Plot")
qqline(dataset$numericalColumn)
boxplot(dataset$numericalColumn, horizontal = TRUE, main = "Boxplot")

2B. Base R: Comparison Between Groups

Code
# Compare two levels of a grouping variable using base R

# Replace level1 and level2 with actual factor levels
level1 = "A"
level2 = "B"

# Layout: 2 rows, 3 columns
par(mfrow = c(2, 3))

# Level 1 plots
hist(subset(dataset, explanatory == level1)$response, main = paste("Histogram -", level1), xlab = "Response")
boxplot(subset(dataset, explanatory == level1)$response, main = paste("Boxplot -", level1), horizontal = TRUE)
qqnorm(subset(dataset, explanatory == level1)$response, main = paste("QQ Plot -", level1))
qqline(subset(dataset, explanatory == level1)$response)

# Level 2 plots
hist(subset(dataset, explanatory == level2)$response, main = paste("Histogram -", level2), xlab = "Response")
boxplot(subset(dataset, explanatory == level2)$response, main = paste("Boxplot -", level2), horizontal = TRUE)
qqnorm(subset(dataset, explanatory == level2)$response, main = paste("QQ Plot -", level2))
qqline(subset(dataset, explanatory == level2)$response)

2C. ggplot2 + patchwork for Grouped Assumption Plots

Code
# Import ggplot if tidyverse not already imported
# library(ggplot2) # in tidyverse package

# Create histograms, QQ plots, and boxplots by group
# Use facet_wrap() to show each level of explanatory separately

# Patchwork allows multi-plot arrangement
library(patchwork)

# Histogram faceted by group
hist = dataset %>% 
  ggplot(aes(x = response)) +
  geom_histogram(bins = 15) + 
  # facet_wrap(~explanatory, scales = "free_y") +
  facet_wrap(~explanatory) +
  ggtitle("Histogram of Response by Group") +
  theme_bw()

# QQ plot per group
qq = dataset %>%
  ggplot(aes(sample = response)) +
  geom_qq() +
  facet_wrap(~explanatory) +
  ggtitle("QQ Plots of Response by Group") +
  theme_bw()

# Boxplot per group
box = dataset %>% 
  ggplot(aes(y = response, x = explanatory)) +
  geom_boxplot() +
  ggtitle("Boxplot of Response by Group") +
  theme_bw()

# Combine plots using patchwork (arrange histogram above qqplot, next to boxplot)
(hist / qq) | box

3. Permutation Test: Generate Permutation Distribution for Group Mean Difference

Code
# Step 1: Calculate the observed difference in sample means
xbars = dataset %>% group_by(explanatory) %>% summarize(mean = mean(response))
xbarGrp1minusGrp2 = xbars[2,2] - xbars[1,2] # observed difference
xbarGrp1minusGrp2

# Step 2: Build the permutation distribution under the null with this loop
# Make sure nGrp1 and nGrp2 are defined as sample sizes of the groups
xbarDiffHolder = numeric(10000)

for (i in 1:10000){
  scrambledLabels = sample(dataset$explanatory, nGrp1+nGrp2); # shuffle labels
  
  datasetTemp = dataset
  datasetTemp$explanatory = scrambledLabels
  
  xbars = datasetTemp %>% group_by(explanatory) %>% summarize(mean = mean(response))
  xbarGrp1minusGrp2 = xbars[2,2] - xbars[1,2] # observed difference
  xbarGrp1minusGrp2
  xbarDiffHolder[i] = xbarGrp1minusGrp2$mean
}

# Step 3: Plot the permutation distribution
df = data.frame(xbarDiffs = xbarDiffHolder)

df %>% ggplot(mapping = aes(x = xbarDiffs)) + 
  geom_histogram(bins = 25, fill = "cornflowerblue", linewidth = 0.1) +
  ggtitle("Permutation Distribution of the Difference of Sample Means") +
  xlab("xbarGrp1 - xbarGrp2")

# Step 4: Calculate the p-value (two-tailed)
num_more_extreme = sum((abs(xbarDiffHolder)) >= abs(xbarGrp1minusGrp2))

pvalue = num_more_extreme / 10000
pvalue

4. t-Tests and Power Analysis

4A. Basic \(t\)-Tests

Code
# Critical value for a two-sided t-test (95% CI)
# Use df = n - 1 (one-sample) or df = n1 + n2 - 2 (two-sample)
qt(0.975, df - 2)

# One-sample t-test
t.test(x=dataset, mu = underTheNull, conf.int = "TRUE", alternative = "two.sided")

# Paired t-test (one-sample, df-1, within-subject comparison)
t.test(x = dataset$explantoryGrp2, y = dataset$explantoryGrp1, paired = TRUE) # this is probably easiest
# Or alternative formula syntax
t.test(response ~ explanatory, data = dataset, paired = TRUE) # alternative = "two-sided", mu = 0, conf.level = 0.95, var.equal(doesn't apply, one-sample)
# Check the order of the treatment groups
levels(dataset$explanatory)

# Two-sample t-test with equal variance
results = t.test(response ~ explanatory, data = dataset, var.equal = TRUE, alternative = "two.sided")
results

# Two-sample t-test with unequal variance (Welch's)
results = t.test(response ~ explanatory, data = dataset, var.equal = FALSE, alternative = "two.sided")
results

# Extract test statistic and degrees of freedom
results = t.test(response ~ explanatory, data = dataset)
tstat = results$statistic
df = results$parameter

# Manual two-sided p-value
pt(abs(tstat), df,lower.tail = FALSE) * 2

4B. Power Analysis

Code
# Compute the power of a t-test (one-sample or two-sample)
power.t.test(n = nPerGrp, delta = effectSize, sd = sd, sig.level = alpha, power = NULL, type = "one.sample", alternative = "one.sided")
# or "two.sample", can also just leave power (or unknown variable off)

# Find sample size to get 80% power (leave n blank)
power.t.test(n = , delta = effectSize, power = .8, sd = s, sig.level = .05, type = "one.sample", alternative = "one.sided")

# Unequal sample sizes using Cohen’s d (if you have different sample sizes in each of two samples)
library(pwr)
pwr.t2n.test(n1 = n1, n2 = n2, d = effectSize/stdev, sig.level = 0.05, alternative = "two.sided") # alternative = "greater"

# Power calculation with unequal standard deviations (Welch’s)
power.welch.t.test(n = n, delta = effectSize, power = , sd1 = s1, sd2 = s2, alternative = "two-sided")

4C. Power Curve

Code
# Create power curve across a range of sample sizes
samplesizes = seq(nlower, nhigher, by = 1)
powerholder = numeric(length(samplesizes))

for(i in 1:length(samplesizes))
{
  powerholder[i] = power.t.test(n = samplesizes[i], delta = effectSize, sd = s, sig.level = .05, type = "one.sample", alternative = "one.sided")$power
}

# Combine into a data frame
powerdf = data.frame(samplesizes, powerholder)

# Plot power vs sample size to create the power curve
powerdf %>% ggplot(aes(x = samplesizes, y = powerholder)) +
  geom_line(color = "blue3", linewidth = 1.5) +
  ggtitle("Power Curve") +
  ylab("Power") +
  xlab("Sample Sizes") +
  ylim(0.75, 1.0) +
  theme_bw()

5. One-Way ANOVA and Extra Sum of Squares

5A. One-Way ANOVA

Code
# Fit a one-way ANOVA model (make sure groups is a factor variable)
fit = aov(response ~ groups, data = dataset)
summary(fit)

# Find the critical value for the F-distribution (one-sided test)
# critical_value = qf(alpha, dfn, dfd, lower.tail = FALSE)
qf(0.05, dfn, dfd, lower.tail = FALSE)

5B. Extra Sum of Squares Test

Code
# Compare a full model to a reduced model using extra sum of squares
# Example: comparing CTRL and D and testing whether they can be combined

# To confirm at lease two groups are different: one-way ANOVA first (make sure groups is a factor variable)
fit = aov(response ~ groups, data = dataset)
summary(fit)

# Full model: all group levels (e.g., CTRL, A, B, C, D, E)
fit_full = aov(response ~ explanatory, data = dataset)
sum_fit_full = summary(fit_full)
sum_fit_full
# Extract the degrees of freedom for the error
dfd = sum_fit_full[[1]]["Residuals", "Df"]
# Extract the sum of squares for the error
ssFull = sum_fit_full[[1]]["Residuals", "Sum Sq"]

# Reduced model: O, O, A, B, C, E (combine CTRL & D)
fit_reduce = aov(response ~ explanatoryReduced, data = dataset)
sum_fit_reduce = summary(fit_reduce)
sum_fit_reduce
# Extract the degrees of freedom for the error
dfTotal = sum_fit_reduce[[1]]["Residuals", "Df"]
# Extract the sum of squares for the error
ssRed = sum_fit_reduce[[1]]["Residuals", "Sum Sq"]

#F-Statistic and P-Value for Model Comparison
# BYOA table calculations (F-test to compare reduced vs full model)
alpha = 0.05
dfn = dfTotal - dfd                     # numerator df
ssModel = ssRed - ssFull                # difference in SS
mse = ssFull / dfd                      # mean square error
msModel = ssModel / dfn                 # mean square model
fstat = msModel / mse                   # F-statistic
fstat

# P-value (one-tailed test)
p_value = pf(fstat, dfn, dfd, lower.tail = FALSE)
p_value

# Critical F value
critical_value = qf(alpha, dfn, dfd, lower.tail = FALSE)
critical_value

# Print results
cat("alpha:", alpha, "\n")
cat("dfTotal (reduced):", dfTotal, "\n")
cat("dfd (full):", dfd, "\n")
cat("dfn:", dfn, "\n")
cat("Sum of Squares for Reduced Model:", ssRed, "\n")
cat("Sum of Squares for Error (Full):", ssFull, "\n")
cat("Sum of Squares for Model (SS explained):", ssModel, "\n")
cat("Mean Square Error:", mse, "\n")
cat("Mean Square Model (MS explained):", msModel, "\n")
cat("Critical Value:", critical_value, "\n")
cat("F-Statistic:", fstat, "\n")
cat("p-Value F-test:", p_value, "\n")

6. Multiple Comparisons

6A. Tukey HSD using multcomp (Tukey-Kramer Adjustment)

Code
# Use the multcomp package to conduct pairwise comparisons with the Tukey-Kramer multiple comparison correction
library(multcomp)

# Fit must come from an ANOVA or linear model with a factor variable
gfit = glht(fit, linfct = mcp(groups = "Tukey")) # 'groups' is your factor variable

# See pairwise comparisons and adjusted p-values
summary(gfit)

# Extract confidence intervals for all pairwise comparisons
confint_gfit = confint(gfit)
confint_gfit

6B. Tukey HSD using agricolae

Code
# Tukey HSD using the agricolae package
# Different way to extract half_width
library(agricolae)

# Run Tukey HSD test (grouping variable must be specified)
tukey_half = HSD.test(fit, 'groups') # put groups variable in

# Extract minimum significant difference (half-width of CI)
tukey_half$statistics$MSD

6C. Tukey HSD using Base R

Code
# Alternative method using base R function TukeyHSD()
# Note: This function works correctly on models fit with aov()
# (I haven't used this before, so confirm it is the correct function.)

# Provides adjusted CIs and p-values
tukey_result = TukeyHSD(fit)
tukey_result

7. Contrasts

Perform planned contrasts only after a significant ANOVA result. Use adjusted p-values when doing multiple, unplanned comparisons.

7A. Setup and Specifying Contrasts

Code
# Contrast of the average of the mean of 2 groups of 2

# Load emmeans for estimated marginal means (least squares means) and contrasts
library(emmeans)

# Check the order of factor levels - for assigning contrast coefficients
unique(dataset$groups)

# Fit linear model with categorical explanatory variable
fit = lm(response ~ groups, data = dataset)

# Get least-squares means for each group
leastsquare = emmeans(fit, "groups") # put groups variable in

# Define contrast weights: compare (A+B) vs (C+D)
contrasts = list(Grp1and2vsGrp3and4 = c(.5, .5, -.5, -.5))

7B. Run Contrast Tests

Code
# With adjustment
contrastResultsCorr = contrast(leastsquare, contrasts, adjust = "sidak") # slightly less conservative than bonferroni
sum_contrastResultsCorr = summary(contrastResultsCorr)
sum_contrastResultsCorr
# Confidence interval
confint(contrastResultsCorr, level = 0.95) 

# Without adjustment
contrastResults = contrast(leastsquare, contrasts) # no adjustment
sum_contrastResults = summary(contrastResults)
sum_contrastResults
# Confidence interval
confint(contrastResults, level = 0.95) 

7C. Manual Confidence Interval from Estimate

Code
# Find critical value for 95% CI (df from ANOVA)
criticalVal = qt(0.975, df)
criticalVal

# CI = estimate ± criticalValue*SE (estimate is the difference of means from the original t-test)
# Estimate ± margin of error
estimate = sum_contrastResultsCorr$estimate
SE = sum_contrastResultsCorr$SE

CI_lower = estimate - criticalVal * SE
CI_upper = estimate + criticalVal * SE

CI_lower
CI_upper

8. Non-Parametric Tests

8A. Wilcoxon Rank-Sum Test (Mann-Whitney)

Code
# Compare two independent samples

# Get the EXACT p-value (one-sided test)
wilcox.test(response ~ explanatory, data = dataset, alternative = "less", exact = TRUE)
# Get the exact matching CI (note it is not alpha, it is conf.level)
# ‘conf.int’ option provides the HL confidence limits (like SAS)
wilcox.test(response ~ explanatory, data = dataset, alternative = "two.sided", exact = TRUE, conf.level = 0.90, conf.int = TRUE)

# Get the NORMAL APPROXIMATION p-value (with continuity correction)
wilcox.test(response ~ explanatory, data = dataset, alternative = "less", exact = FALSE, correct = TRUE)
# Get the normal approximation matching CI
wilcox.test(response ~ explanatory, data = dataset, alternative = "two.sided", exact = FALSE, correct = TRUE, conf.level = 0.90, conf.int = TRUE)

8B. Signed-Rank Test (Paired Samples)

Code
# Get critical Z value for one-sided test
critVal = qnorm(0.95)
critVal

# Run the paired test
signedRank = wilcox.test(dataset$before, dataset$after, paired = TRUE, alternative = "greater", exact = FALSE, correct = TRUE)
signedRank

# Get the 90% matching CI
signedRankCI = wilcox.test(dataset$before, dataset$after, paired = TRUE,alternative = "two.sided", exact = FALSE, correct = TRUE, conf.level = 0.90, conf.int = TRUE)
signedRankCI

8C. Manual Z-Approximation for Signed-Rank

Code
# This is extra code for the by-hand calculations.
# Extract the test statistic (S)
S = signedRank$statistic
S

# Calculate sample size (n)
n = length(dataset$before)
n

# Calculate the expected value (mean) of S under the null hypothesis
mean_S = n * (n + 1) / 4
mean_S

# Calculate the standard deviation of S under the null hypothesis
sd_S = sqrt(n * (n + 1) * (2 * n + 1) / 24)
sd_S

# Calculate the Z-statistic with continuity correction
CC = ifelse(S > meanS, -0.5, 0.5)
CC
Z = (S + CC - mean_S) / sd_S
Z

# Get the p-value for a one-tailed test (upper tail)
p_value_one_tailed = 1 - pnorm(Z)
p_value_one_tailed

9. Correlation and Simple Linear Regression

9A. Pearson Correlation

Code
# Create a sample dataset
dataset = data.frame(response = c(1, 2, 3, 4, 5), 
                     explanatory = c(1, 2, 3, 4, 5))

# Make a scatter plot to visualize the relationship
plot(dataset$explanatory, dataset$response, 
     xlab = "Explanatory", ylab = "Response", 
     main = "Scatterplot of Response vs. Explanatory", pch = 15)

# Alternatively, without making a dataframe
plot(response, explanatory)

# Get Pearson correlation coefficient (three ways)
cor(dataset) # returns correlation matrix
cor(dataset$response, dataset$explanatory)
cor(x = explanatory, y = response)

# Hypothesis test for correlation (t-test)
# Get r, the test statistic, p-value, and confidence interval
cor.test(dataset$response, dataset$explanatory)

# Get the critical value
qt(0.975,n-2) # df = n-2, for 95% two-sided test

9B. Simple Linear Regression: Fitting and Diagnostics

Code
# Fit the linear model
fit = lm(response ~ explanatory, data = dataset)
summary(fit)  # Shows coefficients, t-tests, R-squared

# Plot data and fitted line (base R)
plot(dataset$explanatory, dataset$response, 
     xlab = "Explanatory", ylab = "Response", 
     main = "Linear Regression", pch = 15)
lines(dataset$explanatory, fit$fitted.values, col = "blue")

# ANOVA for regression model
anova(fit)


# --- Residual Diagnostics ---

# Default residual plots (base R)
plot(fit)  # Residuals vs Fitted, QQ Plot, etc.

# Save residuals and fitted values for custom plots
dataset$residuals = residuals(fit)
dataset$fittedVals = fitted(fit)

# ggplot2 residuals vs fitted plot
dataset %>% 
  ggplot(aes(x = fittedVals, y = residuals)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  ggtitle("Residuals vs Fitted Values") +
  theme_bw()

# QQ plot of residuals (base R)
residuals = residuals(fit)
qqnorm(residuals)
qqline(residuals, col = "black")

9C. Confidence & Prediction Intervals

Code
# --- Confidence Intervals for Coefficients ---

fit = lm(response ~ explanatory, data = dataset)

# Show summary
summary(fit)

# Confidence interval for the parameter estimates (intercept and slope)
confint(fit)

# CI for just the slope
confint(fit, "explanatory")

# CI at different confidence level
confint(fit, level = 0.99) # to change alpha


# --- Confidence & Prediction for New Observations ---

# Add a new data point
new_data = data.frame(response = NA, explanatory = 2.5)
# or
new_data = data.frame(explanatory = 2.5)

# Confidence interval (mean response at x)
# 95% CI estimating the mean value of y expected at value of x
predictionCI = predict(fit, newdata = new_data, interval = "confidence")
predictionCI

# Prediction interval (individual response at x)
# 95% CI estimating the individual value of y expected at value of x, more error
predictionPI = predict(fit, newdata = new_data, interval = "prediction")
predictionPI


# --- Plot CI and PI Around Fitted Line ---

# Predicted values + intervals (for the full dataset)
predictions = predict(fit, interval = "confidence", level = 0.95, se.fit = TRUE)
prediction_intervals = predict(fit, interval = "prediction", level = 0.95)

# Add intervals to original data for plotting
movies = movies %>%
  mutate(fit = predictions$fit,
         lwr_conf = predictions$fit[, "lwr"],
         upr_conf = predictions$fit[, "upr"],
         lwr_pred = prediction_intervals[, "lwr"],
         upr_pred = prediction_intervals[, "upr"])

# Plot fitted line with CI and PI ribbons
ggplot(data, aes(x = explanatory, y = response)) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  geom_ribbon(aes(ymin = lwr_conf, ymax = upr_conf), alpha = 0.5, fill = "lightblue") +
  geom_line(aes(y = lwr_pred), linetype = "dashed", color = "black") +
  geom_line(aes(y = upr_pred), linetype = "dashed", color = "black") +
  geom_hline(yintercept = 210, color = "darkred") +
  geom_point() +
  labs(title = "Regression Line with 95% Confidence and Prediction Intervals",
       x = "Explanatory", y = "Response") +
  theme_bw()


# --- Calibration Intervals (Reverse Prediction) ---

# For estimating values of x from given y
library(investr)

# Calibration interval for the mean budget (Estimate x for a given y0, mean response)
calibrate(fit,y0 = 210, interval = "Wald", mean.response = TRUE, limit = FALSE)

# Calibration interval for a single movie budget (estimate x for an individual response)
calibrate(fit,y0 = 210, interval = "Wald", mean.response = FALSE, limit = FALSE)


# --- R-squared Interpretation ---
# R-squared = measure of the proportion of variation in the response that is accounted for by the explanatory variable

# Get the summary of the model
summary_fit = summary(fit)

# Extract R-squared
R_squared = summary_fit$r.squared

# Interpretation
cat("The R-squared value is", R_squared, "\n")
cat("This means that", round(R_squared * 100, 2), "% of the variability in Response is explained by Explanatory.")

10. Residual Analysis

10A. Fit Model and Add Residuals

Code
# Fit a linear model
fit = lm(response ~ explanatory, data = dataset)

# Log transform the data if needed
dataset = dataset %>% mutate(
  log_explanatory = log(explanatory),
  log_response = log(response)
)

# Add residuals and fitted values to the dataframe
dataset$residuals = residuals(fit)
dataset$fittedVals = fitted(fit)

# Studentized residuals (internal)
library(car)  # for rstudent(), studentized residuals
dataset$studentized_residuals = rstudent(fit) 

# Another way to get studentized residuals (studentized deleted residuals)
# External: excluding the data point being tested. More accurate for identifying outliers or influential points, because the data point doesn't bias its own standard error.
library(MASS)
# Calculate studentized residuals (external studentized: ti=ei/σhati*sqrt(1−hii))
dataset$StudentizedResiduals = rstudent(fit)

10B. Residual Plots

Code
# Base R: full residual diagnostics (residuals vs fitted, QQ, sqrt(std residuals), Cook’s)
plot(fit)

# ggplot: residuals vs fitted scatterplot
ggplot(data = dataset, aes(x = fittedVals, y = residuals)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "blue") +
  labs(x = "Fitted Values", y = "Residuals", title = "Scatterplot of Residuals") + 
  theme_bw()

10C. Normality Checks for Residuals

Code
# Base R: QQ plot of residuals
qqnorm(residuals(fit), main = "QQ Plot of Residuals")
qqline(residuals(fit), col = "blue")

# or using residuals saved to dataset
qqnorm(dataset$residuals, main = "QQ Plot of Residuals")
qqline(dataset$residuals, col = "blue")

# car package: enhanced QQ plot
library(car)
qqPlot(dataset$residuals, main = "QQ Plot (car::qqPlot)")

# ggplot2 QQ plot
ggplot(dataset, aes(sample = residuals)) +
  stat_qq() +
  stat_qq_line(col = "blue") +
  labs(title = "QQ Plot of Residuals", x = "Theoretical Quantiles", y = "Sample Quantiles") + 
  theme_bw()

10D. Histogram of Residuals

Code
# ggplot2 histogram with normal curve
ggplot(data = dataset, aes(x = residuals)) +
  geom_histogram(aes(y = after_stat(density)), bins = 15, fill = "lightblue", color = "gray30") +
  stat_function(fun = dnorm, args = list(mean = mean(dataset$residuals), sd = sd(dataset$residuals)), color = "blue") +
  labs(x = "Residuals", y = "Density", title = "Histogram of Residuals with Normal Curve") + 
  theme_bw()

# Base R version with overlaid normal curve
hist(residuals(fit), breaks = 15, probability = TRUE, col = "lightblue", border = "gray30", main = "Histogram of Residuals with Normal Curve", xlab = "Residuals")
curve(dnorm(x, mean = mean(residuals(fit)), sd = sd(residuals(fit))), col = "blue", lwd = 2, add = TRUE)

10E. Regression Line with Confidence & Prediction Intervals

Code
# Base R scatterplot and regression line
plot(dataset$explanatory, dataset$response, 
     xlab = "Explanatory", ylab = "Response", 
     main = "Linear Regression of Response & Explanatory", pch = 16)
# Add the regression line
abline(fit, col = "blue", lwd = 2)

# ggplot2 scatterplot + regression line
ggplot(dataset, aes(x = explanatory, y = response)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(x = "Explanatory", y = "Response", 
       title = "Linear Regression of Response & Explanatory") +
  theme_bw()


# --- Optional: Add Intervals and Highlighted Points ---

# Identify specific data points to highlight
dataToHighlight = dataset %>%
  filter(columnName == "Value") %>% 
  select("col1", "col2", "col3")
dataToHighlight

# Build prediction data frame (example using log-log model structure)
new_data = data.frame(logGDP = InfantVGDP_clean$logGDP, logInfantMort = InfantVGDP_clean$logInfantMort)

# Generate confidence and prediction intervals using log-log model fit
confInt = predict(fitLogLog, newdata = new_data, interval = "confidence")
predInt = predict(fitLogLog, newdata = new_data, interval = "predict")

# Add intervals to new_data
new_data$fit = confInt[, "fit"]
new_data$lwr_conf = confInt[, "lwr"]
new_data$upr_conf = confInt[, "upr"]
new_data$lwr_pred = predInt[, "lwr"]
new_data$upr_pred = predInt[, "upr"]

# Plot CI and PI with highlighted point(s)
ggplot(dataset, aes(x = explanatory, y = response)) +
  # Prediction interval ribbon (widest)
  geom_ribbon(data = new_data, aes(ymin = lwr_pred, ymax = upr_pred), alpha = 0.3, fill = "gray70") +
  # Confidence interval ribbon (narrower)
  geom_ribbon(data = new_data, aes(ymin = lwr_conf, ymax = upr_conf), alpha = 0.6, fill = "lightblue") +
  # Raw data points
  geom_point() +
  # Highlighted point(s)
  geom_point(data = dataToHighlight, aes(x = explanatory, y = response), color = "red3", size = 4, stroke = 1.25, shape = 21) +
  # Regression line
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(x = "Explanatory", y = "Response", 
       title = "Linear Regression with 95% CI, PI, and Highlighted Points") +
  theme_bw()

11. Log-Log Models & Back-Transformation

11A. Back-Transform the Slope and Interpret

Code
# Extract slope from log-log model
intercept = coef(fitLogLog)[1]
slope = coef(fitLogLog)[2]

# Get confidence interval for slope
conf_intervals = confint(fit)
slope_conf_lower = conf_intervals["log_explanatory", 1]
slope_conf_upper = conf_intervals["log_explanatory", 2]

# Back-transform the slope and its CI
back_transformed_slope = 2^slope
back_transformed_conf_lower = 2^slope_conf_lower
back_transformed_conf_upper = 2^slope_conf_upper

# Percentage change interpretation
percentage_change = (1 - back_transformed_slope) * 100
percentage_change_conf_lower = (1 - back_transformed_conf_lower) * 100
percentage_change_conf_upper = (1 - back_transformed_conf_upper) * 100

# Display results
cat("Slope:", slope, "\n")
cat("Back-transformed Slope (2^b1):", back_transformed_slope, "\n")
cat("Percentage Change:", percentage_change, "%\n")
cat("CI for Slope: (", slope_conf_lower, ", ", slope_conf_upper, ")\n")
cat("Back-transformed CI for Slope: (", back_transformed_conf_lower, ", ", back_transformed_conf_upper, ")\n")
cat("Percentage Change CI: (", percentage_change_conf_lower, "%, ", percentage_change_conf_upper, "%)\n")

11B. Back-Transform the Intercept

Code
# Get CI for intercept
intercept_conf_lower = conf_intervals["(Intercept)", 1]
intercept_conf_upper = conf_intervals["(Intercept)", 2]

# Back-transform using exponential
back_transformed_intercept = exp(intercept)
back_transformed_intercept_conf_lower = exp(intercept_conf_lower)
back_transformed_intercept_conf_upper = exp(intercept_conf_upper)

# Display results
cat("Intercept:", intercept, "\n")
cat("Back-transformed Intercept (exp(b0)):", back_transformed_intercept, "\n")
cat("CI for Intercept: (", intercept_conf_lower, ", ", intercept_conf_upper, ")\n")
cat("Back-transformed CI: (", back_transformed_intercept_conf_lower, ", ", back_transformed_intercept_conf_upper, ")\n\n")

11C. Compare Predicted vs Observed Values

Code
# Estimate log(Y) and then back-transform to original scale
est_log_response = intercept + slope * dataset$explanatory
est_response = exp(est_log_response)

# Display for comparison
cat("Estimated log(response):", est_log_response, "\n")
cat("Estimated response:", est_response, "\n")
cat("Actual log(response):", dataset$obs_log_response, "\n")
cat("Actual response:", dataset$obs_response, "\n")

11D. Lack of Fit Test

Code
# Extra sum of squares F-test (lack of fit)
# H0: Linear regression fits well (no lack of fit)
# HA: Separate means model fits better (LRM is lacking fit)

# Critical value
qf(1-alpha, dfn, dfd)

# Find p-value for f-distribution
pf(fstat, dfn, dfd, lower.tail = FALSE)

# Template interpretation:
# "There is [overwhelming/sufficient/insufficient] evidence at the alpha = 0.05 level of significance to suggest the linear regression model has a lack of fit compared to the separate-means model (p- value = XYZ from an extra-sum-of-squares F-test)."

12. Multiple Linear Regression (MLR)

12A. MLR with Only Numeric Predictors (Same Slopes)

Code
# Subset to just the numerical columns of interest
subset_scores = scores[ , c("science", "math", "read")]

# Visualize relationships: matrix scatterplot
plot(subset_scores,
     main = "Matrix Scatterplot", 
     pch = 19, # point character
     col = "darkblue")

# Use GGally:GGpairs to make a matrix scatterplot
library(GGally)
ggpairs(subset_scores,
        title = "Matrix Scatterplot")

# Fit the multiple linear regression model
fit = lm(science ~ math + read, data = scores)
summary(fit)      # Includes t-tests, R², etc.
confint(fit)      # Confidence intervals for coefficients

# Plot residuals to check assumptions (normality, linearity, equal variance)
plot(fit)

12B. MLR with Categorical Predictors (Indicator Variables, Same Slopes)

Code
# Visualize response by numeric and categorical explanatory variables
dataset %>% 
  ggplot(aes(x = explanatoryNum, y = response, color = explanatoryCat)) +
  geom_point() +
  labs(title = "Response vs Numeric and Categorical Predictors",
       x = "Explanatory (Numeric)",
       y = "Response",
       color = "Explanatory (Category)") +
  theme_bw()

# Convert category variable to a factor if needed
scores$ses = as.factor(scores$ses)

# Fit the model, set reference level if necessary
fit = lm(response ~ explanatoryNum + relevel(explanatoryCat, ref = "3"), data = dataset)
summary(fit)
confint(fit)

# View the covariance matrix (advanced)
vcov(fit)

# Check assumptions with residual plots
plot(fit)

12C. MLR with Interaction (Different Slopes per Group)

Code
# Include interaction term between numeric and categorical variables
fit = lm(response ~ explanatoryNum * relevel(factor(explanatoryCat), ref = "3"), data = dataset) # can convert to factor here instead
summary(fit)
confint(fit)

# Equivalent fully expanded form
fit2 = lm(response ~ 
            explanatoryNum + 
            relevel(factor(explanatoryCat), ref = "3") + 
            explanatoryNum*relevel(factor(explanatoryCat), ref = "3"), 
          data = dataset)
summary(fit2)
confint(fit2)

# Plot residuals to check model fit
plot(fit)

13. Variable Selection

13A. Stepwise Selection by \(p\)-Value

Code
# Use olsrr for stepwise selection based on p-values (significance level)
# install.packages("olsrr")
library(olsrr)

# Full model with all predictors
fit = lm(response ~ ., data = dataset)

# Forward selection (add variables one at a time)
a = ols_step_forward_p(fit, p_val = 0.15, details = TRUE)

# Backward elimination (start with full model, remove variables)
b = ols_step_backward_p(fit, p_val = 0.15, details = TRUE) # bigger p-remove, more variables in the model; smaller p-remove, fewer

# Stepwise (both directions)
c = ols_step_both_p(fit, p_enter = 0.15, p_remove = 0.15, details = TRUE)

13B. Forward Selection by AIC

Code
# Refit full model
fit = lm(response ~ ., data = dataset)

# Forward selection using AIC as criterion
d = ols_step_forward_aic(fit, details = TRUE)

13C. Cross-Validation and Interaction Terms

Code
# Fit model with all two-way interactions
fit = lm(response ~ .^2, data = dataset)

# Use caret for model tuning with LOOCV
library(caret)

# Define training control method
train_control = trainControl(method="LOOCV")

# Train linear model with specified predictors
model = train(response ~ explanatory1 + explanatory2 + explanatory3 + explanatory4 + explanatory5, data = dataset, trControl = train_control, method = "lm")

# Output cross-validated model results
model