# ========================================== # Chapter 11: Simple Linear Regression & Correlation (R / RStudio) # Dataset: dataset_study.csv # X = study_hours, Y = grade # ========================================== # 0) Setup rm(list = ls()) options(scipen = 999) library(ggplot2) # 1) Load data (adjust path if needed) df <- read.csv("C:/Users/Klajd Koskija/Downloads/dataset_study.csv") # Quick check str(df) summary(df) # Ensure numeric df$study_hours <- as.numeric(df$study_hours) df$grade <- as.numeric(df$grade) # Remove missing rows (if any) df <- na.omit(df) n <- nrow(df) cat("n =", n, "\n") # 2) Basic sample quantities X <- df$study_hours Y <- df$grade Xbar <- mean(X) Ybar <- mean(Y) Sxx <- sum( (X - Xbar)^2 ) Sxy <- sum( (X - Xbar) * (Y - Ybar) ) Syy <- sum( (Y - Ybar)^2 ) cat("\n--- Sample quantities ---\n") cat("Xbar =", Xbar, "\n") cat("Ybar =", Ybar, "\n") cat("Sxx =", Sxx, "\n") cat("Sxy =", Sxy, "\n") cat("Syy =", Syy, "\n") # 3) Least-squares estimates (manual) b1 <- Sxy / Sxx b0 <- Ybar - b1 * Xbar cat("\n--- Least squares (manual) ---\n") cat("b1 (slope) =", b1, "\n") cat("b0 (intercept) =", b0, "\n") cat("Fitted line: Yhat = ", b0, " + ", b1, " * X\n", sep="") # Fitted values and residuals Yhat <- b0 + b1 * X e <- Y - Yhat # 4) SSE, MSE, sigma-hat SSE <- sum(e^2) df_res <- n - 2 MSE <- SSE / df_res sigma_hat <- sqrt(MSE) cat("\n--- Error sums ---\n") cat("SSE =", SSE, "\n") cat("df =", df_res, "\n") cat("MSE =", MSE, "\n") cat("sigma_hat =", sigma_hat, "\n") # 5) Standard errors of b1 and b0 SE_b1 <- sqrt(MSE / Sxx) SE_b0 <- sqrt(MSE * (1/n + (Xbar^2)/Sxx)) cat("\n--- Standard errors ---\n") cat("SE(b1) =", SE_b1, "\n") cat("SE(b0) =", SE_b0, "\n") # 6) Hypothesis test for slope: H0: beta1 = 0 t_b1 <- b1 / SE_b1 p_b1 <- 2 * pt(-abs(t_b1), df = df_res) cat("\n--- Slope test ---\n") cat("t =", t_b1, "\n") cat("p-value =", p_b1, "\n") # 7) 95% confidence intervals for b1 and b0 alpha <- 0.05 tcrit <- qt(1 - alpha/2, df = df_res) CI_b1 <- c(b1 - tcrit * SE_b1, b1 + tcrit * SE_b1) CI_b0 <- c(b0 - tcrit * SE_b0, b0 + tcrit * SE_b0) cat("\n--- 95% CIs ---\n") cat("tcrit =", tcrit, "\n") cat("CI beta1 =", CI_b1[1], "to", CI_b1[2], "\n") cat("CI beta0 =", CI_b0[1], "to", CI_b0[2], "\n") # 8) Mean-response CI and Prediction interval at chosen x0 values mean_CI <- function(x0){ y0hat <- b0 + b1 * x0 se_mean <- sqrt(MSE * (1/n + (x0 - Xbar)^2 / Sxx)) c(y0hat - tcrit*se_mean, y0hat + tcrit*se_mean) } pred_PI <- function(x0){ y0hat <- b0 + b1 * x0 se_pred <- sqrt(MSE * (1 + 1/n + (x0 - Xbar)^2 / Sxx)) c(y0hat - tcrit*se_pred, y0hat + tcrit*se_pred) } # Example points: mean X, and 25th/75th percentile of X x0_list <- c(Xbar, quantile(X, 0.25), quantile(X, 0.75)) names(x0_list) <- c("x0_mean", "x0_Q1", "x0_Q3") cat("\n--- Mean CI vs Prediction PI (95%) ---\n") for (nm in names(x0_list)){ x0 <- x0_list[nm] y0hat <- b0 + b1*x0 ci <- mean_CI(x0) pi <- pred_PI(x0) cat("\n", nm, ": x0 =", x0, "\n", sep="") cat(" yhat =", y0hat, "\n") cat(" Mean CI =", ci[1], "to", ci[2], "\n") cat(" Pred PI =", pi[1], "to", pi[2], "\n") } # 9) Correlation, test, Fisher CI r <- cor(X, Y) t_r <- r * sqrt(df_res) / sqrt(1 - r^2) p_r <- 2 * pt(-abs(t_r), df = df_res) cat("\n--- Correlation ---\n") cat("r =", r, "\n") cat("t =", t_r, "\n") cat("p-value =", p_r, "\n") # Fisher Z CI for rho z <- 0.5 * log((1 + r)/(1 - r)) SEz <- 1 / sqrt(n - 3) z_lo <- z - qnorm(1 - alpha/2) * SEz z_hi <- z + qnorm(1 - alpha/2) * SEz rho_lo <- (exp(2*z_lo) - 1) / (exp(2*z_lo) + 1) rho_hi <- (exp(2*z_hi) - 1) / (exp(2*z_hi) + 1) cat("95% CI for rho =", rho_lo, "to", rho_hi, "\n") # 10) R^2, SST, SSR, SSE SST <- Syy SSR <- SST - SSE R2 <- 1 - SSE/SST cat("\n--- R-squared decomposition ---\n") cat("SST =", SST, "\n") cat("SSR =", SSR, "\n") cat("SSE =", SSE, "\n") cat("R^2 =", R2, "\n") # 11) Verify with built-in lm() fit <- lm(grade ~ study_hours, data = df) cat("\n--- lm() summary ---\n") print(summary(fit)) # 12) Plots # Scatterplot + fitted line p1 <- ggplot(df, aes(x = study_hours, y = grade)) + geom_point(alpha = 0.3) + geom_smooth(method = "lm", se = TRUE) + labs(title = "Scatterplot of Grade vs Study Hours with Regression Line", x = "Study Hours (X)", y = "Grade (Y)") print(p1) # Residuals vs fitted df$fit <- fitted(fit) df$res <- resid(fit) p2 <- ggplot(df, aes(x = fit, y = res)) + geom_point(alpha = 0.3) + geom_hline(yintercept = 0) + labs(title = "Residuals vs Fitted Values", x = "Fitted values", y = "Residuals") print(p2) # Normal Q-Q plot of residuals (base R) qqnorm(df$res, main = "Normal Q-Q Plot of Residuals") qqline(df$res)