# ============================================================
# Chapter 11 Graphs (R / RStudio) — dataset_study.csv
# X = study_hours, Y = grade
#
# Fits the simple linear regression grade ~ study_hours, then draws
# Figures 1-7 and saves each one as a PNG (for insertion into Word)
# in the folder named by `out_dir`.
# ============================================================

# NOTE: the workspace is deliberately NOT cleared with rm(list = ls());
# every object used below is assigned before it is read, and wiping the
# caller's environment is a destructive side effect.

# Print numbers in fixed notation rather than scientific notation.
options(scipen = 999)

# Packages
library(ggplot2)

# ------------------------------------------------------------
# 1) Load data (adjust path if needed)
# ------------------------------------------------------------
data_path <- "C:/Users/Klajd Koskija/Downloads/dataset_study.csv"
df <- read.csv(data_path)

# Check the required columns BEFORE touching them, so a wrong file fails
# with a clear message instead of an obscure assignment error.
required_cols <- c("study_hours", "grade")
missing_cols <- setdiff(required_cols, names(df))
if (length(missing_cols) > 0) {
  stop(
    "Missing required column(s): ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

# Coerce both model variables to numeric; values that cannot be parsed
# become NA (with a warning) and are dropped just below.
df$study_hours <- as.numeric(df$study_hours)
df$grade <- as.numeric(df$grade)

# Drop only the rows that are incomplete in the two model variables.
# NAs in unrelated columns must not remove otherwise usable observations.
df <- df[complete.cases(df[, required_cols]), , drop = FALSE]

# Quick check
n <- nrow(df)
cat("Rows:", n, "\n")
if (n < 3) {
  # With fewer than 3 rows the residual degrees of freedom (n - 2) are
  # not positive, so MSE and the interval bands are undefined.
  stop("Need at least 3 complete rows to fit the model.", call. = FALSE)
}

# ------------------------------------------------------------
# 2) Fit the model
# ------------------------------------------------------------
fit <- lm(grade ~ study_hours, data = df)
df$yhat <- fitted(fit)
df$res <- resid(fit)

# Useful summary numbers. They are not used by the plots below; they are
# left in the workspace for inspection after the script has run.
X <- df$study_hours
Y <- df$grade
Xbar <- mean(X)
Sxx <- sum((X - Xbar)^2)        # sum of squared deviations of X
SSE <- sum(df$res^2)            # residual sum of squares
df_res <- n - 2                 # residual degrees of freedom
MSE <- SSE / df_res
tcrit <- qt(0.975, df = df_res) # two-sided 95% t critical value

cat("\nModel:\n")
print(summary(fit))

# ------------------------------------------------------------
# 3) Create output folder for Word figures
# ------------------------------------------------------------
out_dir <- "chapter11_figures"
if (!dir.exists(out_dir)) {
  dir.create(out_dir, recursive = TRUE)
}

# Save a ggplot object as a PNG inside `out_dir`.
#   p        ggplot object to save
#   filename file name (not a full path) of the PNG to write
#   w, h     width and height passed to ggsave()
#   dpi      resolution passed to ggsave()
save_plot <- function(p, filename, w = 7.5, h = 5.2, dpi = 300) {
  ggsave(
    file.path(out_dir, filename),
    plot = p,
    width = w,
    height = h,
    dpi = dpi
  )
}

# Draw the normal Q-Q plot of `residuals` (with reference line) on the
# currently active graphics device.
draw_qq_plot <- function(residuals) {
  qqnorm(residuals, main = "Figure 5. Normal Q–Q Plot of Residuals")
  qqline(residuals, lwd = 2)
}

# Write the normal Q-Q plot of `residuals` to the PNG file at `path`.
# on.exit() guarantees the PNG device is closed even if plotting fails.
save_qq_png <- function(residuals, path) {
  png(path, width = 1100, height = 800, res = 150)
  on.exit(dev.off(), add = TRUE)
  draw_qq_plot(residuals)
  invisible(path)
}

# ============================================================
# FIGURE 1: Scatterplot (foundation)
# ============================================================
p1 <- ggplot(df, aes(x = study_hours, y = grade)) +
  geom_point(alpha = 0.25) +
  labs(
    title = "Figure 1. Scatterplot: Grade vs Study Hours",
    x = "Study Hours (X)",
    y = "Grade (Y)"
  )
print(p1)
save_plot(p1, "Fig1_Scatterplot.png")

# ============================================================
# FIGURE 2: Scatterplot + Regression Line + 95% CI band
# ============================================================
p2 <- ggplot(df, aes(x = study_hours, y = grade)) +
  geom_point(alpha = 0.20) +
  # formula is spelled out so ggplot2 does not have to guess (and
  # announce) it; y ~ x is what method = "lm" would use anyway.
  geom_smooth(method = "lm", formula = y ~ x, se = TRUE) +
  labs(
    title = "Figure 2. Regression Line with 95% Confidence Band",
    x = "Study Hours (X)",
    y = "Grade (Y)"
  )
print(p2)
save_plot(p2, "Fig2_RegressionLine_CI.png")

# ============================================================
# FIGURE 3: Residuals vs Fitted (assumptions check)
# ============================================================
p3 <- ggplot(df, aes(x = yhat, y = res)) +
  geom_point(alpha = 0.25) +
  geom_hline(yintercept = 0, linewidth = 0.7) +
  labs(
    title = "Figure 3. Residuals vs Fitted Values",
    x = "Fitted Values (Ŷ)",
    y = "Residuals (e = Y - Ŷ)"
  )
print(p3)
save_plot(p3, "Fig3_Residuals_vs_Fitted.png")

# ============================================================
# FIGURE 4: Histogram of Residuals (normality check)
# ============================================================
p4 <- ggplot(df, aes(x = res)) +
  geom_histogram(bins = 40) +
  labs(
    title = "Figure 4. Histogram of Residuals",
    x = "Residuals",
    y = "Count"
  )
print(p4)
save_plot(p4, "Fig4_Residuals_Hist.png")

# ============================================================
# FIGURE 5: Normal Q–Q Plot of Residuals (stronger normality check)
# (Base R plot saved as PNG)
# ============================================================
save_qq_png(df$res, file.path(out_dir, "Fig5_QQplot_Residuals.png"))

# Also display in RStudio
draw_qq_plot(df$res)

# ============================================================
# FIGURE 6: Regression line + 95% Mean CI band + 95% Prediction band
# Dashed lines: confidence interval for the mean response.
# Dotted lines: prediction interval for a new observation.
# ============================================================

# Build a grid of x values spanning the observed range of study_hours
x_grid <- seq(min(df$study_hours), max(df$study_hours), length.out = 300)
newd <- data.frame(study_hours = x_grid)

# Predict mean response CI and prediction interval on that grid
pred_mean <- predict(
  fit,
  newdata = newd,
  interval = "confidence",
  level = 0.95
)
pred_ind <- predict(
  fit,
  newdata = newd,
  interval = "prediction",
  level = 0.95
)

bands <- data.frame(
  study_hours = x_grid,
  fit = pred_mean[, "fit"],
  mean_lwr = pred_mean[, "lwr"],
  mean_upr = pred_mean[, "upr"],
  pred_lwr = pred_ind[, "lwr"],
  pred_upr = pred_ind[, "upr"]
)

p6 <- ggplot() +
  geom_point(
    data = df,
    aes(x = study_hours, y = grade),
    alpha = 0.15
  ) +
  geom_line(
    data = bands,
    aes(x = study_hours, y = fit),
    linewidth = 1
  ) +
  geom_line(
    data = bands,
    aes(x = study_hours, y = mean_lwr),
    linetype = "dashed"
  ) +
  geom_line(
    data = bands,
    aes(x = study_hours, y = mean_upr),
    linetype = "dashed"
  ) +
  geom_line(
    data = bands,
    aes(x = study_hours, y = pred_lwr),
    linetype = "dotted"
  ) +
  geom_line(
    data = bands,
    aes(x = study_hours, y = pred_upr),
    linetype = "dotted"
  ) +
  labs(
    title = "Figure 6. Mean 95% CI (dashed) and 95% Prediction Band (dotted)",
    x = "Study Hours (X)",
    y = "Grade (Y)"
  )
print(p6)
save_plot(p6, "Fig6_MeanCI_vs_PredictionBand.png")

# ============================================================
# FIGURE 7 (Optional): Boxplot of Grades (outliers + spread)
# ============================================================
p7 <- ggplot(df, aes(y = grade)) +
  geom_boxplot() +
  labs(
    title = "Figure 7. Boxplot of Grades",
    y = "Grade"
  )
print(p7)
save_plot(p7, "Fig7_Boxplot_Grades.png")

cat("\nAll figures saved in folder:", out_dir, "\n")
cat("PNG files are ready to insert into Word.\n")