# ==========================================
# Chapter 11: Simple Linear Regression & Correlation (R / RStudio)
# Dataset: dataset_study.csv
# X = study_hours, Y = grade
# ==========================================

# 0) Setup
rm(list = ls())
options(scipen = 999)
library(ggplot2)

# 1) Load data (adjust path if needed)
df <- read.csv("C:/Users/Klajd Koskija/Downloads/dataset_study.csv")

# Quick check
str(df)
summary(df)

# Ensure numeric
df$study_hours <- as.numeric(df$study_hours)
df$grade      <- as.numeric(df$grade)

# Remove missing rows (if any)
df <- na.omit(df)

n <- nrow(df)
cat("n =", n, "\n")

# 2) Basic sample quantities
X <- df$study_hours
Y <- df$grade

Xbar <- mean(X)
Ybar <- mean(Y)

Sxx <- sum( (X - Xbar)^2 )
Sxy <- sum( (X - Xbar) * (Y - Ybar) )
Syy <- sum( (Y - Ybar)^2 )

cat("\n--- Sample quantities ---\n")
cat("Xbar =", Xbar, "\n")
cat("Ybar =", Ybar, "\n")
cat("Sxx  =", Sxx,  "\n")
cat("Sxy  =", Sxy,  "\n")
cat("Syy  =", Syy,  "\n")

# 3) Least-squares estimates (manual)
b1 <- Sxy / Sxx
b0 <- Ybar - b1 * Xbar

cat("\n--- Least squares (manual) ---\n")
cat("b1 (slope)     =", b1, "\n")
cat("b0 (intercept) =", b0, "\n")
cat("Fitted line: Yhat = ", b0, " + ", b1, " * X\n", sep="")

# Fitted values and residuals
Yhat <- b0 + b1 * X
e    <- Y - Yhat

# 4) SSE, MSE, sigma-hat
SSE <- sum(e^2)
df_res <- n - 2
MSE <- SSE / df_res
sigma_hat <- sqrt(MSE)

cat("\n--- Error sums ---\n")
cat("SSE =", SSE, "\n")
cat("df  =", df_res, "\n")
cat("MSE =", MSE, "\n")
cat("sigma_hat =", sigma_hat, "\n")

# 5) Standard errors of b1 and b0
SE_b1 <- sqrt(MSE / Sxx)
SE_b0 <- sqrt(MSE * (1/n + (Xbar^2)/Sxx))

cat("\n--- Standard errors ---\n")
cat("SE(b1) =", SE_b1, "\n")
cat("SE(b0) =", SE_b0, "\n")

# 6) Hypothesis test for slope: H0: beta1 = 0
t_b1 <- b1 / SE_b1
p_b1 <- 2 * pt(-abs(t_b1), df = df_res)

cat("\n--- Slope test ---\n")
cat("t =", t_b1, "\n")
cat("p-value =", p_b1, "\n")

# 7) 95% confidence intervals for b1 and b0
alpha <- 0.05
tcrit <- qt(1 - alpha/2, df = df_res)

CI_b1 <- c(b1 - tcrit * SE_b1, b1 + tcrit * SE_b1)
CI_b0 <- c(b0 - tcrit * SE_b0, b0 + tcrit * SE_b0)

cat("\n--- 95% CIs ---\n")
cat("tcrit =", tcrit, "\n")
cat("CI beta1 =", CI_b1[1], "to", CI_b1[2], "\n")
cat("CI beta0 =", CI_b0[1], "to", CI_b0[2], "\n")

# 8) Mean-response CI and Prediction interval at chosen x0 values
mean_CI <- function(x0){
  y0hat <- b0 + b1 * x0
  se_mean <- sqrt(MSE * (1/n + (x0 - Xbar)^2 / Sxx))
  c(y0hat - tcrit*se_mean, y0hat + tcrit*se_mean)
}

pred_PI <- function(x0){
  y0hat <- b0 + b1 * x0
  se_pred <- sqrt(MSE * (1 + 1/n + (x0 - Xbar)^2 / Sxx))
  c(y0hat - tcrit*se_pred, y0hat + tcrit*se_pred)
}

# Example points: mean X, and 25th/75th percentile of X
x0_list <- c(Xbar, quantile(X, 0.25), quantile(X, 0.75))
names(x0_list) <- c("x0_mean", "x0_Q1", "x0_Q3")

cat("\n--- Mean CI vs Prediction PI (95%) ---\n")
for (nm in names(x0_list)){
  x0 <- x0_list[nm]
  y0hat <- b0 + b1*x0
  ci <- mean_CI(x0)
  pi <- pred_PI(x0)
  cat("\n", nm, ": x0 =", x0, "\n", sep="")
  cat("  yhat =", y0hat, "\n")
  cat("  Mean CI =", ci[1], "to", ci[2], "\n")
  cat("  Pred PI =", pi[1], "to", pi[2], "\n")
}

# 9) Correlation, test, Fisher CI
r <- cor(X, Y)

t_r <- r * sqrt(df_res) / sqrt(1 - r^2)
p_r <- 2 * pt(-abs(t_r), df = df_res)

cat("\n--- Correlation ---\n")
cat("r =", r, "\n")
cat("t =", t_r, "\n")
cat("p-value =", p_r, "\n")

# Fisher Z CI for rho
z  <- 0.5 * log((1 + r)/(1 - r))
SEz <- 1 / sqrt(n - 3)

z_lo <- z - qnorm(1 - alpha/2) * SEz
z_hi <- z + qnorm(1 - alpha/2) * SEz

rho_lo <- (exp(2*z_lo) - 1) / (exp(2*z_lo) + 1)
rho_hi <- (exp(2*z_hi) - 1) / (exp(2*z_hi) + 1)

cat("95% CI for rho =", rho_lo, "to", rho_hi, "\n")

# 10) R^2, SST, SSR, SSE
SST <- Syy
SSR <- SST - SSE
R2 <- 1 - SSE/SST

cat("\n--- R-squared decomposition ---\n")
cat("SST =", SST, "\n")
cat("SSR =", SSR, "\n")
cat("SSE =", SSE, "\n")
cat("R^2 =", R2, "\n")

# 11) Verify with built-in lm()
fit <- lm(grade ~ study_hours, data = df)
cat("\n--- lm() summary ---\n")
print(summary(fit))

# 12) Plots
# Scatterplot + fitted line
p1 <- ggplot(df, aes(x = study_hours, y = grade)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(title = "Scatterplot of Grade vs Study Hours with Regression Line",
       x = "Study Hours (X)",
       y = "Grade (Y)")
print(p1)

# Residuals vs fitted
df$fit <- fitted(fit)
df$res <- resid(fit)

p2 <- ggplot(df, aes(x = fit, y = res)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = 0) +
  labs(title = "Residuals vs Fitted Values",
       x = "Fitted values",
       y = "Residuals")
print(p2)

# Normal Q-Q plot of residuals (base R)
qqnorm(df$res, main = "Normal Q-Q Plot of Residuals")
qqline(df$res)