Let’s read in the Duncan data and reproduce Figure 3.18 first.

prestige.data <- read.table("./Duncan.txt")

pairs(prestige.data[, c("income", "education", "prestige")])


You need to fill out the following sections for answering the questions.

Construct \(X\), \(X^TX\), and \(Y\).

# construct the design matrix
X <- as.matrix(cbind(1, prestige.data[, c(2, 3)]))
X <- as.matrix(cbind(1, prestige.data[, c("income", "education")]))


# construct the Gramian matrix
system.time(gramian.mat <- t(X) %*% X)
##    user  system elapsed 
##       0       0       0
system.time(gramian.mat <- crossprod(X))
##    user  system elapsed 
##       0       0       0
gramian.mat
##              1 income education
## 1           45   1884      2365
## income    1884 105148    122197
## education 2365 122197    163265
# calculate the inverse of the Gramian matrix
gramian.mat.inv <- solve(gramian.mat)

# construct the Y matrix
Y <- as.matrix(prestige.data[, "prestige"])

Q1. Now let’s calculate the least square estimates

# Now calculate the LS estimate
beta.estimate <- solve(t(X) %*% X) %*% t(X) %*% Y
beta.estimate <- solve(t(X) %*% X) %*% (t(X) %*% Y)
beta.estimate <- solve(t(X) %*% X, t(X) %*% Y)
beta.estimate

Q2. Now calculate the variance-covariance matrix to get the standard error of \(\hat{\beta}_1\)

# calculate sum of squared errors (or residuals)
sse <- 

# calculate mean squared errors
mse <- 

# calculate the variance-covariance matrix for beta hat
sigma.matrix <- 

# standard error for beta_1
sqrt(sigma.matrix[2, 2])

Q3. Hypothesis test of \(H_0: \beta_1 = 0\)

t(0.025, 42) critical value:

qt(1.0 - 0.025, 42) #take a look at the documentation of "qt"
# t statistic
beta.estimate[2] / sqrt(sigma.matrix[2, 2])

Q4. Mean prestige estimate given \(income = 42\) and \(education = 84\):

a.vector <- c(1, 42, 84)
mean.prestige.est <- 

Q5. standard error:

theta.se <- 

Q6. 95% confidence interval:

confidence.interval <- 

Use lm to check

(lmod <- lm(prestige ~ income + education, prestige.data))
## 
## Call:
## lm(formula = prestige ~ income + education, data = prestige.data)
## 
## Coefficients:
## (Intercept)       income    education  
##     -6.0647       0.5987       0.5458
summary(lmod)
## 
## Call:
## lm(formula = prestige ~ income + education, data = prestige.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.538  -6.417   0.655   6.605  34.641 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.06466    4.27194  -1.420    0.163    
## income       0.59873    0.11967   5.003 1.05e-05 ***
## education    0.54583    0.09825   5.555 1.73e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.37 on 42 degrees of freedom
## Multiple R-squared:  0.8282, Adjusted R-squared:   0.82 
## F-statistic: 101.2 on 2 and 42 DF,  p-value: < 2.2e-16
predict(lmod, newdata = data.frame("1" = 1, "income" = 42, "education" = 84), interval = "confidence")
##        fit     lwr      upr
## 1 64.93216 57.5322 72.33212

Q7. test the null hypothesis \(H_0: \beta_1 = \beta_2 = 0\). (Optional)

(lmod.red <- lm(prestige ~ 1, data = prestige.data))
mean(prestige.data[, "prestige"])

# calculate the F statistic
F.statistic <-
  
# use pf function to get the associated p-value
p.value <- pf(F.statistic, ndf, ddf, lower.tail=FALSE)