###################
### Long Exam 1 ###
###  Fall 2012  ###
###################

## Problem 1
# Load the health-exam data set. The relative path means the CSV is looked up
# in the current working directory; its first row holds the column names.
data <- read.csv("HealthExam.csv", header = TRUE)  # TRUE, not T: T can be reassigned
# attach() puts the columns of `data` on the search path. It is kept because
# other parts of this script may refer to columns by bare name (e.g. Age,
# Waist); prefer data$column or with(data, ...) in new code.
attach(data)

# part (a)
# Sample mean of Age. Console output is kept as a comment so the script parses.
mean(data$Age)
# [1] 34.35

# part (b)
# Box plot of Age; points drawn beyond the whiskers are flagged as outliers.
boxplot(data$Age)
# There is 1 outlier. The value of the outlier is 73

# part (c)
# Scatter plot of Height (vertical axis) against Age (horizontal axis).
# with() keeps the axis labels as "Age" and "Height".
with(data, plot(Age, Height))
# No

# part (d)
# Squared sample correlation (r^2) between Weight and Cholesterol.
cor(data$Weight, data$Cholesterol)^2
# [1] 0.08737305
# Therefore, about 8.7% of the variability of Cholesterol can be
# attributed to a linear relationship with Weight. No, it wouldn't be
# appropriate to use linear regression to model Cholesterol based on Weight.

# part (e)
# Scatter plot of Weight (vertical axis) against Waist (horizontal axis).
with(data, plot(Waist, Weight))
# Yes
# Squared sample correlation (r^2) between Waist and Weight.
cor(data$Waist, data$Weight)^2
# [1] 0.8250576
# Therefore, about 82.5% of the variability of Weight can be explained by
# a linear relationship with Waist.

# part (f)
# Simple linear regression of Weight on Waist. The data frame is passed
# explicitly so the fit does not depend on attach()ed columns.
lm.waist.weight <- lm(Weight ~ Waist, data = data)
# Estimated intercept (b0) and slope (b1); console output kept as a comment.
coef(lm.waist.weight)
# (Intercept)       Waist
# -51.72790     2.39469
# y.hat=2.39*Waist-51.73
# No, beta_0 has no meaningful interpretation because there's no person with 0 waist size.

# part (g)
# Prediction interval for the weight of one new individual with Waist = 100.
# No `level` is given, so predict() uses its default level.
predict(lm.waist.weight, newdata = data.frame(Waist = 100),
        interval = "prediction")
#        fit      lwr      upr
# 1 187.7411 158.1848 217.2975
# y.hat=187.74 and the prediction interval is [158.2, 217.3]

# part (h)
# 99% confidence interval for the mean weight at Waist = 100.
predict(lm.waist.weight, newdata = data.frame(Waist = 100),
        interval = "confidence", level = .99)
#        fit      lwr      upr
# 1 187.7411 181.9085 193.5738
# mu.hat=187.74 and the confidence interval is [181.91, 193.57]
# Therefore, we are 99% confident that the mean weight of people with waist size of 100 cm
# is between 181.91 and 193.57 lbs.

# part (i)
# Since b1=2.39, we expect the mean weight to increase by about 2.39 pounds
# for each additional 1 cm of waist size.

# part (j)
# Coefficient table of the fitted model: estimate, standard error, t statistic
# and two-sided p-value for the intercept and the slope.
summary(lm.waist.weight)$coefficients
#
#               Estimate Std. Error   t value     Pr(>|t|)
# (Intercept) -51.72790 11.1288049 -4.648109 1.340596e-05
# Waist         2.39469  0.1248554 19.179706 2.937783e-31
#
# t.obs=19.18, se.b1=0.1249, p-value is virtually 0. Since the p-value is less than
# alpha=0.05, we reject Ho:beta_1=0. Therefore, we found sufficient evidence to say
# beta_1 is not 0. Hence, the weight of people is linearly related to their waist size.

# part (k)
# ANOVA table (F test) for the regression of Weight on Waist.
anova(lm.waist.weight)
#
# Response: Weight
#           Df Sum Sq Mean Sq F value    Pr(>F)
# Waist      1  79284   79284  367.86 < 2.2e-16 ***
# Residuals 78  16811     216
#
# The p-value is less than 2.2x10^(-16). Therefore, we reject the null hypothesis that
# Ho:beta_1=0. Therefore, we found sufficient evidence to say beta_1 is not 0.
# Hence, the weight of people is linearly related to their waist size.

# part (l)
# H0:beta_1=1 vs. H1:beta_1>1
# One-sided t test for the slope. The estimate and its standard error are the
# rounded values from the coefficient table in part (j); the 78 degrees of
# freedom are the residual Df shown in the ANOVA table of part (k).
b1.hat <- 2.3947
se.b1 <- .1249
T.obs <- (b1.hat - 1) / se.b1                   # T.obs= 11.1665
# Pass T.obs itself (not a retyped, rounded copy) so the p-value always matches
# the statistic; FALSE is spelled out because F can be reassigned.
p.value <- pt(T.obs, 78, lower.tail = FALSE)    # p.value is about 3.8e-18
# Since the p-value is extremely small (smaller than alpha=0.05), we reject Ho.
# Therefore, we have sufficient evidence to say that beta_1 is greater than 1.

# part (m)
# Test of zero correlation between Waist and Weight; the printed result
# includes the sample correlation and a confidence interval for it.
with(data, cor.test(Waist, Weight))
# r=0.9083 and the confidence interval is [0.8603, 0.9404]
# Therefore, we are 95% confident that the true value of the correlation coefficient (rho)
# is between 0.86 and 0.94.

# part (n)
# Rank-based correlations. The console prompts ("> ") were removed and the
# printed results are kept as comments so the script parses.
cor(data$Waist, data$Weight, method = "spearman")
# [1] 0.9
cor(data$Waist, data$Weight, method = "kendall")
# [1] 0.7305631

# part (o)
# Residual mean square (MSE) from the ANOVA table, selected by its full row and
# column names instead of relying on partial matching of `$Mean`.
mse <- anova(lm.waist.weight)["Residuals", "Mean Sq"]  # mse=215.5277
# Estimated standard deviation of the error term.
sigma.error.hat <- sqrt(mse)                            # 14.68

# part (p)
# The error terms are independent and normally distributed with mean 0 and constant variance.

# part (q)
# Shapiro-Wilk normality test on the model residuals. residuals() is used
# instead of `$res`, which only works through partial matching of the
# `residuals` component name.
shapiro.test(residuals(lm.waist.weight))
# W=0.9884 and the p-value=0.6937. Therefore, we did not find strong evidence
# against the normality of the residuals.


## Problem 2
# See pages 17 and 18 of our textbook.