###################################### ### Lecture R commands - Fall 2016 ### ###################################### ################################################# ### Day 1 - September 7 - Writing R functions ### ################################################# ss.xx=function(x){sum(x^2)-(sum(x))^2/length(x)} ss.yy=function(y){sum(y^2)-(sum(y))^2/length(y)} # Equivalent to ss.xx(y) ss.xy=function(x,y){sum(x*y)-sum(x)*sum(y)/length(x)} data.jog=read.csv("Jogging.csv",header=T) attach(data.jog) ssxx=ss.xx(Distance) # 19.45455 ssyy=ss.xx(Time) # 3025.091 ssxy=ss.xy(Distance,Time) # 241.6364 r=ssxy/sqrt(ssxx*ssyy) # 0.9960532 corr=function(x,y) { ss.xy=function(x,y){sum(x*y)-sum(x)*sum(y)/length(x)} ssxy=ss.xy(x,y) ssxx=ss.xy(x,x) ssyy=ss.xy(y,y) r=ssxy/sqrt(ssxx*ssyy) list(corr=r,SSxx=ssxx,SSyy=ssyy,SSxy=ssxy) } corr(Distance,Time) ########################################## ### Day 2 - September 9 - Correlation ### ########################################## # Tree Height Example data.tree=read.csv("TreeHeight.csv",header=T) head(data.tree) #Lists the variable names and their first 6 values attach(data.tree) source("C:\\Math445_Fall2013\\corr.R") #Sourcing the 'corr.R' function we created corr(Circumferenc,Height) #Using the 'corr' function cor(Circumference,Height) #This is the built-in correlation function # Jogging Example data.jog=read.csv("Jogging.csv",header=T) head(data.jog) attach(data.jog) plot(Distance,Time) corr(Distance,Time) cor(Distance,Time) ######################################################## ### Day 3 - September 12 - Simple Linear Regression ### ######################################################## # Jogging Example data.jog=read.csv("Jogging.csv",header=T) attach(data.jog) plot(Distance,Time) #Source in 'Corr' function output=corr(Distance,Time) attributes(output) output ssxy=output$SSxy ssxx=output$SSxx b1=ssxy/ssxx b0=mean(Time)-b1*mean(Distance) plot(Distance,Time) abline(b0,b1,lwd=2) # lwd=2 creates a thicker line results=lm(Time~Distance) attributes(results) results$coef # Or use 'coef(results)' plot(Distance,Time) abline(results) # Estimating sigma attributes(results) residuals=results$res sse=sum(residuals^2) # sse=23.83178 n=length(Distance) mse=sse/(n-2) # mse=1.19 sigma.error=sqrt(mse) # 1.09 predicted=results$fit data.frame(Distance,Time,Predicted=predicted,Residuals=round(residuals,3)) temp=which(abs(residuals)>1.5) Time[temp] summary(results) #################################################### ### Day 4 - September 14 - Inference about slope ### #################################################### # Production Run Example size=c(30,20,60,80,40,50,60,30,70,60) hours=c(73,50,128,170,87,108,135,69,148,132) lm.size.hours=lm(hours~size) coef(lm.size.hours) plot(size,hours) abline(lm.size.hours) # Estimating sigma attributes(lm.size.hours) residuals=lm.size.hours$res sse=sum(residuals^2) # sse=60 n=length(size) mse=sse/(n-2) # mse=7.5 sigma.error=sqrt(mse) # 2.738613 predicted=lm.size.hours$fitted data.frame(Size=size,Hours=hours,Predicted=predicted,Residuals=round(residuals,3)) which(abs(residuals)>2*sigma.error) summary(lm.size.hours) anova(lm.size.hours) sum(residuals) #Verifying property #1 sum(hours);sum(predicted) #Verifying property #3 sum(size*residuals) #Verifying property #4 sum(predicted*residuals) #Verifying property #5 10+2*mean(size);mean(hours) #Verifying property #6 # Inference concerning beta_1 # Source in 'corr.R' n=length(size) SSxx=corr(size,hours)$SSxx # 3400 se.b1=sqrt(mse/SSxx) # 0.04696682 t.b1=2/se.b1 # 42.58325 
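# Extra check (sketch): compare t.b1 with the two-sided t critical value
# before computing the p-value below.
t.crit=qt(1-.05/2,df=n-2)  # 2.306 when n=10
abs(2/se.b1)>t.crit        # TRUE, so reject Ho: beta_1=0 at alpha=.05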
pvalue.b1=2*pt(-42.58325,df=n-2) confint(lm.size.hours) ############################################################# ### Day 5 - September 16 - Prediction and Confidence Band ### ############################################################# size=c(30,20,60,80,40,50,60,30,70,60) hours=c(73,50,128,170,87,108,135,69,148,132) lm.size.hours=lm(hours~size) coef(lm.size.hours) plot(size,hours) abline(lm.size.hours) # Confidence intervals for the mean response new=data.frame(size=100) predict(lm.size.hours,newdata=new) # the predicted value of a new observation # This will give the mean C.I. for the new data predict(lm.size.hours,newdata=new,interval="confidence") new2=data.frame(size=c(90,100)) # new2 contains only the 2 new observations # This will give the mean C.I. for the 2 new data predict(lm.size.hours,newdata=new2,interval="confidence") # Prediction interval for future values predict(lm.size.hours,newdata=new2,interval="prediction") # new2 contains only the 2 new observations predict(lm.size.hours,newdata=new2,interval="prediction",level=.99) # The default level is 0.95 # Confidence Band lm.size.hours=lm(hours~size) CI=predict(lm.size.hours,se.fit=TRUE) # se.fit=SE(mean) W=sqrt(2*qf(0.95,2,8)) band.lower=CI$fit - W*CI$se.fit band.upper=CI$fit + W*CI$se.fit band=cbind(band.lower,band.upper) plot(size, hours, xlab="Production Size", ylab="Work Hours", main="Confidence Band") abline(lm.size.hours) points(sort(size), sort(band.lower), type="l", lty=2) points(sort(size), sort(band.upper), type="l", lty=2) # Or alternatively, you can use the following command: # source in 'confidence.band.R' confidence.band(lm.size.hours) # Prediction Band mse=anova(lm.size.hours)$Mean[2] se.pred=sqrt(CI$se.fit^2+mse) band.lower.pred=CI$fit - W*se.pred band.upper.pred=CI$fit + W*se.pred points(sort(size),sort(band.lower.pred),type="l",lwd=2,lty=2,col="Red") points(sort(size),sort(band.upper.pred),type="l",lwd=2,lty=2,col="Red") ############################################################################## ### Day 6 - September 19 - ANOVA Approach and Coefficient of Determination ### ############################################################################## size=c(30,20,60,80,40,50,60,30,70,60) hours=c(73,50,128,170,87,108,135,69,148,132) lm.size.hours=lm(hours~size) residuals=lm.size.hours$residuals predicted=lm.size.hours$fitted n=length(size) ssto=sum((hours-mean(hours))^2) # ssto=13660 sse=sum(residuals^2) # sse=60 ssr=sum((predicted-mean(hours))^2) # ssr=13600, or ssr=ssto-sse msr=ssr/1 # msr=13600 mse=sse/(n-2) # mse=7.5 f.obs=msr/mse # f.obs=1813.333 p.value=pf(f.obs,df1=1,df2=n-2,lower.tail=F) # p.value=1.019588e-10 # Alternatively, you can use the 'anova' function anova(lm.size.hours) plot(size,(predicted-mean(hours))) points(size,residuals,pch=19) # This will superimpose these points to the plot R2=ssr/ssto # R2=0.9956076 # Therefore, about 99.56% of the variability of Hours (Y) can be explained by the # regression line. ## Test for Correlations cor(size,hours) # Computes the Pearson correlation coefficient, r cor.test(size,hours) # Tests Ho:rho=0 and also constructs C.I. for rho cor(size,hours,method="spearman") # Computes the Spearman's correlation coefficient cor.test(size,hours,method="spearman") # Test of independence using the Spearman Rank correlation cor(size,hours,method="kendall") # Computes Kendall's Tau cor.test(size,hours,conf.level=.99) # Tests Ho:rho=0 and also constructs C.I. 
for rho ################################## ### September 23 - Diagnostics ### ################################## rm(list=ls()) # Clears the workspace data=read.csv("Toluca.csv",header=T) attach(data) # Diagnostics for Predictor Variable table(size) # Creates a frequency table barplot(table(size)) # Creates a frequency bargraph boxplot(size) # Creates a boxplot (good for outlier detection) size.out=c(size,200) boxplot(size.out) # Note the presence of the outlier plot(size,type="b") # Creates a time/sequence plot hist(size) # Creates a histogram bin=seq(15,125,by=10) hist(size,breaks=bin,col="gray") stem(size) # Creates a stem-and-leaf-plot stem(size,scale=3) # Dot plots stripchart(size, method = "stack", offset = 1, at = 0, pch = 19, main = "Dotplot", xlab = "Lot Size") plot(size, hours) results=lm(hours~size) abline(results,col="blue") ## Departures from the model ## ## 1. The regression function is not linear. # Plot residuals againsts predictor OR residuals againsts fitted values plot(size,results$residuals) # Note that the residual plot doesn't show any systematic pattern. plot(results$fitted,results$residuals) # Again, no systematic pattern. mse=anova(results)$Mean[2] # use=2384 e.star=results$residuals/sqrt(mse) plot(results$fitted,e.star) # A non-linear example z=rnorm(length(size),0,100) x=size y=x^2+z plot(x,y) # Note that the relationship is curvilinear cor(x,y) # Note that r is pretty high! temp=lm(y~x) plot(temp$fitted,temp$res) # Note the pattern in this residual plot x2=x^2 temp2=lm(y~x2) plot(temp2$fitted,temp2$res) ## 2. The error terms are not normally distributed. # Plots of residuals againsts omitted predictor variables # Box plot of residuals # Normal probability plots of residuals qqnorm(results$residuals) qqline(results$residuals) shapiro.test(results$residuals) qqnorm(temp$residuals) qqline(temp$residuals) shapiro.test(temp$residuals) ## 3. The error terms do not have constant variance. # Plot of absolute or squared residuals against predictor variable z=rnorm(length(size),0,100*sqrt(size)) x=size y=100+25*x+z plot(x,y) # Note the increasing variation temp3=lm(y~x) plot(temp3$fitted,temp3$res) # Note the increasing variation ## 4. The error terms are not independent. # Sequence plot of the residuals # Plot of residuals againsts time or other sequence ## 5. The model fits all but one or a few outlier observations. # Plot residuals againsts predictor OR residuals againsts fitted values # Box plots, stemplots, dot plots of residuals ## 6. One or several important predictor variables have been omitted from the model. 
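# A small illustration (sketch) of departure #6: if a relevant predictor z is
# omitted, the residuals plotted against z show a systematic pattern.
# (z, y.om, and fit.om are made-up names for this simulated example.)
z=rnorm(length(size),0,10)
y.om=100+25*size+15*z+rnorm(length(size),0,50)
fit.om=lm(y.om~size)          # model that leaves z out
plot(z,fit.om$residuals)      # clear increasing trend in z
plot(size,fit.om$residuals)   # no pattern against the included predictor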
## Brown-Forsythe Test bf=function(x,y,x.med=median(x)){ results=lm(y~x) e=results$residuals # x.med=median(x) e1=e[x<=x.med]; n1=length(e1) e2=e[x>x.med]; n2=length(e2) d1=abs(e1-median(e1)) d2=abs(e2-median(e2)) s2=(sum((d1-mean(d1))^2)+sum((d2-mean(d2))^2))/(n1+n2-2) t.bf=(mean(d1)-mean(d2))/(sqrt(s2)*sqrt(1/n1+1/n2)) p.val=2*pt(abs(t.bf),df=(n1+n2-2),lower.tail=F) list(t.bf=t.bf,p.val=p.val)} # Example: Toluca data # Source in the function bp.R bf(size,hours) # t_obs=1.3165 and p-value=0.201 # Example: Simulated data with increasing variance x=sample(seq(20,120,by=10),100,replace=T) z=rnorm(length(x),0,200*sqrt(x)) y=(62.37+3.57*x)+z plot(x,y) # Note the increasing variation bf(x,y) ## Breusch-Pagan Test (Also known as Cook-Weisberg test) # load the package 'lmtest' library(lmtest) bptest(hours~size,studentize=F) Example 2: x=1:50 y=(1:50)*rnorm(50) bf(x,y) bptest(y~x) ################################################ ### Day 10 - September 28 - Lack of Fit Test ### ################################################ # Example: Bank data (page 120) x=c(125,100,200,75,150,175,75,175,125,200,100) y=c(160,112,124,28,152,156,42,124,150,104,136) results=lm(y~x) anova(results) reduced.mod=lm(y~x) full.mod=lm(y~factor(x)) anova(reduced.mod,full.mod) >anova(results) Analysis of Variance Table Response: y Df Sum Sq Mean Sq F value Pr(>F) x 1 5141.3 5141.3 3.1389 0.1102 Residuals 9 14741.6 1638.0 > anova(reduced.mod,full.mod) Analysis of Variance Table Model 1: y ~ x Model 2: y ~ factor(x) Res.Df RSS Df Sum of Sq F Pr(>F) 1 9 14742 2 5 1148 4 13594 14.801 0.005594 ** ############################################### ### Day 11 - September 30 - Transformations ### ############################################### # Transforming the X variable # Example (page 129) x=c(.5,.5,1,1,1.5,1.5,2,2,2.5,2.5) y=c(42.5,50.6,68.5,80.7,89,99.6,105.3,111.8,112.3,125.7) results=lm(y~x) summary(results) # R^2=0.9256 plot(x,results$res) # note the curve pattern qqnorm(results$res) # no evidence of non-normality of residuals bf(x,y) # no evidence of deviation xp=sqrt(x) results=lm(y~xp) summary(results) # R^2=0.9545 - note the improvement plot(xp,results$res) qqnorm(results$res) # Transforming the Y variable # Example: Plasma (page 132) data=read.csv("Plasma.csv",header=T) attach(data) plot(Age,Plasma) results=lm(Plasma~Age) plot(Age,results$res) qqnorm(results$res) log10plasma=log(Plasma,base=10) plot(Age,log10plasma) results2=lm(log10plasma~Age) plot(Age,results2$res) qqnorm(results2$res) ## Box Cox Transformation y=Plasma;x=Age lambda=seq(-2,2,by=.1) lambda[21]=0.01 # Just to avoid the special case, lambda=0 iter=length(lambda) SSEs=array(0,iter) # Storage for the SSEs k2=(prod(y))^(1/length(y)) for(i in 1:iter){ k1=1/(lambda[i]*k2^(lambda[i]-1)) w=k1*(y^lambda[i]-1) results=lm(w~x) SSEs[i]=anova(results)$Sum[2]} w=k2*(log(y)) # This is for the special case when lambda=0 results=lm(w~x) SSEs[21]=anova(results)$Sum[2] lambda[21]=0 cbind(lambda,SSEs) # Choose lambda that minimizes the SSE plot(lambda,SSEs) # load package 'MASS' library(help="MASS") boxcox(y~x) # Check the resulting plot # Lowess Method data.mm=read.csv("MuscleMass.csv",header=T) attach(data.mm) plot(age,mass) lines(lowess(age,mass)) ############################################ ### October 3 - Simultaneous Inferences ### ############################################ ## Bonferroni Joint Estimation of beta_0 and beta_1 data=read.csv("Toluca.csv",header=T) attach(data) results=lm(hours~size) confint(results,level=.90) coefficients(summary(results)) 
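# Side note (sketch): the coefficient summary is a matrix, so its entries can be
# pulled out by row/column name as well as by position (as done just below), e.g.
coefficients(summary(results))["size","Std. Error"]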
b0=coefficients(summary(results))[1,1]    # Extracting the value of b0
se.b0=coefficients(summary(results))[1,2] # The standard error of b0
b1=coefficients(summary(results))[2,1]    # Extracting the value of b1
se.b1=coefficients(summary(results))[2,2] # The standard error of b1

# The 90% joint confidence intervals for beta_0 and beta_1:
B=qt(1-.10/(2*2),df=23) # In general, B=qt(1-.10/(2*k),df), k=no. of intervals
lower.b0=b0-B*se.b0;upper.b0=b0+B*se.b0
lower.b1=b1-B*se.b1;upper.b1=b1+B*se.b1
# Or you can make the Bonferroni adjustment here
confint(results,level=(1-.10/2)) # In general, confint(results,level=(1-alpha/k))

## Simultaneous Estimation of Mean Responses
new=data.frame(size=c(30,65,100))
ci=predict(results,new,interval="confidence",se.fit=T)

# Using Working-Hotelling Procedure
# to construct 90% joint confidence intervals for the mean response at the 3 x levels.
W2=2*qf(1-.10,2,23)
ci$fit[1]-sqrt(W2)*ci$se[1]; ci$fit[1]+sqrt(W2)*ci$se[1] # at size=30
ci$fit[2]-sqrt(W2)*ci$se[2]; ci$fit[2]+sqrt(W2)*ci$se[2] # at size=65
ci$fit[3]-sqrt(W2)*ci$se[3]; ci$fit[3]+sqrt(W2)*ci$se[3] # at size=100

# Using Bonferroni Procedure
B=qt(1-.10/(2*3),23)
ci$fit[1]-B*ci$se[1]; ci$fit[1]+B*ci$se[1] # at size=30
ci$fit[2]-B*ci$se[2]; ci$fit[2]+B*ci$se[2] # at size=65
ci$fit[3]-B*ci$se[3]; ci$fit[3]+B*ci$se[3] # at size=100
# Or you can make the Bonferroni adjustment here
predict(results,new,interval="confidence",level=(1-.10/3))

## Simultaneous Prediction Intervals for New Observations
data=read.csv("data_Toluca.csv",header=T)
attach(data)
results=lm(hours~size)
MSE=anova(results)$Mean[2]
new=data.frame(size=c(80,100))
ci=predict(results,new,interval="prediction",se.fit=T)
se.fit.pred=sqrt(MSE*(1+(ci$se.fit^2/MSE)))

# Scheffe Procedure
S=sqrt(2*qf(1-.05,2,23)) # In general, use S=sqrt(k*qf(1-.05,k,n-2))
# If k=2:10, look at plot(k,sqrt(k*qf(1-.05,k,n-2)))
lwr.scheffe=ci$fit[,1]-S*se.fit.pred
upr.scheffe=ci$fit[,1]+S*se.fit.pred
cbind(ci$fit,lwr.scheffe,upr.scheffe)

# Bonferroni Procedure
ci.bon=predict(results,new,interval="prediction",level=(1-.05/2))

## Regression through Origin
data=read.csv("Warehouse.csv",header=T) # Example in the book (page 162)
attach(data)
n=length(cost)
b1=sum(units*cost)/sum(units^2)
fitted=b1*units
residuals=cost-fitted
SSE=sum(residuals^2)
MSE=SSE/(n-1)
se.b1=sqrt(MSE/sum(units^2))
# The 95% C.I. for beta_1 is
lwr=b1-qt(.975,n-1)*se.b1
upr=b1+qt(.975,n-1)*se.b1
# Or you can use the following R command:
results=lm(cost~0+units)
summary(results)
confint(results)
anova(results)

#####################################
### October 12 - Matrix Approach  ###
#####################################
A=matrix(1:6,nrow=2)
B=matrix(c(-1,0,2,1,4,3),nrow=2)
A
B
A+B  # Matrix addition = addition of corresponding entries
A-B
t(A) # transpose of A
D=t(A)%*%A # To do matrix multiplication, use %*%
E=A%*%t(A) # Note that E is not equal to D.
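# Quick check (sketch): D and E cannot be equal because they do not even have
# the same dimensions; A is 2x3, so D=t(A)%*%A is 3x3 while E=A%*%t(A) is 2x2.
dim(D); dim(E)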
E.inv=solve(E) # Computing the inverse of a square matrix I=E.inv%*%E # The product of E and it's inverse is the Identity matrix data=read.csv("Toluca.csv",header=T) attach(data) x=size; y=hours n=length(x) t(y)%*%y # Sum of squares of yi's X=matrix(c(rep(1,n),x),ncol=2) # The design matrix X t(X)%*%X t(X)%*%y solve(t(X)%*%X) B=solve(t(X)%*%X)%*%(t(X)%*%y) # Estimates of the regression coefficients coefficients(lm(y~x)) # Note that you get these results fitted=X%*%B ######################### ## Multiple Regression ## ######################### data=read.csv("Dwaine.csv",header=T) # Dwaine Studios example on page 236 attach(data) y=sales # Sales of portraits of children in a community x1=young # Number of persons aged 16 or younger in the community x2=income # Per capita disposable personal income in the community n=length(sales) pairs(cbind(x1,x2,y)) # Creates a scatter plot matrix pairs(data) cor(data) # Creates a correlation matrix X=matrix(c(rep(1,n),x1,x2),ncol=3) # The design matrix X t(X)%*%X B=solve(t(X)%*%X)%*%(t(X)%*%y) # Estimates of the regression coefficientspair #Or we can use results=lm(y~x1+x2) summary(results) # Diagnostics par(mfrow=c(2,2)) plot(results$fitted,results$residuals) plot(x1,results$residuals) plot(x2,results$residuals) plot(x1*x2,results$residuals) # Checking for interaction effects plot(results$fitted,abs(results$residuals)) # Checking constant variance qqnorm(results$residuals) # Checking normality shapiro.test(results$residuals) anova(results) # F Test for Regression Relation # Coefficient of multiple determination # Adjusted coefficient of multiple determination # Coefficient of multiple correlation #Prediction new=data.frame(x1=65.4,x2=17.6) predict(results,new) predict(results,new,interval="confidence") predict(results,new,interval="prediction") new2=data.frame(x1=c(65.4,53.1),x2=c(17.6,17.7)) predict(results,new2,interval="prediction",level=(1-.10/2)) # Bonferroni Intervals # Health Exam Data Example data.health=read.csv("HealthExam.csv",header=T) head(data.health) attach(data.health) result2=lm(Weight~Waist+Height) summary(result2) Weight1=-201.5717+2.1565*80+2.5978*60 # 126.8163 Weight2=-201.5717+2.1565*90+2.5978*70 # 174.3593 predict(result2,newdata=data.frame(Waist=c(80,90),Height=c(60,70))) confint(result2,level=.99) # Model Assessments qqnorm(result2$res);qqline(result2$res) shapiro.test(result2$res) # W = 0.9884, p-value = 0.6898 plot(result2$fit,result2$res) ############################################## ### October 17 - More Multiple Regression ### ############################################## data.body=read.csv("BodyFat.csv",header=T) attach(data.body) y=fat x1=triceps x2=thigh x3=midarm results.12=lm(y~x1+x2) summary(results.12) anova(results.12) results.1=lm(y~x1) anova(results.1) # ssr(x2|x1)=ssr(x1,x2)-ssr(x1)=sse(x1)-sse(x1,x2) # f_obs=(ssr(x2|x1)/1)/(sse(x1,x2)/(n-3)) # Ho: beta_2=0 results.123=lm(y~x1+x2+x3) # ssr(x3|x1,x2)=ssr(x1,x2,x3)-ssr(x1,x2)=sse(x1,x2)-sse(x1,x2,x3) # f_obs=(ssr(x3|x1,x2)/1)/(sse(x1,x2,x3)/(n-4)) # Ho: beta_3=0 # ssr(x2,x3|x1)=sse(x1)-sse(x1,x2,x3)=ssr(x1,x2,x3)-ssr(x1) # f_obs=(ssr(x2,x3|x1)/2)/(sse(x1,x2,x3)/(n-4)) # Ho: beta_2=0 and beta_3=0 #Or you can use these commands: red=lm(y~x1) full=lm(y~x1+x2+x3) anova(red,full) # Partial F-Test for Ho: beta_2=0 and beta_3=0 ## Coefficient of Partial Determination (Oct. 19) anova(results.12) anova(results.1) # R^2(2|1)=ssr(x2|x1)/sse(x1)=33.17/143.12=0.232 # Hence, the SSE(x1) is reduced by 23.2 percent. 
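# A short sketch of computing R^2(2|1) directly from the two anova tables above:
sse.x1 =anova(results.1)$Sum[2]   # SSE(x1)
sse.x12=anova(results.12)$Sum[3]  # SSE(x1,x2)
(sse.x1-sse.x12)/sse.x1           # about 0.232, matching the value quoted above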
anova(results.123) anova(results.12) # R^2(3|12)=ssr(x3|x1,x2)/sse(x1,x2)=11.55/109.95=0.105 ## Standardized Regression Model # We use this method when det(X'X) is close to zero OR # when explanatory variables differ substantially in order of magnitude data=read.csv("DwaineStudios.csv",header=T) attach(data) x1=young x2=income y=sales n=length(sales) x1.star=((x1-mean(x1))/sd(x1))/sqrt(n-1) x2.star=((x2-mean(x2))/sd(x2))/sqrt(n-1) y.star=((y-mean(y))/sd(y))/sqrt(n-1) results.star=lm(y.star~0+x1.star+x2.star) #coef(results.star) # x1.star x2.star #0.7483670 0.2511039 b1=(sd(y)/sd(x1))*0.7484 b2=(sd(y)/sd(x2))*0.2511 b0=mean(y)-b1*mean(x1)-b2*mean(x2) #check lm(y~x1+x2) ## Multicollinearity # Uncorrelated predictors x1=c(4,4,4,4,6,6,6,6) x2=c(2,2,3,3,2,2,3,3) cor(x1,x2) y=c(42,39,48,51,49,53,61,60) anova(lm(y~x1)) anova(lm(y~x2)) anova(lm(y~x1+x2)) # Note that ssr(x2)=ssr(x2|x1) # Perfectly correlated predictors x3=c(2,8,6,10) x4=c(6,9,8,10) cor(x3,x4) coefficients(lm(x4~x3)) y2=c(23,83,63,103) # y2_hat1=-87+x3+18*x4 # y2_hat2=-7+9*x3+2*x4 # y2_hat3=-17+8*x3+4*x4 # Note that all 3 models give perfect fit. data=read.csv("BodyFat.csv",header=T) attach(data) pairs(data) cor(data) summary(lm(midarm~triceps+thigh)) x1=triceps x2=thigh x3=midarm y=fat coefficients(lm(y~x1)) coefficients(lm(y~x2)) coefficients(lm(y~x1+x2)) # Note how the beta estimates drastically changes as you coefficients(lm(y~x1+x2+x3)) # include highly correlated predictors # SSR(x1|x2) and R^2(1|2) anova(lm(y~x2+x1)) # Note how small the marginal contribution of x1 when x2 is already in the model. # SSR(x1|x2) can sometimes be bigger than SSR(x1) x1=c(5,10,5,10) x2=c(25,30,5,10) y=c(20,20,0,1) temp=data.frame(y=y,x1=x1,x2=x2) cor(temp) anova(lm(y~x2)) anova(lm(y~x1+x2)) # x1 is a suppressor variable # Predictions are still good even with multicollinearity of predictors x1=triceps x2=thigh x3=midarm y=fat results=lm(y~x1) predict(results,new=data.frame(x1=25),interval="confidence",se.fit=T) results2=lm(y~x1+x2) predict(results2,new=data.frame(x1=25,x2=50),interval="confidence",se.fit=T) results3=lm(y~x1+x2+x3) predict(results3,new=data.frame(x1=25,x2=50,x3=29),interval="confidence",se.fit=T) library(car) vif(lm(y~x1+x2+x3)) ########################################### ### October 21 - Polynomial Regression ### ########################################### n=50 x=sample(20:40,n,replace=T) y=100-30*x+0.5*x^2 + rnorm(n,mean=0,sd=10) plot(x,y) x.2=x^2 cor(x,x.2) summary(lm(y~x+x.2)) xc=x-mean(x) xc.2=xc^2 cor(xc,xc.2) results=(lm(y~xc+xc.2)) summary(results) b0=coef(results)[1] b1=coef(results)[2] b2=coef(results)[3] b0.orig=b0-b1*mean(x)+b2*(mean(x))^2 b1.orig=b1-2*b2*mean(x) b2.orig=b2 curve(b0.orig+b1.orig*x+b2.orig*x^2,20,40,add=T) y.hat=b0.orig+b1.orig*x+b2.orig*x^2 # Or use, yhat=results$fitted residuals=y-y.hat # Or use, residuals=results$res # Testing lack of fit of the quadratic regression red=lm(y~xc+xc.2) full=lm(y~factor(xc)) anova(red,full) # Power Cell Example data=read.csv("PowerCells.csv",header=T) attach(data) cor(charge,charge^2) # Note how correlated they are. cor(temperature,temperature^2) y=cycles x1=(charge-mean(charge))/.4 # Transforming will reduce the correlation x2=(temperature-mean(temperature))/10 x1.2=x1^2 # Check cor(x1,x1.2) x2.2=x2^2 # Check cor(x2,x2.2) x1x2=x1*x2 summary(lm(y~x1+x1.2+x2+x2.2+x1x2)) # Note that the interaction effect is not significant. summary(lm(y~x1+x1.2+x2+x2.2)) # Note that x2^2 is not significant. summary(lm(y~x1+x1.2+x2)) # Note that x1^2 is not significant. 
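# (Sketch) One compact way to track how R^2 drops as terms are removed from the
# Power Cells model; 'forms' is just a throwaway list of the formulas fitted here.
forms=list(y~x1+x1.2+x2+x2.2+x1x2, y~x1+x1.2+x2+x2.2, y~x1+x1.2+x2, y~x1+x2)
sapply(forms,function(f) summary(lm(f))$r.squared)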
summary(lm(y~x1+x2)) # Note how much R^2 changed as you remove variables from the model. Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 172.000 9.354 18.387 7.88e-08 *** x1 -55.833 12.666 -4.408 0.002262 ** x2 75.500 12.666 5.961 0.000338 *** # Changing back to the original data: b0.orig=172+55.833*mean(charge)/.4-75.5*mean(temperature)/10 # b0.orig=160.5825 b1.orig=-55.833/.4 # b1.orig=-139.5825 b2.orig=75.5/10 # b0.orig=7.55 y.hat=160.58-139.58*charge+7.55*temperature # Or you can test Ho:beta_11=beta_22=beta_12=0 red=lm(y~x1+x2) full=lm(y~x1+x1.2+x2+x2.2+x1x2) anova(red,full) Model 1: y ~ x1 + x2 Model 2: y ~ x1 + x1.2 + x2 + x2.2 + x1x2 Res.Df RSS Df Sum of Sq F Pr(>F) 1 8 7700.3 2 5 5240.4 3 2459.9 0.7823 0.5527 # Since the p-value=0.5527, we can conclude Ho, that no curvature and interaction effects are needed. ############################################## ### Regression with Qualitative Predictors ### ############################################## data=read.csv("Insurance.csv",header=T) # Example on page 316 attach(data) type=factor(type) # 0=Mutual; 1=Stock results=lm(months~size+type) summary(results)$coef Estimate Std. Error t value Pr(>|t|) (Intercept) 33.8740690 1.813858297 18.675146 9.145269e-13 size -0.1017421 0.008891218 -11.442990 2.074687e-09 type1 8.0554692 1.459105700 5.520826 3.741874e-05 plot(size,months,pch=as.numeric(type)) abline(33.874,-0.102,lwd=2) # For mutual firms abline(33.874+8.055,-0.102,lwd=2,col="darkred") # For stock firms legend(250,35,c("Mutual","Stock"),pch=1:2) # Interactions between quantitative and qualitative predictors results2=lm(months~size+type+size*type) summary(results2)$coef Estimate Std. Error t value Pr(>|t|) (Intercept) 33.8383694765 2.44064985 13.86449166 2.472768e-10 size -0.1015306249 0.01305254 -7.77861250 7.965637e-07 type1 8.1312501223 3.65405169 2.22526959 4.079375e-02 size:type1 -0.0004171412 0.01833121 -0.02275578 9.821265e-01 # Another example data=read.csv("Crop.csv",header=T) data=attach(data) mc=SoilMerCon # Mercury concentration in the soil (x1) cr=Crop # Type of crop (barley, corn, or wheat) (x2) pc=PlantMerCon # Mercury concentration in the plnat (y) plot(mc,pc,pch=as.numeric(cr)) # Will create scatterplots for the 3 different crops legend(1.5, 135, c("Barley","Corn","Wheat"), pch=1:3) tapply(pc[mc==6],cr[mc==6],max) # To determine which values are above mc.fac=factor(mc) # Declaring that mc is a qualitative variable results=lm(pc~mc.fac) summary(results) # another example data.fertilizer=read.csv("Fertilizer.csv",header=T) attach(data.fertilizer) plot(height,yield,pch=as.numeric(treatment)) # Will create scatterplots for the 3 different crops legend(55, 15.5, c("Control","Fast","Slow"), pch=1:3) summary(lm(yield~treatment+height+treatment:height))$coef summary(lm(yield~treatment*height))$coef Estimate Std. Error t value Pr(>|t|) (Intercept) 9.491768741 0.202459041 46.882415 4.084943e-25 treatmentf -3.519620102 0.350334683 -10.046451 4.489809e-10 treatments 3.906558043 0.295789635 13.207217 1.675966e-12 height 0.056614405 0.004265179 13.273628 1.506750e-12 treatmentf:height 0.006814587 0.006746322 1.010119 3.225144e-01 treatments:height -0.006886936 0.006084213 -1.131935 2.688450e-01 summary(lm(yield~treatment+height))$coef Estimate Std. 
Error t value Pr(>|t|) (Intercept) 9.52925636 0.133573491 71.34092 2.346301e-31 treatmentf -3.14415561 0.060373900 -52.07806 7.942625e-28 treatments 3.57163712 0.057032666 62.62441 6.817469e-30 height 0.05580995 0.002734287 20.41115 1.580422e-17 tapply(yield,treatment,mean) tapply(height,treatment,mean) #################################### ### October 28 - Model Selection ### #################################### data=read.csv("SurgicalUnit.csv",header=T) attach(data) y=survival x1=clotting x2=prognostic x3=enzyme x4=liver cor(cbind(y,x1,x2,x3,x4)) pairs(cbind(y,x1,x2,x3,x4)) # Note that some relationships look more exponential pairs(cbind(log(y),x1,x2,x3,x4)) # Note that the relationships are more linear temp=lm(y~x1+x2+x3+x4) plot(temp$fit,temp$res) # Note that the residual plot exhibits noncontancy of variance. log.y=log(y) temp=lm(log.y~x1+x2+x3+x4) plot(temp$fit,temp$res) # Note that the residual plot looks much better. dataset=data.frame(log.y=log(y),x1=x1,x2=x2,x3=x3,x4=x4) ### Criteria for Model Selection # 1. R^2 or SSE Criterion # Consider the regression model log.y~x1+x2+x3 summary(lm(log.y~x1+x2+x3))$r.sq # R^2 = 0.757 anova(lm(log.y~x1+x2+x3))$Sum[4] # SSE = 3.109 # 2. R^2_adj or MSE Criterion (Choose model with the largest R^2_adj value or smallest MSE value) # Consider the regression model log.y~x1+x2+x3 summary(lm(log.y~x1+x2+x3))$adj.r.sq # Adjusted R^2 = 0.743 anova(lm(log.y~x1+x2+x3))$Mean[4] # MSE = 0.0622 # 3. Cp (Mallow's Cp statistic) Criterion: Cp=SSE.p/MSE.full - (n-2p) (Good model should give a small Cp that is approximately equal to p, where p=no. of parameters.) # Consider the regression model log.y~x1+x2+x3 SSE.p=anova(lm(log.y~x1+x2+x3))$Sum[4] MSE.full=anova(lm(log.y~x1+x2+x3+x4))$Mean[5] n=length(log.y); p=4 # p is the no. of parameters (B0, B1, B2, B3) SSE.p/MSE.full - (n-2*p) # Cp = 3.388 #Or alternatively, you can use: extractAIC(lm(log.y~x1+x2+x3),scale=MSE.full)[2] # 4. AIC (Akaike's information criterion): AIC = n*ln(SSE.p/n)+2p (Choose model with the smallest AIC value) # Consider the regression model log.y~x1+x2+x3 SSE.p=anova(lm(log.y~x1+x2+x3))$Sum[4] n=length(log.y); p=4 # p is the no. of parameters (B0, B1, B2, B3) n*log(SSE.p/n)+2*p # AIC = -146.161 #Or alternatively, you can use: extractAIC(lm(log.y~x1+x2+x3))[2] # 5. SBC (Schwarz's Bayesian criterion): SBC = n*ln(SSE.p/n)+ln(n)*p (Choose model with the smallest SBC value) SSE.p=anova(lm(log.y~x1+x2+x3))$Sum[4] n=length(log.y); p=4 # p is the no. of parameters (B0, B1, B2, B3) n*log(SSE.p/n)+log(n)*p # SBC = -138.205 #Or alternatively, you can use: extractAIC(lm(log.y~x1+x2+x3),k=log(n))[2] # 6. PRESS criterion: PRESS.p = sum(y_i - y_i_hat(when ith observation was deleted))^2 (Choose model with the smallest value) # Consider the regression model log.y~x1+x2+x3 n=length(log.y) e2=array(0,n) for(i in 1:n){ w1=x1[-i];w2=x2[-i];w3=x3[-i] # removing the ith value of x. temp=lm(log.y[-i]~w1+w2+w3) new=data.frame(w1=x1[i],w2=x2[i],w3=x3[i]) pred=predict(temp,new) e2[i]=(log.y[i]-pred)^2 } PRESS.p=sum(e2) # PRESS.p = 3.914 #Or alternatively, you can use: fit=lm(log.y~x1+x2+x3) sum((fit$resid/(1-hatvalues(fit)))^2) ## You can get several of these criteria using the R package "leaps". 
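# If the 'leaps' package has not been installed yet (sketch):
# install.packages("leaps")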
library(leaps) log.y=log(y) dataset=data.frame(log.y=log(y),x1=x1,x2=x2,x3=x3,x4=x4) results=regsubsets(log.y~.,data=dataset,nbest=6,nvmax=4) attributes(summary(results)) outmat=summary(results)$outmat SSE=round(summary(results)$rss,3) R2=round(summary(results)$rsq,3) R2.adj=round(summary(results)$adjr2,3) Cp=round(summary(results)$cp,3) BIC=round(summary(results)$bic,3) # BIC is equivalent to SBC data.frame(outmat,SSE,R2,R2.adj,Cp,BIC) # Another example library(faraway) # will load the 'faraway' package library(help=faraway) data(seatpos) seatpos help(seatpos) reg=lm(hipcenter~.,data=seatpos) summary(reg) results2=regsubsets(hipcenter~.,data=seatpos,nbest=2) summary(results2) #################################### ### Stepwise Regression Methods ### #################################### data=read.csv("SurgicalUnit.csv",header=T) attach(data) y=survival x1=clotting x2=prognostic x3=enzyme x4=liver x5=age x6=gender x7=moderate x8=heavy log.y=log(y) dataset=data.frame(log.y,x1,x2,x3,x4,x5,x6,x7,x8) library(leaps) results=regsubsets(log.y~.,data=dataset,nbest=4,nvmax=8) attributes(summary(results)) outmat=summary(results)$outmat SSE=round(summary(results)$rss,3) R2=round(summary(results)$rsq,3) R2.adj=round(summary(results)$adjr2,3) Cp=round(summary(results)$cp,3) BIC=round(summary(results)$bic,3) # BIC is equivalent to SBC data.frame(outmat,SSE,R2,R2.adj,Cp,BIC) # Using SBC or BIC, the best model includes x1,x2,x3, and x8. # Using Cp, the best model includes x1,x2,x3,x5, and x8. # Using R2.adj, the best model includes x1,x2,x3,x5,x6, and x8. ## Forward Selection ## # Add one variable at a time. # Insert the variable that yields the smallest (t-statistic) p-value. # stage 1: x3 enters the model coefficients(summary(lm(log.y~x1))) pval.x1=coefficients(summary(lm(log.y~x1)))[2,4] pval.x2=coefficients(summary(lm(log.y~x2)))[2,4] pval.x3=coefficients(summary(lm(log.y~x3)))[2,4] pval.x4=coefficients(summary(lm(log.y~x4)))[2,4] pval.x5=coefficients(summary(lm(log.y~x5)))[2,4] pval.x6=coefficients(summary(lm(log.y~x6)))[2,4] pval.x7=coefficients(summary(lm(log.y~x7)))[2,4] pval.x8=coefficients(summary(lm(log.y~x8)))[2,4] pvals=c(pval.x1,pval.x2,pval.x3,pval.x4,pval.x5,pval.x6,pval.x7,pval.x8) which(pvals==min(pvals)) # x3 has the smallest p-value (8.38e-08) # stage 2: x2 joins the model with x3 pval.x3x1=coefficients(summary(lm(log.y~x3+x1)))[3,4] pval.x3x2=coefficients(summary(lm(log.y~x3+x2)))[3,4] pval.x3x4=coefficients(summary(lm(log.y~x3+x4)))[3,4] pval.x3x5=coefficients(summary(lm(log.y~x3+x5)))[3,4] pval.x3x6=coefficients(summary(lm(log.y~x3+x6)))[3,4] pval.x3x7=coefficients(summary(lm(log.y~x3+x7)))[3,4] pval.x3x8=coefficients(summary(lm(log.y~x3+x8)))[3,4] pvalsx3=c(pval.x3x1,pval.x3x2,pval.x3x4,pval.x3x5,pval.x3x6,pval.x3x7,pval.x3x8) which(pvalsx3==min(pvalsx3)) # x2 has the smallest p-value (2.23e-07) # stage 3: x8 joins the model with x3 and x2 pval.x3x2x1=coefficients(summary(lm(log.y~x3+x2+x1)))[4,4] pval.x3x2x4=coefficients(summary(lm(log.y~x3+x2+x4)))[4,4] pval.x3x2x5=coefficients(summary(lm(log.y~x3+x2+x5)))[4,4] pval.x3x2x6=coefficients(summary(lm(log.y~x3+x2+x6)))[4,4] pval.x3x2x7=coefficients(summary(lm(log.y~x3+x2+x7)))[4,4] pval.x3x2x8=coefficients(summary(lm(log.y~x3+x2+x8)))[4,4] pvalsx3x2=c(pval.x3x2x1,pval.x3x2x4,pval.x3x2x5,pval.x3x2x6,pval.x3x2x7,pval.x3x2x8) which(pvalsx3x2==min(pvalsx3x2)) # x8 has the smallest p-value (5.5e-06) # stage 4: x1 joins the model with x3, x2, and x8 pval.x3x2x8x1=coefficients(summary(lm(log.y~x3+x2+x8+x1)))[5,4] 
pval.x3x2x8x4=coefficients(summary(lm(log.y~x3+x2+x8+x4)))[5,4] pval.x3x2x8x5=coefficients(summary(lm(log.y~x3+x2+x8+x5)))[5,4] pval.x3x2x8x6=coefficients(summary(lm(log.y~x3+x2+x8+x6)))[5,4] pval.x3x2x8x7=coefficients(summary(lm(log.y~x3+x2+x8+x7)))[5,4] pvalsx3x2x8=c(pval.x3x2x8x1,pval.x3x2x8x4,pval.x3x2x8x5,pval.x3x2x8x6,pval.x3x2x8x7) which(pvalsx3x2x8==min(pvalsx3x2x8)) # x1 has the smallest p-value (0.00033) # stage 5: None of the remaining 4 variables can get into the model (all p-values > 0.05) pval.x3x2x8x1x4=coefficients(summary(lm(log.y~x3+x2+x8+x1+x4)))[6,4] pval.x3x2x8x1x5=coefficients(summary(lm(log.y~x3+x2+x8+x1+x5)))[6,4] pval.x3x2x8x1x6=coefficients(summary(lm(log.y~x3+x2+x8+x1+x6)))[6,4] pval.x3x2x8x1x7=coefficients(summary(lm(log.y~x3+x2+x8+x1+x7)))[6,4] pvalsx3x2x8x1=c(pval.x3x2x8x1x4,pval.x3x2x8x1x5,pval.x3x2x8x1x6,pval.x3x2x8x1x7) pvalsx3x2x8x1 # All p-values are > 0.05 --> stop # Therefore, the best model obtained using the forward selection is # log.y~x1+x2+x3+x8 # Or you can use: reg.null=lm(log.y~1,data=dataset) reg.full=lm(log.y~.,data=dataset) step(reg.null,scope=list(lower=reg.null,upper=reg.full),direction="forward") # Criterion used is AIC, not t-statistic. ## Backward Selection ## # Start with all the variables in the model, then remove one at a time. coefficients(summary(lm(log.y~x1+x2+x3+x4+x5+x6+x7+x8))) # Remove x4. coefficients(summary(lm(log.y~x1+x2+x3+x5+x6+x7+x8))) # Remove x7. coefficients(summary(lm(log.y~x1+x2+x3+x5+x6+x8))) # Remove x5. coefficients(summary(lm(log.y~x1+x2+x3+x6+x8))) # Remove x6. coefficients(summary(lm(log.y~x1+x2+x3+x8))) # All p-values are now < alpha # Therefore, the best model obtained using the forward selection is # log.y~x1+x2+x3+x8 # Or you can use: reg.full=lm(log.y~.,data=dataset) step(reg.full,direction="backward") # Criterion used is AIC, not t-statistic. ## (Forward) Stepwise Selection ## # This procedure is just like in the 'forward' selection. The only difference is, # every time you include a new variable into the model, you need to evaluate whether # any of the variables already in the model should now be removed (based on their t-statistic # p-value). See page 365 of our textbook for more information. # R command using the AIC criterion reg.null=lm(log.y~1,data=dataset) reg.full=lm(log.y~.,data=dataset) step(reg.null,scope=list(lower=reg.null,upper=reg.full),direction="both") # Criterion used is AIC, not t-statistic. ## Validation temp=sample(1:54,20) data.validation=data[temp,] data.training=data[-temp,] data.valid=read.csv("SurgicalValidation.csv",header=T) attach(data.valid) model.1=lm(log(survival)~clotting+prognostic+enzyme+heavy) MSPR=sum(model.1$residuals^2)/length(survival) # MSPR = 0.07026 coefficients(summary(lm(log.y~x1+x2+x3+x8))) #################################### ### Model Building - Diagnostics ### #################################### x1=c(14,19,12,11) x2=c(25,32,22,15) y=c(301,327,246,187) X=cbind(array(1,4),x1,x2) H=X%*%solve(t(X)%*%X)%*%t(X) # Computing the hat matrix y.hat=H%*%y # Will give the predicted values(see table on page 393) e=(diag(4)-H)%*%y # Will give the residuals h=diag(H) # Extracting the main diagonal from the hat matrix n=length(x1) MSE=sum(e^2)/(n-3) s2.e=MSE*(diag(4)-H) # Will give the variance and co-variance matrix s.e=sqrt(diag(s2.e)) r.stud=e/s.e # Will give the studentize residuals d=e/(1-h) # Will give the studentize deleted residuals # Or you can use these built-in R function: results=lm(y~x1+x2) hatvalues(results) # Will give the main diagonal of the H matrix. 
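# Quick check (sketch): the built-in hat values match diag(H) computed by hand above.
all.equal(as.numeric(hatvalues(results)),h)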
rstandard(results) # Will give the studentized residuals.
rstudent(results)  # Will give the studentized deleted residuals.

# Body fat example
data=read.csv("BodyFat.csv",header=T)
attach(data)
results.fat=lm(fat~triceps+thigh,data=data)
e=round(results.fat$residuals,3)
sse=sum(e^2)
n=length(e)
h=round(hatvalues(results.fat),3)
t=round(e*sqrt((n-3-1)/(sse*(1-h)-e^2)),3)
data.frame(residuals=e,h=h,t=t) # Table 10.3 on page 397
t.crit=qt(1-.10/(2*n),df=n-3-1)
which(abs(t)>t.crit)

# Leverage
which(h>2*mean(h)) # Note that sum(h)=p ==> mean(h)=p/n

# Deletion statistics
dff=dffits(results.fat) # Compare abs(dffits) with 1 for small to medium size data and 2*sqrt(p/n) for large data sets.
dfb=dfbetas(results.fat) # Compare abs(dfbeta) with 1 (if small or medium data) or 2/sqrt(n) (for large data sets)
cooks=cooks.distance(results.fat) # Compare with qf(.10,p,n-p) or qf(.20,p,n-p)
round(data.frame(dffits=dff,cooks=cooks,dfbetas=dfb),3)

# Index plots
plot(dff,type='b',main="DFFITS Values")
summary(influence.measures(results.fat)) # This will give a summary of influential cases.

# Variance Inflation Factor (If VIF > 10, there is a strong collinearity)
library(car) # Install and load R package 'car'
results1=lm(fat~thigh+midarm)
vif(results1)
#   thigh  midarm
# 1.00722 1.00722
# Note that all VIFs are small and very close to 1.
# Therefore, we don't have a collinearity problem.
cor(thigh,midarm) # cor=0.0846675

results2=lm(fat~triceps+thigh)
vif(results2)
#  triceps    thigh
# 6.825239 6.825239
# Note that all VIFs are < 10, but they are considerably larger than 1.
# Therefore, we see some collinearity problem.
cor(thigh,triceps) # cor=0.9238425

# Now, suppose we include all 3 predictors in the model.
results3=lm(fat~triceps+thigh+midarm)
vif(results3)
#  triceps    thigh   midarm
# 708.8429 564.3434 104.6060
# Note how large the VIFs are, indicating strong collinearity.
summary(lm(midarm~triceps+thigh))$r.sq # r.sq=0.9904403
# Note how big R^2 is, indicating the strong linear correlation
# between midarm and the other 2 predictors.

## Surgical Unit Example
data2=read.csv("SurgicalUnit.csv",header=T)
attach(data2)
y=survival
x1=clotting
x2=prognostic
x3=enzyme
x4=liver
x5=age
x6=gender
x7=moderate
x8=heavy
log.y=log(y)
results2=lm(log.y~x1+x2+x3+x8)
vif(results2) # All VIFs are small. Therefore, we don't have a collinearity problem.

##########################################
### Model Building - Remedial Measures ###
##########################################

## Method of Weighted Least Squares
# Minimize Q=sum(wi*(residuals^2)), where wi=1/sigma_i^2

## Using Robust Regression to dampen the effect of influential cases
# 1. Least Absolute Residuals (LAR) or Least Absolute Deviations (LAD) Regression
#    min(sum(abs(residuals)))
data=read.csv("BodyFat.csv",header=T)
attach(data)
results.fat=lm(fat~triceps+thigh,data=data)
library(quantreg)
rq(fat~triceps+thigh,data=data)

# 2. Least Median of Squares (LMS) Regression
#    min(median(residuals^2))

## Nonparametric Regression: Lowess (locally weighted regression scatter plot smoothing) Method
# 1. The linear regression is weighted to give cases further from the middle X level
#    in each neighborhood smaller weights
# 2. To make the procedure robust to outlying observations, the linear regression fitting
#    is repeated, with the weights revised so that cases that had large residuals in the
#    first fitting receive smaller weights in the second fitting
# 3.
To improve the robustness of the procedure further, step 2 is repeated one or more # times by revising the weights according to the size of the residuals in the latest # fitting lowess(triceps,fat) plot(lowess(triceps,fat),type='l') points(triceps,fat) par(mfrow=c(2,2)) plot(lm(fat~triceps)) ########################################### ### Autocorrelation in Time Series Data ### ########################################### u=c(0, 0.5, -0.7, 0.3, 0, -2.3, -1.9, 0.2, -0.3, 0.2, -0.1) x=0:10 # epsilon_0 = 3 eps=array(3,11) for(i in 2:11) { eps[i]=eps[i-1]+u[i] } y=2+0.5*x+eps data.frame(u,eps,y) plot(x,eps) plot(x,y,ylim=c(0,10)) abline(a=2,b=.5,col="darkgreen") abline(lm(y~x)) # epsilon_0 = -.2 u=c(0,rnorm(10)) eps=array(-.2,11) for(i in 2:11) { eps[i]=eps[i-1]+u[i] } y=2+0.5*x+eps data.frame(u,eps,y) plot(x,eps) plot(x,y,ylim=c(0,10)) abline(a=2,b=.5,col="darkgreen") abline(lm(y~x)) ## Blaisdell Company Example data=read.csv("BlaisdellCompany.csv",header=T) attach(data) results=lm(Company~Industry) summary(results)$coef # Estimate Std. Error t value Pr(>|t|) # (Intercept) -1.4547500 0.21414605 -6.793261 2.314419e-06 # Industry 0.1762828 0.00144474 122.016981 1.013155e-27 anova(results)$Mean[2] # MSE = 0.007405683 # Calculating the Durbin-Watson Statistics e=results$residuals D=sum(diff(e)^2)/sum(e^2) # D=0.7347256 (Compare with critical values from table B.7) library(car) durbinWatsonTest(results) # lag Autocorrelation D-W Statistic p-value # 1 0.6260046 0.7347256 0 # Alternative hypothesis: rho != 0 durbinWatsonTest(results,alternative="positive") # lag Autocorrelation D-W Statistic p-value # 1 0.6260046 0.7347256 0 # Alternative hypothesis: rho > 0 # Or you can use 'dwtest' from 'lmtest' package dwtest(results, alternative="greater") # Durbin-Watson test # data: results # DW = 0.7347, p-value = 0.0001748 # alternative hypothesis: true autocorrelation is greater than 0 ### Remedial Measures for Autocorrelation ## Cochrane-Orcutt Procedure n=length(e) t=2:n as.table(cbind( '(1)' = e, '(2)' = c(NA, e[t-1]), '(3)' = c(NA, e[t-1] * e[t]), '(4)' = c(NA, e[t-1]^2))) # (1) (2) (3) (4) # 1 -0.0260518571 # 2 -0.0620154481 -0.0260518571 0.0016156176 0.0006786993 # 3 0.0220209610 -0.0620154481 -0.0013656398 0.0038459158 # 4 0.1637542388 0.0220209610 0.0036060257 0.0004849227 # 5 0.0465704946 0.1637542388 0.0076261159 0.0268154507 # 6 0.0463765906 0.0465704946 0.0021597808 0.0021688110 # 7 0.0436170636 0.0463765906 0.0020228107 0.0021507882 # 8 -0.0584354347 0.0436170636 -0.0025487821 0.0019024482 # 9 -0.0943990257 -0.0584354347 0.0055162481 0.0034147000 # 10 -0.1491424634 -0.0943990257 0.0140789032 0.0089111760 # 11 -0.1479908977 -0.1491424634 0.0220717270 0.0222434744 # 12 -0.0530535560 -0.1479908977 0.0078514434 0.0219013058 # 13 -0.0229282395 -0.0530535560 0.0012164246 0.0028146798 # 14 0.1058516073 -0.0229282395 -0.0024269910 0.0005257042 # 15 0.0854637991 0.1058516073 0.0090464805 0.0112045628 # 16 0.1061022402 0.0854637991 0.0090679005 0.0073040610 # 17 0.0291124001 0.1061022402 0.0030888909 0.0112576854 # 18 0.0423164641 0.0291124001 0.0012319338 0.0008475318 # 19 -0.0441602515 0.0423164641 -0.0018687057 0.0017906831 # 20 -0.0330086858 -0.0441602515 0.0014576719 0.0019501278 e.y=e[t]; e.x=e[t-1] temp=lm(e.y~0+e.x) coef(temp) # r=0.6311636 # Or r=sum(e.y*e.x)/sum(e.x^2) # r=0.6311636 y=Company; x=Industry y.p=y[t]-r*y[t-1] x.p=x[t]-r*x[t-1] results.p=lm(y.p~x.p) summary(results.p)$coef Estimate Std. 
Error t value Pr(>|t|) (Intercept) -0.3941105 0.167229910 -2.356699 3.069370e-02 x.p 0.1737583 0.002956708 58.767478 4.432471e-21 durbinWatsonTest(results.p,alternative="positive") # lag Autocorrelation D-W Statistic p-value # 1 0.1473569 1.650248 0.154 # Alternative hypothesis: rho > 0 dwtest(results.p,alternative="greater") # Durbin-Watson test # data: results.p # DW = 1.6502, p-value = 0.1517 # alternative hypothesis: true autocorrelation is greater than 0 # Note: You can repeat this process if needed up to 3 iterations. b0=(-.3941105)/(1-.6311636) # -1.068524 b1=0.1737583 # Company = -1.0685 + 0.17376*Industry se.b0=0.16723/(1-.6311636) # 0.4533989 se.b1=0.002956708 anova(results.p)$Mean[2] # MSE = 0.0045(estimate of variance of disturbance, u_t) # Note: D is approximately equal to 2(1-r) ## Hildreth-Lu Procedure hildreth.lu=function(rho, y, x) { t=2:length(y) y.p=y[t]-rho*y[t-1] x.p=x[t]-rho*x[t-1] results.p=lm(y.p~x.p) sse=anova(results.p)$Sum[2] return(sse) } hildreth.lu(.10,Company,Industry) rho=seq(0,1,by=.01) m=length(rho) SSE=array(0,m) for(i in 1:m){ SSE[i]=hildreth.lu(rho[i],Company,Industry)} data.frame(rho,SSE) plot(rho,SSE,type='l') rho[which(SSE==min(SSE))] # 0.96 y.lu=y[t]-0.96*y[t-1] x.lu=x[t]-0.96*x[t-1] results.lu=lm(y.lu~x.lu) summary(results.lu)$coef Estimate Std. Error t value Pr(>|t|) (Intercept) 0.0711732 0.057974923 1.227655 2.363049e-01 x.p 0.1604536 0.006840127 23.457688 2.177123e-14 anova(results.lu)$Mean[2] # 0.004215952 dwtest(results.lu,alternative="greater") # Durbin-Watson test # data: results.lu # DW = 1.7254, p-value = 0.2816 # alternative hypothesis: true autocorrelation is greater than 0 # Note: This procedure does not require any iterations once # an estimate of rho is obtained. b0=0.0711732/(1-.96) # 1.77933 b1=0.1604536 # Company = 1.77933 + 0.1604536 se.b0 = 0.057974923/(1-.96) # 1.449373 se.b1 = 0.00684 ## First Differences Procedure (Use rho = 1) y=Company; x=Industry n=length(y) t=2:n y.diff=y[t]-y[t-1] # Or y.dff=diff(y) x.diff=x[t]-x[t-1] results.diff=lm(y.diff~0+x.diff) summary(results.diff)$coef # Estimate Std. Error t value Pr(>|t|) # x.diff 0.1684878 0.005096004 33.06272 1.435445e-17 anova(results.diff)$Mean[2] # 0.00481522 dwtest(results.diff,alternative="two.sided") # Durbin-Watson test # data: results.diff # DW = 1.7389, p-value = 0.6558 # alternative hypothesis: true autocorrelation is not 0 b0=mean(y)-0.1684878*mean(x) # 1.128648 b1=0.1684878 # Company = -0.3040115 + 0.1684878*Industry # Forecasting sqrt(0.0045*(1+1/20+(66.929-mean(x.p))^2/sum((x.p-mean(x.p))^2))) ######################################## ### (11/30/16) Nonlinear Regression ### ######################################## # 1. 
Exponential Regression Models # Example of Type I curve(5*exp(x/2),0,10) x=sample(1:10,100,replace=T) y=5*exp(x/2)+rnorm(100,mean=0,sd=20) plot(x,y) nls(y~b*exp(a*x),start=list(a=1,b=3)) curve(4.8003*exp(x*0.5049),0,10,add=T,col="darkred",lwd=2) data=read.csv("InjuredPatients.csv",header=T) attach(data) Y=index; X=days results=nls(Y~b*exp(a*X),start=list(a=0,b=50)) plot(X,Y,xlab="Days Hospitalized",ylab="Prognostic Index") curve(58.60653*exp(-0.03959*x),0,70,add=T,col="darkred",lwd=2) Y.fitted=58.60653*exp(-0.03959*X) residuals=Y-Y.fitted plot(Y.fitted,residuals,main="Residual Plot") # Or you can also use the 'glm' function log.lin <- glm(Y~X, family=quasi(link="log", variance="constant")) coef(log.lin) # (Intercept) X # 4.07084672 -0.03958645 # Note: exp(4.07084672)=58.60656 # Example of Type II curve(100-50*exp(-2*x),0,2) x2=runif(100,0,2) y2=100-50*exp(-2*x2)+rnorm(100,mean=0,sd=2) plot(x2,y2) results2=nls(y2~a+b*exp(c*x2),start=list(a=1,b=-1,c=-1)) round(coef(results2),2) # a b c # 100.32 -50.05 -1.98 curve(100.32-50.05*exp(x*(-1.98)),0,2,add=T,col="darkred",lwd=2) # 2. Logistic Regression Models curve(10/(1+20*exp(-2*x)),0,4,lwd=2) x=sample(0:5,100,replace=T) y=10/(1+20*exp(-2*x))+rnorm(100,mean=0,sd=.5) plot(x,y) nls(y~c/(1+b*exp(a*x)),start=list(a=-2,b=20,c=10)) curve(10.131/(1+16.623*exp(-1.855*x)),0,5,col="darkred",lwd=2,add=T) # Problem 13.10: Enzyme Kinetics data.kinetics=read.csv("data_kinetics.csv",header=T) attach(data.kinetics) y=velocity; x=concentration y.p=1/y x.p=1/x results=lm(y.p~x.p) #bo=0.03376 and b1=0.45401 nls(y~a*x/(b+x),start=list(a=1/0.03376,b=0.454/0.03376)) plot(concentration,velocity) curve(28.14*x/(12.57+x),0,40,add=T,lwd=2) y.fitted=28.14*x/(12.57+x) residuals=y-y.fitted plot(y.fitted,residuals) ############################################ ### (11/25/12) Generalized Linear Models ### ############################################ # 1. 
#    Probit Mean Response Function
curve(pnorm(0+1*x),-3,3,lwd=2)
curve(pnorm(0+2*x),-3,3,lwd=2,col="red",add=T)
curve(dnorm(x,0,1),-3,3)
curve(exp(x)/(1+exp(x))^2,-3,3,add=T,col="red")
curve(dnorm(x,0,sd=pi/sqrt(3)),-3,3,add=T,col="blue")

# Simulation example:
n=1000
x=sample(0:10,n,replace=T)
line=0.8*x-4
pies=pnorm(line)            # True success probabilities
plot(x,pies)
success=(runif(n)<pies)*1   # Simulated 0/1 responses
# (Assumed step, so the classification lines below are self-contained:
#  fit a probit GLM and call the fitted object 'results'.)
results=glm(success~x,family=binomial(link="probit"))
prediction=as.numeric(results$fitted>.7) # Using 0.7 as cut off
error=success-prediction
data.frame(success,predicted=results$fitted,prediction,error)
percent.error=sum(abs(error))/length(success)
percent.error
false.pos=sum((success==0)*(prediction==1))
false.neg=sum((success==1)*(prediction==0))

# Finding the best cut off value
cutoff=seq(0,1,by=.05)
m=length(cutoff)
n=length(success)
False.pos=array(99,m)
False.neg=array(99,m)
Errors=array(99,m)
for(i in 1:m) {
  prediction=as.numeric(results$fitted>cutoff[i])
  error=success-prediction
  False.pos[i]=sum((success==0)*(prediction==1))
  False.neg[i]=sum((success==1)*(prediction==0))
  Errors[i]=False.pos[i]+False.neg[i] }
Percent.error=Errors/n
data.frame(CutOff=cutoff,FalsePositive=False.pos,FalseNegative=False.neg,ErrorRate=Percent.error)

# Hosmer-Lemeshow Goodness of Fit Test
library(ResourceSelection)
hoslem.test(success,results$fitted)

###############################################################
### (11/27/12) Multiple logistic regression and Diagnostics ###
###############################################################

# Multiple logistic regression
dystrophy=read.csv("Dystrophy.csv",header=T)
attach(dystrophy)
dystrophy[!complete.cases(dystrophy),] # lists rows with missing values
dys.naomit=na.omit(dystrophy)          # creates new data set without missing data
temp=which(is.na(PK)=="TRUE")
results.dys <- glm(carrier ~ AGE + M + CK + H + PK + LD, data = dystrophy, family = binomial(link="logit"))
results.dys <- glm(carrier ~ AGE + M + CK + H + PK + LD, data = dys.naomit, family = binomial(link="logit"))
carrier.naomit=dys.naomit[,10]
hoslem.test(carrier.naomit,results.dys$fit)
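# (Sketch) Interpreting the fitted logistic model on the odds scale, assuming
# 'carrier' is coded 0/1: exp(b_k) is the multiplicative change in the odds of
# being a carrier for a one-unit increase in predictor k.
round(exp(coef(results.dys)),4)
round(exp(confint.default(results.dys)),4)  # Wald-type C.I.'s on the odds-ratio scale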