###################################### ### Lecture R commands - Fall 2016 ### ###################################### ################################################# ### Day 1 - September 7 - Writing R functions ### ################################################# ss.xx=function(x){sum(x^2)-(sum(x))^2/length(x)} ss.yy=function(y){sum(y^2)-(sum(y))^2/length(y)} # Equivalent to ss.xx(y) ss.xy=function(x,y){sum(x*y)-sum(x)*sum(y)/length(x)} data.jog=read.csv("Jogging.csv",header=T) attach(data.jog) ssxx=ss.xx(Distance) # 19.45455 ssyy=ss.xx(Time) # 3025.091 ssxy=ss.xy(Distance,Time) # 241.6364 r=ssxy/sqrt(ssxx*ssyy) # 0.9960532 corr=function(x,y) { ss.xy=function(x,y){sum(x*y)-sum(x)*sum(y)/length(x)} ssxy=ss.xy(x,y) ssxx=ss.xy(x,x) ssyy=ss.xy(y,y) r=ssxy/sqrt(ssxx*ssyy) list(corr=r,SSxx=ssxx,SSyy=ssyy,SSxy=ssxy) } corr(Distance,Time) ########################################## ### Day 2 - September 9 - Correlation ### ########################################## # Tree Height Example data.tree=read.csv("TreeHeight.csv",header=T) head(data.tree) #Lists the variable names and their first 6 values attach(data.tree) source("C:\\Math445_Fall2013\\corr.R") #Sourcing the 'corr.R' function we created corr(Circumferenc,Height) #Using the 'corr' function cor(Circumference,Height) #This is the built-in correlation function # Jogging Example data.jog=read.csv("Jogging.csv",header=T) head(data.jog) attach(data.jog) plot(Distance,Time) corr(Distance,Time) cor(Distance,Time) ######################################################## ### Day 3 - September 12 - Simple Linear Regression ### ######################################################## # Jogging Example data.jog=read.csv("Jogging.csv",header=T) attach(data.jog) plot(Distance,Time) #Source in 'Corr' function output=corr(Distance,Time) attributes(output) output ssxy=output$SSxy ssxx=output$SSxx b1=ssxy/ssxx b0=mean(Time)-b1*mean(Distance) plot(Distance,Time) abline(b0,b1,lwd=2) # lwd=2 creates a thicker line results=lm(Time~Distance) attributes(results) results$coef # Or use 'coef(results)' plot(Distance,Time) abline(results) # Estimating sigma attributes(results) residuals=results$res sse=sum(residuals^2) # sse=23.83178 n=length(Distance) mse=sse/(n-2) # mse=1.19 sigma.error=sqrt(mse) # 1.09 predicted=results$fit data.frame(Distance,Time,Predicted=predicted,Residuals=round(residuals,3)) temp=which(abs(residuals)>1.5) Time[temp] summary(results) #################################################### ### Day 4 - September 14 - Inference about slope ### #################################################### # Production Run Example size=c(30,20,60,80,40,50,60,30,70,60) hours=c(73,50,128,170,87,108,135,69,148,132) lm.size.hours=lm(hours~size) coef(lm.size.hours) plot(size,hours) abline(lm.size.hours) # Estimating sigma attributes(lm.size.hours) residuals=lm.size.hours$res sse=sum(residuals^2) # sse=60 n=length(size) mse=sse/(n-2) # mse=7.5 sigma.error=sqrt(mse) # 2.738613 predicted=lm.size.hours$fitted data.frame(Size=size,Hours=hours,Predicted=predicted,Residuals=round(residuals,3)) which(abs(residuals)>2*sigma.error) summary(lm.size.hours) anova(lm.size.hours) sum(residuals) #Verifying property #1 sum(hours);sum(predicted) #Verifying property #3 sum(size*residuals) #Verifying property #4 sum(predicted*residuals) #Verifying property #5 10+2*mean(size);mean(hours) #Verifying property #6 # Inference concerning beta_1 # Source in 'corr.R' n=length(size) SSxx=corr(size,hours)$SSxx # 3400 se.b1=sqrt(mse/SSxx) # 0.04696682 t.b1=2/se.b1 # 42.58325 
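# Extra check (sketch): compare t.b1 with the two-sided t critical value
# before computing the p-value below.
t.crit=qt(1-.05/2,df=n-2)  # 2.306 when n=10
abs(2/se.b1)>t.crit        # TRUE, so reject Ho: beta_1=0 at alpha=.05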
pvalue.b1=2*pt(-42.58325,df=n-2) confint(lm.size.hours) ############################################################# ### Day 5 - September 16 - Prediction and Confidence Band ### ############################################################# size=c(30,20,60,80,40,50,60,30,70,60) hours=c(73,50,128,170,87,108,135,69,148,132) lm.size.hours=lm(hours~size) coef(lm.size.hours) plot(size,hours) abline(lm.size.hours) # Confidence intervals for the mean response new=data.frame(size=100) predict(lm.size.hours,newdata=new) # the predicted value of a new observation # This will give the mean C.I. for the new data predict(lm.size.hours,newdata=new,interval="confidence") new2=data.frame(size=c(90,100)) # new2 contains only the 2 new observations # This will give the mean C.I. for the 2 new data predict(lm.size.hours,newdata=new2,interval="confidence") # Prediction interval for future values predict(lm.size.hours,newdata=new2,interval="prediction") # new2 contains only the 2 new observations predict(lm.size.hours,newdata=new2,interval="prediction",level=.99) # The default level is 0.95 # Confidence Band lm.size.hours=lm(hours~size) CI=predict(lm.size.hours,se.fit=TRUE) # se.fit=SE(mean) W=sqrt(2*qf(0.95,2,8)) band.lower=CI$fit - W*CI$se.fit band.upper=CI$fit + W*CI$se.fit band=cbind(band.lower,band.upper) plot(size, hours, xlab="Production Size", ylab="Work Hours", main="Confidence Band") abline(lm.size.hours) points(sort(size), sort(band.lower), type="l", lty=2) points(sort(size), sort(band.upper), type="l", lty=2) # Or alternatively, you can use the following command: # source in 'confidence.band.R' confidence.band(lm.size.hours) # Prediction Band mse=anova(lm.size.hours)$Mean[2] se.pred=sqrt(CI$se.fit^2+mse) band.lower.pred=CI$fit - W*se.pred band.upper.pred=CI$fit + W*se.pred points(sort(size),sort(band.lower.pred),type="l",lwd=2,lty=2,col="Red") points(sort(size),sort(band.upper.pred),type="l",lwd=2,lty=2,col="Red") ############################################################################## ### Day 6 - September 19 - ANOVA Approach and Coefficient of Determination ### ############################################################################## size=c(30,20,60,80,40,50,60,30,70,60) hours=c(73,50,128,170,87,108,135,69,148,132) lm.size.hours=lm(hours~size) residuals=lm.size.hours$residuals predicted=lm.size.hours$fitted n=length(size) ssto=sum((hours-mean(hours))^2) # ssto=13660 sse=sum(residuals^2) # sse=60 ssr=sum((predicted-mean(hours))^2) # ssr=13600, or ssr=ssto-sse msr=ssr/1 # msr=13600 mse=sse/(n-2) # mse=7.5 f.obs=msr/mse # f.obs=1813.333 p.value=pf(f.obs,df1=1,df2=n-2,lower.tail=F) # p.value=1.019588e-10 # Alternatively, you can use the 'anova' function anova(lm.size.hours) plot(size,(predicted-mean(hours))) points(size,residuals,pch=19) # This will superimpose these points to the plot R2=ssr/ssto # R2=0.9956076 # Therefore, about 99.56% of the variability of Hours (Y) can be explained by the # regression line. ## Test for Correlations cor(size,hours) # Computes the Pearson correlation coefficient, r cor.test(size,hours) # Tests Ho:rho=0 and also constructs C.I. for rho cor(size,hours,method="spearman") # Computes the Spearman's correlation coefficient cor.test(size,hours,method="spearman") # Test of independence using the Spearman Rank correlation cor(size,hours,method="kendall") # Computes Kendall's Tau cor.test(size,hours,conf.level=.99) # Tests Ho:rho=0 and also constructs C.I. 
for rho ################################## ### September 23 - Diagnostics ### ################################## rm(list=ls()) # Clears the workspace data=read.csv("Toluca.csv",header=T) attach(data) # Diagnostics for Predictor Variable table(size) # Creates a frequency table barplot(table(size)) # Creates a frequency bargraph boxplot(size) # Creates a boxplot (good for outlier detection) size.out=c(size,200) boxplot(size.out) # Note the presence of the outlier plot(size,type="b") # Creates a time/sequence plot hist(size) # Creates a histogram bin=seq(15,125,by=10) hist(size,breaks=bin,col="gray") stem(size) # Creates a stem-and-leaf-plot stem(size,scale=3) # Dot plots stripchart(size, method = "stack", offset = 1, at = 0, pch = 19, main = "Dotplot", xlab = "Lot Size") plot(size, hours) results=lm(hours~size) abline(results,col="blue") ## Departures from the model ## ## 1. The regression function is not linear. # Plot residuals againsts predictor OR residuals againsts fitted values plot(size,results$residuals) # Note that the residual plot doesn't show any systematic pattern. plot(results$fitted,results$residuals) # Again, no systematic pattern. mse=anova(results)$Mean[2] # use=2384 e.star=results$residuals/sqrt(mse) plot(results$fitted,e.star) # A non-linear example z=rnorm(length(size),0,100) x=size y=x^2+z plot(x,y) # Note that the relationship is curvilinear cor(x,y) # Note that r is pretty high! temp=lm(y~x) plot(temp$fitted,temp$res) # Note the pattern in this residual plot x2=x^2 temp2=lm(y~x2) plot(temp2$fitted,temp2$res) ## 2. The error terms are not normally distributed. # Plots of residuals againsts omitted predictor variables # Box plot of residuals # Normal probability plots of residuals qqnorm(results$residuals) qqline(results$residuals) shapiro.test(results$residuals) qqnorm(temp$residuals) qqline(temp$residuals) shapiro.test(temp$residuals) ## 3. The error terms do not have constant variance. # Plot of absolute or squared residuals against predictor variable z=rnorm(length(size),0,100*sqrt(size)) x=size y=100+25*x+z plot(x,y) # Note the increasing variation temp3=lm(y~x) plot(temp3$fitted,temp3$res) # Note the increasing variation ## 4. The error terms are not independent. # Sequence plot of the residuals # Plot of residuals againsts time or other sequence ## 5. The model fits all but one or a few outlier observations. # Plot residuals againsts predictor OR residuals againsts fitted values # Box plots, stemplots, dot plots of residuals ## 6. One or several important predictor variables have been omitted from the model. 
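# A small illustration (sketch) of departure #6: if a relevant predictor z is
# omitted, the residuals plotted against z show a systematic pattern.
# (z, y.om, and fit.om are made-up names for this simulated example.)
z=rnorm(length(size),0,10)
y.om=100+25*size+15*z+rnorm(length(size),0,50)
fit.om=lm(y.om~size)          # model that leaves z out
plot(z,fit.om$residuals)      # clear increasing trend in z
plot(size,fit.om$residuals)   # no pattern against the included predictor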
## Brown-Forsythe Test bf=function(x,y,x.med=median(x)){ results=lm(y~x) e=results$residuals # x.med=median(x) e1=e[x<=x.med]; n1=length(e1) e2=e[x>x.med]; n2=length(e2) d1=abs(e1-median(e1)) d2=abs(e2-median(e2)) s2=(sum((d1-mean(d1))^2)+sum((d2-mean(d2))^2))/(n1+n2-2) t.bf=(mean(d1)-mean(d2))/(sqrt(s2)*sqrt(1/n1+1/n2)) p.val=2*pt(abs(t.bf),df=(n1+n2-2),lower.tail=F) list(t.bf=t.bf,p.val=p.val)} # Example: Toluca data # Source in the function bp.R bf(size,hours) # t_obs=1.3165 and p-value=0.201 # Example: Simulated data with increasing variance x=sample(seq(20,120,by=10),100,replace=T) z=rnorm(length(x),0,200*sqrt(x)) y=(62.37+3.57*x)+z plot(x,y) # Note the increasing variation bf(x,y) ## Breusch-Pagan Test (Also known as Cook-Weisberg test) # load the package 'lmtest' library(lmtest) bptest(hours~size,studentize=F) Example 2: x=1:50 y=(1:50)*rnorm(50) bf(x,y) bptest(y~x) ################################################ ### Day 10 - September 28 - Lack of Fit Test ### ################################################ # Example: Bank data (page 120) x=c(125,100,200,75,150,175,75,175,125,200,100) y=c(160,112,124,28,152,156,42,124,150,104,136) results=lm(y~x) anova(results) reduced.mod=lm(y~x) full.mod=lm(y~factor(x)) anova(reduced.mod,full.mod) >anova(results) Analysis of Variance Table Response: y Df Sum Sq Mean Sq F value Pr(>F) x 1 5141.3 5141.3 3.1389 0.1102 Residuals 9 14741.6 1638.0 > anova(reduced.mod,full.mod) Analysis of Variance Table Model 1: y ~ x Model 2: y ~ factor(x) Res.Df RSS Df Sum of Sq F Pr(>F) 1 9 14742 2 5 1148 4 13594 14.801 0.005594 ** ############################################### ### Day 11 - September 30 - Transformations ### ############################################### # Transforming the X variable # Example (page 129) x=c(.5,.5,1,1,1.5,1.5,2,2,2.5,2.5) y=c(42.5,50.6,68.5,80.7,89,99.6,105.3,111.8,112.3,125.7) results=lm(y~x) summary(results) # R^2=0.9256 plot(x,results$res) # note the curve pattern qqnorm(results$res) # no evidence of non-normality of residuals bf(x,y) # no evidence of deviation xp=sqrt(x) results=lm(y~xp) summary(results) # R^2=0.9545 - note the improvement plot(xp,results$res) qqnorm(results$res) # Transforming the Y variable # Example: Plasma (page 132) data=read.csv("Plasma.csv",header=T) attach(data) plot(Age,Plasma) results=lm(Plasma~Age) plot(Age,results$res) qqnorm(results$res) log10plasma=log(Plasma,base=10) plot(Age,log10plasma) results2=lm(log10plasma~Age) plot(Age,results2$res) qqnorm(results2$res) ## Box Cox Transformation y=Plasma;x=Age lambda=seq(-2,2,by=.1) lambda[21]=0.01 # Just to avoid the special case, lambda=0 iter=length(lambda) SSEs=array(0,iter) # Storage for the SSEs k2=(prod(y))^(1/length(y)) for(i in 1:iter){ k1=1/(lambda[i]*k2^(lambda[i]-1)) w=k1*(y^lambda[i]-1) results=lm(w~x) SSEs[i]=anova(results)$Sum[2]} w=k2*(log(y)) # This is for the special case when lambda=0 results=lm(w~x) SSEs[21]=anova(results)$Sum[2] lambda[21]=0 cbind(lambda,SSEs) # Choose lambda that minimizes the SSE plot(lambda,SSEs) # load package 'MASS' library(help="MASS") boxcox(y~x) # Check the resulting plot # Lowess Method data.mm=read.csv("MuscleMass.csv",header=T) attach(data.mm) plot(age,mass) lines(lowess(age,mass)) ############################################ ### October 3 - Simultaneous Inferences ### ############################################ ## Bonferroni Joint Estimation of beta_0 and beta_1 data=read.csv("Toluca.csv",header=T) attach(data) results=lm(hours~size) confint(results,level=.90) coefficients(summary(results)) 
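# Side note (sketch): the coefficient summary is a matrix, so its entries can be
# pulled out by row/column name as well as by position (as done just below), e.g.
coefficients(summary(results))["size","Std. Error"]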
b0=coefficients(summary(results))[1,1]    # Extracting the value of b0
se.b0=coefficients(summary(results))[1,2] # The standard error of b0
b1=coefficients(summary(results))[2,1]    # Extracting the value of b1
se.b1=coefficients(summary(results))[2,2] # The standard error of b1

# The 90% joint confidence intervals for beta_0 and beta_1:
B=qt(1-.10/(2*2),df=23) # In general, B=qt(1-.10/(2*k),df), k=no. of intervals
lower.b0=b0-B*se.b0;upper.b0=b0+B*se.b0
lower.b1=b1-B*se.b1;upper.b1=b1+B*se.b1
# Or you can make the Bonferroni adjustment here
confint(results,level=(1-.10/2)) # In general, confint(results,level=(1-alpha/k))

## Simultaneous Estimation of Mean Responses
new=data.frame(size=c(30,65,100))
ci=predict(results,new,interval="confidence",se.fit=T)

# Using Working-Hotelling Procedure
# to construct 90% joint confidence intervals for the mean response at the 3 x levels.
W2=2*qf(1-.10,2,23)
ci$fit[1]-sqrt(W2)*ci$se[1]; ci$fit[1]+sqrt(W2)*ci$se[1] # at size=30
ci$fit[2]-sqrt(W2)*ci$se[2]; ci$fit[2]+sqrt(W2)*ci$se[2] # at size=65
ci$fit[3]-sqrt(W2)*ci$se[3]; ci$fit[3]+sqrt(W2)*ci$se[3] # at size=100

# Using Bonferroni Procedure
B=qt(1-.10/(2*3),23)
ci$fit[1]-B*ci$se[1]; ci$fit[1]+B*ci$se[1] # at size=30
ci$fit[2]-B*ci$se[2]; ci$fit[2]+B*ci$se[2] # at size=65
ci$fit[3]-B*ci$se[3]; ci$fit[3]+B*ci$se[3] # at size=100
# Or you can make the Bonferroni adjustment here
predict(results,new,interval="confidence",level=(1-.10/3))

## Simultaneous Prediction Intervals for New Observations
data=read.csv("data_Toluca.csv",header=T)
attach(data)
results=lm(hours~size)
MSE=anova(results)$Mean[2]
new=data.frame(size=c(80,100))
ci=predict(results,new,interval="prediction",se.fit=T)
se.fit.pred=sqrt(MSE*(1+(ci$se.fit^2/MSE)))

# Scheffe Procedure
S=sqrt(2*qf(1-.05,2,23)) # In general, use S=sqrt(k*qf(1-.05,k,n-2))
# If k=2:10, look at plot(k,sqrt(k*qf(1-.05,k,n-2)))
lwr.scheffe=ci$fit[,1]-S*se.fit.pred
upr.scheffe=ci$fit[,1]+S*se.fit.pred
cbind(ci$fit,lwr.scheffe,upr.scheffe)

# Bonferroni Procedure
ci.bon=predict(results,new,interval="prediction",level=(1-.05/2))

## Regression through Origin
data=read.csv("Warehouse.csv",header=T) # Example in the book (page 162)
attach(data)
n=length(cost)
b1=sum(units*cost)/sum(units^2)
fitted=b1*units
residuals=cost-fitted
SSE=sum(residuals^2)
MSE=SSE/(n-1)
se.b1=sqrt(MSE/sum(units^2))
# The 95% C.I. for beta_1 is
lwr=b1-qt(.975,n-1)*se.b1
upr=b1+qt(.975,n-1)*se.b1
# Or you can use the following R command:
results=lm(cost~0+units)
summary(results)
confint(results)
anova(results)

#####################################
### October 12 - Matrix Approach  ###
#####################################
A=matrix(1:6,nrow=2)
B=matrix(c(-1,0,2,1,4,3),nrow=2)
A
B
A+B  # Matrix addition = addition of corresponding entries
A-B
t(A) # transpose of A
D=t(A)%*%A # To do matrix multiplication, use %*%
E=A%*%t(A) # Note that E is not equal to D.
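# Quick check (sketch): D and E cannot be equal because they do not even have
# the same dimensions; A is 2x3, so D=t(A)%*%A is 3x3 while E=A%*%t(A) is 2x2.
dim(D); dim(E)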
E.inv=solve(E) # Computing the inverse of a square matrix I=E.inv%*%E # The product of E and it's inverse is the Identity matrix data=read.csv("Toluca.csv",header=T) attach(data) x=size; y=hours n=length(x) t(y)%*%y # Sum of squares of yi's X=matrix(c(rep(1,n),x),ncol=2) # The design matrix X t(X)%*%X t(X)%*%y solve(t(X)%*%X) B=solve(t(X)%*%X)%*%(t(X)%*%y) # Estimates of the regression coefficients coefficients(lm(y~x)) # Note that you get these results fitted=X%*%B ######################### ## Multiple Regression ## ######################### data=read.csv("Dwaine.csv",header=T) # Dwaine Studios example on page 236 attach(data) y=sales # Sales of portraits of children in a community x1=young # Number of persons aged 16 or younger in the community x2=income # Per capita disposable personal income in the community n=length(sales) pairs(cbind(x1,x2,y)) # Creates a scatter plot matrix pairs(data) cor(data) # Creates a correlation matrix X=matrix(c(rep(1,n),x1,x2),ncol=3) # The design matrix X t(X)%*%X B=solve(t(X)%*%X)%*%(t(X)%*%y) # Estimates of the regression coefficientspair #Or we can use results=lm(y~x1+x2) summary(results) # Diagnostics par(mfrow=c(2,2)) plot(results$fitted,results$residuals) plot(x1,results$residuals) plot(x2,results$residuals) plot(x1*x2,results$residuals) # Checking for interaction effects plot(results$fitted,abs(results$residuals)) # Checking constant variance qqnorm(results$residuals) # Checking normality shapiro.test(results$residuals) anova(results) # F Test for Regression Relation # Coefficient of multiple determination # Adjusted coefficient of multiple determination # Coefficient of multiple correlation #Prediction new=data.frame(x1=65.4,x2=17.6) predict(results,new) predict(results,new,interval="confidence") predict(results,new,interval="prediction") new2=data.frame(x1=c(65.4,53.1),x2=c(17.6,17.7)) predict(results,new2,interval="prediction",level=(1-.10/2)) # Bonferroni Intervals # Health Exam Data Example data.health=read.csv("HealthExam.csv",header=T) head(data.health) attach(data.health) result2=lm(Weight~Waist+Height) summary(result2) Weight1=-201.5717+2.1565*80+2.5978*60 # 126.8163 Weight2=-201.5717+2.1565*90+2.5978*70 # 174.3593 predict(result2,newdata=data.frame(Waist=c(80,90),Height=c(60,70))) confint(result2,level=.99) # Model Assessments qqnorm(result2$res);qqline(result2$res) shapiro.test(result2$res) # W = 0.9884, p-value = 0.6898 plot(result2$fit,result2$res) ############################################## ### October 17 - More Multiple Regression ### ############################################## data.body=read.csv("BodyFat.csv",header=T) attach(data.body) y=fat x1=triceps x2=thigh x3=midarm results.12=lm(y~x1+x2) summary(results.12) anova(results.12) results.1=lm(y~x1) anova(results.1) # ssr(x2|x1)=ssr(x1,x2)-ssr(x1)=sse(x1)-sse(x1,x2) # f_obs=(ssr(x2|x1)/1)/(sse(x1,x2)/(n-3)) # Ho: beta_2=0 results.123=lm(y~x1+x2+x3) # ssr(x3|x1,x2)=ssr(x1,x2,x3)-ssr(x1,x2)=sse(x1,x2)-sse(x1,x2,x3) # f_obs=(ssr(x3|x1,x2)/1)/(sse(x1,x2,x3)/(n-4)) # Ho: beta_3=0 # ssr(x2,x3|x1)=sse(x1)-sse(x1,x2,x3)=ssr(x1,x2,x3)-ssr(x1) # f_obs=(ssr(x2,x3|x1)/2)/(sse(x1,x2,x3)/(n-4)) # Ho: beta_2=0 and beta_3=0 #Or you can use these commands: red=lm(y~x1) full=lm(y~x1+x2+x3) anova(red,full) # Partial F-Test for Ho: beta_2=0 and beta_3=0 ## Coefficient of Partial Determination (Oct. 19) anova(results.12) anova(results.1) # R^2(2|1)=ssr(x2|x1)/sse(x1)=33.17/143.12=0.232 # Hence, the SSE(x1) is reduced by 23.2 percent. 
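# A short sketch of computing R^2(2|1) directly from the two anova tables above:
sse.x1 =anova(results.1)$Sum[2]   # SSE(x1)
sse.x12=anova(results.12)$Sum[3]  # SSE(x1,x2)
(sse.x1-sse.x12)/sse.x1           # about 0.232, matching the value quoted above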
anova(results.123) anova(results.12) # R^2(3|12)=ssr(x3|x1,x2)/sse(x1,x2)=11.55/109.95=0.105 ## Standardized Regression Model # We use this method when det(X'X) is close to zero OR # when explanatory variables differ substantially in order of magnitude data=read.csv("DwaineStudios.csv",header=T) attach(data) x1=young x2=income y=sales n=length(sales) x1.star=((x1-mean(x1))/sd(x1))/sqrt(n-1) x2.star=((x2-mean(x2))/sd(x2))/sqrt(n-1) y.star=((y-mean(y))/sd(y))/sqrt(n-1) results.star=lm(y.star~0+x1.star+x2.star) #coef(results.star) # x1.star x2.star #0.7483670 0.2511039 b1=(sd(y)/sd(x1))*0.7484 b2=(sd(y)/sd(x2))*0.2511 b0=mean(y)-b1*mean(x1)-b2*mean(x2) #check lm(y~x1+x2) ## Multicollinearity # Uncorrelated predictors x1=c(4,4,4,4,6,6,6,6) x2=c(2,2,3,3,2,2,3,3) cor(x1,x2) y=c(42,39,48,51,49,53,61,60) anova(lm(y~x1)) anova(lm(y~x2)) anova(lm(y~x1+x2)) # Note that ssr(x2)=ssr(x2|x1) # Perfectly correlated predictors x3=c(2,8,6,10) x4=c(6,9,8,10) cor(x3,x4) coefficients(lm(x4~x3)) y2=c(23,83,63,103) # y2_hat1=-87+x3+18*x4 # y2_hat2=-7+9*x3+2*x4 # y2_hat3=-17+8*x3+4*x4 # Note that all 3 models give perfect fit. data=read.csv("BodyFat.csv",header=T) attach(data) pairs(data) cor(data) summary(lm(midarm~triceps+thigh)) x1=triceps x2=thigh x3=midarm y=fat coefficients(lm(y~x1)) coefficients(lm(y~x2)) coefficients(lm(y~x1+x2)) # Note how the beta estimates drastically changes as you coefficients(lm(y~x1+x2+x3)) # include highly correlated predictors # SSR(x1|x2) and R^2(1|2) anova(lm(y~x2+x1)) # Note how small the marginal contribution of x1 when x2 is already in the model. # SSR(x1|x2) can sometimes be bigger than SSR(x1) x1=c(5,10,5,10) x2=c(25,30,5,10) y=c(20,20,0,1) temp=data.frame(y=y,x1=x1,x2=x2) cor(temp) anova(lm(y~x2)) anova(lm(y~x1+x2)) # x1 is a suppressor variable # Predictions are still good even with multicollinearity of predictors x1=triceps x2=thigh x3=midarm y=fat results=lm(y~x1) predict(results,new=data.frame(x1=25),interval="confidence",se.fit=T) results2=lm(y~x1+x2) predict(results2,new=data.frame(x1=25,x2=50),interval="confidence",se.fit=T) results3=lm(y~x1+x2+x3) predict(results3,new=data.frame(x1=25,x2=50,x3=29),interval="confidence",se.fit=T) library(car) vif(lm(y~x1+x2+x3)) ########################################### ### October 21 - Polynomial Regression ### ########################################### n=50 x=sample(20:40,n,replace=T) y=100-30*x+0.5*x^2 + rnorm(n,mean=0,sd=10) plot(x,y) x.2=x^2 cor(x,x.2) summary(lm(y~x+x.2)) xc=x-mean(x) xc.2=xc^2 cor(xc,xc.2) results=(lm(y~xc+xc.2)) summary(results) b0=coef(results)[1] b1=coef(results)[2] b2=coef(results)[3] b0.orig=b0-b1*mean(x)+b2*(mean(x))^2 b1.orig=b1-2*b2*mean(x) b2.orig=b2 curve(b0.orig+b1.orig*x+b2.orig*x^2,20,40,add=T) y.hat=b0.orig+b1.orig*x+b2.orig*x^2 # Or use, yhat=results$fitted residuals=y-y.hat # Or use, residuals=results$res # Testing lack of fit of the quadratic regression red=lm(y~xc+xc.2) full=lm(y~factor(xc)) anova(red,full) # Power Cell Example data=read.csv("PowerCells.csv",header=T) attach(data) cor(charge,charge^2) # Note how correlated they are. cor(temperature,temperature^2) y=cycles x1=(charge-mean(charge))/.4 # Transforming will reduce the correlation x2=(temperature-mean(temperature))/10 x1.2=x1^2 # Check cor(x1,x1.2) x2.2=x2^2 # Check cor(x2,x2.2) x1x2=x1*x2 summary(lm(y~x1+x1.2+x2+x2.2+x1x2)) # Note that the interaction effect is not significant. summary(lm(y~x1+x1.2+x2+x2.2)) # Note that x2^2 is not significant. summary(lm(y~x1+x1.2+x2)) # Note that x1^2 is not significant. 
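# (Sketch) One compact way to track how R^2 drops as terms are removed from the
# Power Cells model; 'forms' is just a throwaway list of the formulas fitted here.
forms=list(y~x1+x1.2+x2+x2.2+x1x2, y~x1+x1.2+x2+x2.2, y~x1+x1.2+x2, y~x1+x2)
sapply(forms,function(f) summary(lm(f))$r.squared)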
summary(lm(y~x1+x2)) # Note how much R^2 changed as you remove variables from the model. Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 172.000 9.354 18.387 7.88e-08 *** x1 -55.833 12.666 -4.408 0.002262 ** x2 75.500 12.666 5.961 0.000338 *** # Changing back to the original data: b0.orig=172+55.833*mean(charge)/.4-75.5*mean(temperature)/10 # b0.orig=160.5825 b1.orig=-55.833/.4 # b1.orig=-139.5825 b2.orig=75.5/10 # b0.orig=7.55 y.hat=160.58-139.58*charge+7.55*temperature # Or you can test Ho:beta_11=beta_22=beta_12=0 red=lm(y~x1+x2) full=lm(y~x1+x1.2+x2+x2.2+x1x2) anova(red,full) Model 1: y ~ x1 + x2 Model 2: y ~ x1 + x1.2 + x2 + x2.2 + x1x2 Res.Df RSS Df Sum of Sq F Pr(>F) 1 8 7700.3 2 5 5240.4 3 2459.9 0.7823 0.5527 # Since the p-value=0.5527, we can conclude Ho, that no curvature and interaction effects are needed. ############################################## ### Regression with Qualitative Predictors ### ############################################## data=read.csv("Insurance.csv",header=T) # Example on page 316 attach(data) type=factor(type) # 0=Mutual; 1=Stock results=lm(months~size+type) summary(results)$coef Estimate Std. Error t value Pr(>|t|) (Intercept) 33.8740690 1.813858297 18.675146 9.145269e-13 size -0.1017421 0.008891218 -11.442990 2.074687e-09 type1 8.0554692 1.459105700 5.520826 3.741874e-05 plot(size,months,pch=as.numeric(type)) abline(33.874,-0.102,lwd=2) # For mutual firms abline(33.874+8.055,-0.102,lwd=2,col="darkred") # For stock firms legend(250,35,c("Mutual","Stock"),pch=1:2) # Interactions between quantitative and qualitative predictors results2=lm(months~size+type+size*type) summary(results2)$coef Estimate Std. Error t value Pr(>|t|) (Intercept) 33.8383694765 2.44064985 13.86449166 2.472768e-10 size -0.1015306249 0.01305254 -7.77861250 7.965637e-07 type1 8.1312501223 3.65405169 2.22526959 4.079375e-02 size:type1 -0.0004171412 0.01833121 -0.02275578 9.821265e-01 # Another example data=read.csv("Crop.csv",header=T) data=attach(data) mc=SoilMerCon # Mercury concentration in the soil (x1) cr=Crop # Type of crop (barley, corn, or wheat) (x2) pc=PlantMerCon # Mercury concentration in the plnat (y) plot(mc,pc,pch=as.numeric(cr)) # Will create scatterplots for the 3 different crops legend(1.5, 135, c("Barley","Corn","Wheat"), pch=1:3) tapply(pc[mc==6],cr[mc==6],max) # To determine which values are above mc.fac=factor(mc) # Declaring that mc is a qualitative variable results=lm(pc~mc.fac) summary(results) # another example data.fertilizer=read.csv("Fertilizer.csv",header=T) attach(data.fertilizer) plot(height,yield,pch=as.numeric(treatment)) # Will create scatterplots for the 3 different crops legend(55, 15.5, c("Control","Fast","Slow"), pch=1:3) summary(lm(yield~treatment+height+treatment:height))$coef summary(lm(yield~treatment*height))$coef Estimate Std. Error t value Pr(>|t|) (Intercept) 9.491768741 0.202459041 46.882415 4.084943e-25 treatmentf -3.519620102 0.350334683 -10.046451 4.489809e-10 treatments 3.906558043 0.295789635 13.207217 1.675966e-12 height 0.056614405 0.004265179 13.273628 1.506750e-12 treatmentf:height 0.006814587 0.006746322 1.010119 3.225144e-01 treatments:height -0.006886936 0.006084213 -1.131935 2.688450e-01 summary(lm(yield~treatment+height))$coef Estimate Std. 
Error t value Pr(>|t|) (Intercept) 9.52925636 0.133573491 71.34092 2.346301e-31 treatmentf -3.14415561 0.060373900 -52.07806 7.942625e-28 treatments 3.57163712 0.057032666 62.62441 6.817469e-30 height 0.05580995 0.002734287 20.41115 1.580422e-17 tapply(yield,treatment,mean) tapply(height,treatment,mean) #################################### ### October 28 - Model Selection ### #################################### data=read.csv("SurgicalUnit.csv",header=T) attach(data) y=survival x1=clotting x2=prognostic x3=enzyme x4=liver cor(cbind(y,x1,x2,x3,x4)) pairs(cbind(y,x1,x2,x3,x4)) # Note that some relationships look more exponential pairs(cbind(log(y),x1,x2,x3,x4)) # Note that the relationships are more linear temp=lm(y~x1+x2+x3+x4) plot(temp$fit,temp$res) # Note that the residual plot exhibits noncontancy of variance. log.y=log(y) temp=lm(log.y~x1+x2+x3+x4) plot(temp$fit,temp$res) # Note that the residual plot looks much better. dataset=data.frame(log.y=log(y),x1=x1,x2=x2,x3=x3,x4=x4) ### Criteria for Model Selection # 1. R^2 or SSE Criterion # Consider the regression model log.y~x1+x2+x3 summary(lm(log.y~x1+x2+x3))$r.sq # R^2 = 0.757 anova(lm(log.y~x1+x2+x3))$Sum[4] # SSE = 3.109 # 2. R^2_adj or MSE Criterion (Choose model with the largest R^2_adj value or smallest MSE value) # Consider the regression model log.y~x1+x2+x3 summary(lm(log.y~x1+x2+x3))$adj.r.sq # Adjusted R^2 = 0.743 anova(lm(log.y~x1+x2+x3))$Mean[4] # MSE = 0.0622 # 3. Cp (Mallow's Cp statistic) Criterion: Cp=SSE.p/MSE.full - (n-2p) (Good model should give a small Cp that is approximately equal to p, where p=no. of parameters.) # Consider the regression model log.y~x1+x2+x3 SSE.p=anova(lm(log.y~x1+x2+x3))$Sum[4] MSE.full=anova(lm(log.y~x1+x2+x3+x4))$Mean[5] n=length(log.y); p=4 # p is the no. of parameters (B0, B1, B2, B3) SSE.p/MSE.full - (n-2*p) # Cp = 3.388 #Or alternatively, you can use: extractAIC(lm(log.y~x1+x2+x3),scale=MSE.full)[2] # 4. AIC (Akaike's information criterion): AIC = n*ln(SSE.p/n)+2p (Choose model with the smallest AIC value) # Consider the regression model log.y~x1+x2+x3 SSE.p=anova(lm(log.y~x1+x2+x3))$Sum[4] n=length(log.y); p=4 # p is the no. of parameters (B0, B1, B2, B3) n*log(SSE.p/n)+2*p # AIC = -146.161 #Or alternatively, you can use: extractAIC(lm(log.y~x1+x2+x3))[2] # 5. SBC (Schwarz's Bayesian criterion): SBC = n*ln(SSE.p/n)+ln(n)*p (Choose model with the smallest SBC value) SSE.p=anova(lm(log.y~x1+x2+x3))$Sum[4] n=length(log.y); p=4 # p is the no. of parameters (B0, B1, B2, B3) n*log(SSE.p/n)+log(n)*p # SBC = -138.205 #Or alternatively, you can use: extractAIC(lm(log.y~x1+x2+x3),k=log(n))[2] # 6. PRESS criterion: PRESS.p = sum(y_i - y_i_hat(when ith observation was deleted))^2 (Choose model with the smallest value) # Consider the regression model log.y~x1+x2+x3 n=length(log.y) e2=array(0,n) for(i in 1:n){ w1=x1[-i];w2=x2[-i];w3=x3[-i] # removing the ith value of x. temp=lm(log.y[-i]~w1+w2+w3) new=data.frame(w1=x1[i],w2=x2[i],w3=x3[i]) pred=predict(temp,new) e2[i]=(log.y[i]-pred)^2 } PRESS.p=sum(e2) # PRESS.p = 3.914 #Or alternatively, you can use: fit=lm(log.y~x1+x2+x3) sum((fit$resid/(1-hatvalues(fit)))^2) ## You can get several of these criteria using the R package "leaps". 
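# If the 'leaps' package has not been installed yet (sketch):
# install.packages("leaps")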
library(leaps) log.y=log(y) dataset=data.frame(log.y=log(y),x1=x1,x2=x2,x3=x3,x4=x4) results=regsubsets(log.y~.,data=dataset,nbest=6,nvmax=4) attributes(summary(results)) outmat=summary(results)$outmat SSE=round(summary(results)$rss,3) R2=round(summary(results)$rsq,3) R2.adj=round(summary(results)$adjr2,3) Cp=round(summary(results)$cp,3) BIC=round(summary(results)$bic,3) # BIC is equivalent to SBC data.frame(outmat,SSE,R2,R2.adj,Cp,BIC) # Another example library(faraway) # will load the 'faraway' package library(help=faraway) data(seatpos) seatpos help(seatpos) reg=lm(hipcenter~.,data=seatpos) summary(reg) results2=regsubsets(hipcenter~.,data=seatpos,nbest=2) summary(results2) #################################### ### Stepwise Regression Methods ### #################################### data=read.csv("SurgicalUnit.csv",header=T) attach(data) y=survival x1=clotting x2=prognostic x3=enzyme x4=liver x5=age x6=gender x7=moderate x8=heavy log.y=log(y) dataset=data.frame(log.y,x1,x2,x3,x4,x5,x6,x7,x8) library(leaps) results=regsubsets(log.y~.,data=dataset,nbest=4,nvmax=8) attributes(summary(results)) outmat=summary(results)$outmat SSE=round(summary(results)$rss,3) R2=round(summary(results)$rsq,3) R2.adj=round(summary(results)$adjr2,3) Cp=round(summary(results)$cp,3) BIC=round(summary(results)$bic,3) # BIC is equivalent to SBC data.frame(outmat,SSE,R2,R2.adj,Cp,BIC) # Using SBC or BIC, the best model includes x1,x2,x3, and x8. # Using Cp, the best model includes x1,x2,x3,x5, and x8. # Using R2.adj, the best model includes x1,x2,x3,x5,x6, and x8. ## Forward Selection ## # Add one variable at a time. # Insert the variable that yields the smallest (t-statistic) p-value. # stage 1: x3 enters the model coefficients(summary(lm(log.y~x1))) pval.x1=coefficients(summary(lm(log.y~x1)))[2,4] pval.x2=coefficients(summary(lm(log.y~x2)))[2,4] pval.x3=coefficients(summary(lm(log.y~x3)))[2,4] pval.x4=coefficients(summary(lm(log.y~x4)))[2,4] pval.x5=coefficients(summary(lm(log.y~x5)))[2,4] pval.x6=coefficients(summary(lm(log.y~x6)))[2,4] pval.x7=coefficients(summary(lm(log.y~x7)))[2,4] pval.x8=coefficients(summary(lm(log.y~x8)))[2,4] pvals=c(pval.x1,pval.x2,pval.x3,pval.x4,pval.x5,pval.x6,pval.x7,pval.x8) which(pvals==min(pvals)) # x3 has the smallest p-value (8.38e-08) # stage 2: x2 joins the model with x3 pval.x3x1=coefficients(summary(lm(log.y~x3+x1)))[3,4] pval.x3x2=coefficients(summary(lm(log.y~x3+x2)))[3,4] pval.x3x4=coefficients(summary(lm(log.y~x3+x4)))[3,4] pval.x3x5=coefficients(summary(lm(log.y~x3+x5)))[3,4] pval.x3x6=coefficients(summary(lm(log.y~x3+x6)))[3,4] pval.x3x7=coefficients(summary(lm(log.y~x3+x7)))[3,4] pval.x3x8=coefficients(summary(lm(log.y~x3+x8)))[3,4] pvalsx3=c(pval.x3x1,pval.x3x2,pval.x3x4,pval.x3x5,pval.x3x6,pval.x3x7,pval.x3x8) which(pvalsx3==min(pvalsx3)) # x2 has the smallest p-value (2.23e-07) # stage 3: x8 joins the model with x3 and x2 pval.x3x2x1=coefficients(summary(lm(log.y~x3+x2+x1)))[4,4] pval.x3x2x4=coefficients(summary(lm(log.y~x3+x2+x4)))[4,4] pval.x3x2x5=coefficients(summary(lm(log.y~x3+x2+x5)))[4,4] pval.x3x2x6=coefficients(summary(lm(log.y~x3+x2+x6)))[4,4] pval.x3x2x7=coefficients(summary(lm(log.y~x3+x2+x7)))[4,4] pval.x3x2x8=coefficients(summary(lm(log.y~x3+x2+x8)))[4,4] pvalsx3x2=c(pval.x3x2x1,pval.x3x2x4,pval.x3x2x5,pval.x3x2x6,pval.x3x2x7,pval.x3x2x8) which(pvalsx3x2==min(pvalsx3x2)) # x8 has the smallest p-value (5.5e-06) # stage 4: x1 joins the model with x3, x2, and x8 pval.x3x2x8x1=coefficients(summary(lm(log.y~x3+x2+x8+x1)))[5,4] 
pval.x3x2x8x4=coefficients(summary(lm(log.y~x3+x2+x8+x4)))[5,4] pval.x3x2x8x5=coefficients(summary(lm(log.y~x3+x2+x8+x5)))[5,4] pval.x3x2x8x6=coefficients(summary(lm(log.y~x3+x2+x8+x6)))[5,4] pval.x3x2x8x7=coefficients(summary(lm(log.y~x3+x2+x8+x7)))[5,4] pvalsx3x2x8=c(pval.x3x2x8x1,pval.x3x2x8x4,pval.x3x2x8x5,pval.x3x2x8x6,pval.x3x2x8x7) which(pvalsx3x2x8==min(pvalsx3x2x8)) # x1 has the smallest p-value (0.00033) # stage 5: None of the remaining 4 variables can get into the model (all p-values > 0.05) pval.x3x2x8x1x4=coefficients(summary(lm(log.y~x3+x2+x8+x1+x4)))[6,4] pval.x3x2x8x1x5=coefficients(summary(lm(log.y~x3+x2+x8+x1+x5)))[6,4] pval.x3x2x8x1x6=coefficients(summary(lm(log.y~x3+x2+x8+x1+x6)))[6,4] pval.x3x2x8x1x7=coefficients(summary(lm(log.y~x3+x2+x8+x1+x7)))[6,4] pvalsx3x2x8x1=c(pval.x3x2x8x1x4,pval.x3x2x8x1x5,pval.x3x2x8x1x6,pval.x3x2x8x1x7) pvalsx3x2x8x1 # All p-values are > 0.05 --> stop # Therefore, the best model obtained using the forward selection is # log.y~x1+x2+x3+x8 # Or you can use: reg.null=lm(log.y~1,data=dataset) reg.full=lm(log.y~.,data=dataset) step(reg.null,scope=list(lower=reg.null,upper=reg.full),direction="forward") # Criterion used is AIC, not t-statistic. ## Backward Selection ## # Start with all the variables in the model, then remove one at a time. coefficients(summary(lm(log.y~x1+x2+x3+x4+x5+x6+x7+x8))) # Remove x4. coefficients(summary(lm(log.y~x1+x2+x3+x5+x6+x7+x8))) # Remove x7. coefficients(summary(lm(log.y~x1+x2+x3+x5+x6+x8))) # Remove x5. coefficients(summary(lm(log.y~x1+x2+x3+x6+x8))) # Remove x6. coefficients(summary(lm(log.y~x1+x2+x3+x8))) # All p-values are now < alpha # Therefore, the best model obtained using the forward selection is # log.y~x1+x2+x3+x8 # Or you can use: reg.full=lm(log.y~.,data=dataset) step(reg.full,direction="backward") # Criterion used is AIC, not t-statistic. ## (Forward) Stepwise Selection ## # This procedure is just like in the 'forward' selection. The only difference is, # every time you include a new variable into the model, you need to evaluate whether # any of the variables already in the model should now be removed (based on their t-statistic # p-value). See page 365 of our textbook for more information. # R command using the AIC criterion reg.null=lm(log.y~1,data=dataset) reg.full=lm(log.y~.,data=dataset) step(reg.null,scope=list(lower=reg.null,upper=reg.full),direction="both") # Criterion used is AIC, not t-statistic. ## Validation temp=sample(1:54,20) data.validation=data[temp,] data.training=data[-temp,] data.valid=read.csv("SurgicalValidation.csv",header=T) attach(data.valid) model.1=lm(log(survival)~clotting+prognostic+enzyme+heavy) MSPR=sum(model.1$residuals^2)/length(survival) # MSPR = 0.07026 coefficients(summary(lm(log.y~x1+x2+x3+x8))) #################################### ### Model Building - Diagnostics ### #################################### x1=c(14,19,12,11) x2=c(25,32,22,15) y=c(301,327,246,187) X=cbind(array(1,4),x1,x2) H=X%*%solve(t(X)%*%X)%*%t(X) # Computing the hat matrix y.hat=H%*%y # Will give the predicted values(see table on page 393) e=(diag(4)-H)%*%y # Will give the residuals h=diag(H) # Extracting the main diagonal from the hat matrix n=length(x1) MSE=sum(e^2)/(n-3) s2.e=MSE*(diag(4)-H) # Will give the variance and co-variance matrix s.e=sqrt(diag(s2.e)) r.stud=e/s.e # Will give the studentize residuals d=e/(1-h) # Will give the studentize deleted residuals # Or you can use these built-in R function: results=lm(y~x1+x2) hatvalues(results) # Will give the main diagonal of the H matrix. 
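# Quick check (sketch): the built-in hat values match diag(H) computed by hand above.
all.equal(as.numeric(hatvalues(results)),h)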
rstandard(results) # Will give the studentized residuals.
rstudent(results)  # Will give the studentized deleted residuals.

# Body fat example
data=read.csv("BodyFat.csv",header=T)
attach(data)
results.fat=lm(fat~triceps+thigh,data=data)
e=round(results.fat$residuals,3)
sse=sum(e^2)
n=length(e)
h=round(hatvalues(results.fat),3)
t=round(e*sqrt((n-3-1)/(sse*(1-h)-e^2)),3)
data.frame(residuals=e,h=h,t=t) # Table 10.3 on page 397
t.crit=qt(1-.10/(2*n),df=n-3-1)
which(abs(t)>t.crit)

# Leverage
which(h>2*mean(h)) # Note that sum(h)=p ==> mean(h)=p/n

# Deletion statistics
dff=dffits(results.fat) # Compare abs(dffits) with 1 for small to medium size data and 2*sqrt(p/n) for large data sets.
dfb=dfbetas(results.fat) # Compare abs(dfbeta) with 1 (if small or medium data) or 2/sqrt(n) (for large data sets)
cooks=cooks.distance(results.fat) # Compare with qf(.10,p,n-p) or qf(.20,p,n-p)
round(data.frame(dffits=dff,cooks=cooks,dfbetas=dfb),3)

# Index plots
plot(dff,type='b',main="DFFITS Values")
summary(influence.measures(results.fat)) # This will give a summary of influential cases.

# Variance Inflation Factor (If VIF > 10, there is a strong collinearity)
library(car) # Install and load R package 'car'
results1=lm(fat~thigh+midarm)
vif(results1)
#   thigh  midarm
# 1.00722 1.00722
# Note that all VIFs are small and very close to 1.
# Therefore, we don't have a collinearity problem.
cor(thigh,midarm) # cor=0.0846675

results2=lm(fat~triceps+thigh)
vif(results2)
#  triceps    thigh
# 6.825239 6.825239
# Note that all VIFs are < 10, but they are considerably larger than 1.
# Therefore, we see some collinearity problem.
cor(thigh,triceps) # cor=0.9238425

# Now, suppose we include all 3 predictors in the model.
results3=lm(fat~triceps+thigh+midarm)
vif(results3)
#  triceps    thigh   midarm
# 708.8429 564.3434 104.6060
# Note how large the VIFs are, indicating strong collinearity.
summary(lm(midarm~triceps+thigh))$r.sq # r.sq=0.9904403
# Note how big R^2 is, indicating the strong linear correlation
# between midarm and the other 2 predictors.

## Surgical Unit Example
data2=read.csv("SurgicalUnit.csv",header=T)
attach(data2)
y=survival
x1=clotting
x2=prognostic
x3=enzyme
x4=liver
x5=age
x6=gender
x7=moderate
x8=heavy
log.y=log(y)
results2=lm(log.y~x1+x2+x3+x8)
vif(results2) # All VIFs are small. Therefore, we don't have a collinearity problem.

##########################################
### Model Building - Remedial Measures ###
##########################################

## Method of Weighted Least Squares
# Minimize Q=sum(wi*(residuals^2)), where wi=1/sigma_i^2

## Using Robust Regression to dampen the effect of influential cases
# 1. Least Absolute Residuals (LAR) or Least Absolute Deviations (LAD) Regression
#    min(sum(abs(residuals)))
data=read.csv("BodyFat.csv",header=T)
attach(data)
results.fat=lm(fat~triceps+thigh,data=data)
library(quantreg)
rq(fat~triceps+thigh,data=data)

# 2. Least Median of Squares (LMS) Regression
#    min(median(residuals^2))

## Nonparametric Regression: Lowess (locally weighted regression scatter plot smoothing) Method
# 1. The linear regression is weighted to give cases further from the middle X level
#    in each neighborhood smaller weights
# 2. To make the procedure robust to outlying observations, the linear regression fitting
#    is repeated, with the weights revised so that cases that had large residuals in the
#    first fitting receive smaller weights in the second fitting
# 3.
To improve the robustness of the procedure further, step 2 is repeated one or more # times by revising the weights according to the size of the residuals in the latest # fitting lowess(triceps,fat) plot(lowess(triceps,fat),type='l') points(triceps,fat) par(mfrow=c(2,2)) plot(lm(fat~triceps)) ########################################### ### Autocorrelation in Time Series Data ### ########################################### u=c(0, 0.5, -0.7, 0.3, 0, -2.3, -1.9, 0.2, -0.3, 0.2, -0.1) x=0:10 # epsilon_0 = 3 eps=array(3,11) for(i in 2:11) { eps[i]=eps[i-1]+u[i] } y=2+0.5*x+eps data.frame(u,eps,y) plot(x,eps) plot(x,y,ylim=c(0,10)) abline(a=2,b=.5,col="darkgreen") abline(lm(y~x)) # epsilon_0 = -.2 u=c(0,rnorm(10)) eps=array(-.2,11) for(i in 2:11) { eps[i]=eps[i-1]+u[i] } y=2+0.5*x+eps data.frame(u,eps,y) plot(x,eps) plot(x,y,ylim=c(0,10)) abline(a=2,b=.5,col="darkgreen") abline(lm(y~x)) ## Blaisdell Company Example data=read.csv("BlaisdellCompany.csv",header=T) attach(data) results=lm(Company~Industry) summary(results)$coef # Estimate Std. Error t value Pr(>|t|) # (Intercept) -1.4547500 0.21414605 -6.793261 2.314419e-06 # Industry 0.1762828 0.00144474 122.016981 1.013155e-27 anova(results)$Mean[2] # MSE = 0.007405683 # Calculating the Durbin-Watson Statistics e=results$residuals D=sum(diff(e)^2)/sum(e^2) # D=0.7347256 (Compare with critical values from table B.7) library(car) durbinWatsonTest(results) # lag Autocorrelation D-W Statistic p-value # 1 0.6260046 0.7347256 0 # Alternative hypothesis: rho != 0 durbinWatsonTest(results,alternative="positive") # lag Autocorrelation D-W Statistic p-value # 1 0.6260046 0.7347256 0 # Alternative hypothesis: rho > 0 # Or you can use 'dwtest' from 'lmtest' package dwtest(results, alternative="greater") # Durbin-Watson test # data: results # DW = 0.7347, p-value = 0.0001748 # alternative hypothesis: true autocorrelation is greater than 0 ### Remedial Measures for Autocorrelation ## Cochrane-Orcutt Procedure n=length(e) t=2:n as.table(cbind( '(1)' = e, '(2)' = c(NA, e[t-1]), '(3)' = c(NA, e[t-1] * e[t]), '(4)' = c(NA, e[t-1]^2))) # (1) (2) (3) (4) # 1 -0.0260518571 # 2 -0.0620154481 -0.0260518571 0.0016156176 0.0006786993 # 3 0.0220209610 -0.0620154481 -0.0013656398 0.0038459158 # 4 0.1637542388 0.0220209610 0.0036060257 0.0004849227 # 5 0.0465704946 0.1637542388 0.0076261159 0.0268154507 # 6 0.0463765906 0.0465704946 0.0021597808 0.0021688110 # 7 0.0436170636 0.0463765906 0.0020228107 0.0021507882 # 8 -0.0584354347 0.0436170636 -0.0025487821 0.0019024482 # 9 -0.0943990257 -0.0584354347 0.0055162481 0.0034147000 # 10 -0.1491424634 -0.0943990257 0.0140789032 0.0089111760 # 11 -0.1479908977 -0.1491424634 0.0220717270 0.0222434744 # 12 -0.0530535560 -0.1479908977 0.0078514434 0.0219013058 # 13 -0.0229282395 -0.0530535560 0.0012164246 0.0028146798 # 14 0.1058516073 -0.0229282395 -0.0024269910 0.0005257042 # 15 0.0854637991 0.1058516073 0.0090464805 0.0112045628 # 16 0.1061022402 0.0854637991 0.0090679005 0.0073040610 # 17 0.0291124001 0.1061022402 0.0030888909 0.0112576854 # 18 0.0423164641 0.0291124001 0.0012319338 0.0008475318 # 19 -0.0441602515 0.0423164641 -0.0018687057 0.0017906831 # 20 -0.0330086858 -0.0441602515 0.0014576719 0.0019501278 e.y=e[t]; e.x=e[t-1] temp=lm(e.y~0+e.x) coef(temp) # r=0.6311636 # Or r=sum(e.y*e.x)/sum(e.x^2) # r=0.6311636 y=Company; x=Industry y.p=y[t]-r*y[t-1] x.p=x[t]-r*x[t-1] results.p=lm(y.p~x.p) summary(results.p)$coef Estimate Std. 
Error t value Pr(>|t|) (Intercept) -0.3941105 0.167229910 -2.356699 3.069370e-02 x.p 0.1737583 0.002956708 58.767478 4.432471e-21 durbinWatsonTest(results.p,alternative="positive") # lag Autocorrelation D-W Statistic p-value # 1 0.1473569 1.650248 0.154 # Alternative hypothesis: rho > 0 dwtest(results.p,alternative="greater") # Durbin-Watson test # data: results.p # DW = 1.6502, p-value = 0.1517 # alternative hypothesis: true autocorrelation is greater than 0 # Note: You can repeat this process if needed up to 3 iterations. b0=(-.3941105)/(1-.6311636) # -1.068524 b1=0.1737583 # Company = -1.0685 + 0.17376*Industry se.b0=0.16723/(1-.6311636) # 0.4533989 se.b1=0.002956708 anova(results.p)$Mean[2] # MSE = 0.0045(estimate of variance of disturbance, u_t) # Note: D is approximately equal to 2(1-r) ## Hildreth-Lu Procedure hildreth.lu=function(rho, y, x) { t=2:length(y) y.p=y[t]-rho*y[t-1] x.p=x[t]-rho*x[t-1] results.p=lm(y.p~x.p) sse=anova(results.p)$Sum[2] return(sse) } hildreth.lu(.10,Company,Industry) rho=seq(0,1,by=.01) m=length(rho) SSE=array(0,m) for(i in 1:m){ SSE[i]=hildreth.lu(rho[i],Company,Industry)} data.frame(rho,SSE) plot(rho,SSE,type='l') rho[which(SSE==min(SSE))] # 0.96 y.lu=y[t]-0.96*y[t-1] x.lu=x[t]-0.96*x[t-1] results.lu=lm(y.lu~x.lu) summary(results.lu)$coef Estimate Std. Error t value Pr(>|t|) (Intercept) 0.0711732 0.057974923 1.227655 2.363049e-01 x.p 0.1604536 0.006840127 23.457688 2.177123e-14 anova(results.lu)$Mean[2] # 0.004215952 dwtest(results.lu,alternative="greater") # Durbin-Watson test # data: results.lu # DW = 1.7254, p-value = 0.2816 # alternative hypothesis: true autocorrelation is greater than 0 # Note: This procedure does not require any iterations once # an estimate of rho is obtained. b0=0.0711732/(1-.96) # 1.77933 b1=0.1604536 # Company = 1.77933 + 0.1604536 se.b0 = 0.057974923/(1-.96) # 1.449373 se.b1 = 0.00684 ## First Differences Procedure (Use rho = 1) y=Company; x=Industry n=length(y) t=2:n y.diff=y[t]-y[t-1] # Or y.dff=diff(y) x.diff=x[t]-x[t-1] results.diff=lm(y.diff~0+x.diff) summary(results.diff)$coef # Estimate Std. Error t value Pr(>|t|) # x.diff 0.1684878 0.005096004 33.06272 1.435445e-17 anova(results.diff)$Mean[2] # 0.00481522 dwtest(results.diff,alternative="two.sided") # Durbin-Watson test # data: results.diff # DW = 1.7389, p-value = 0.6558 # alternative hypothesis: true autocorrelation is not 0 b0=mean(y)-0.1684878*mean(x) # 1.128648 b1=0.1684878 # Company = -0.3040115 + 0.1684878*Industry # Forecasting sqrt(0.0045*(1+1/20+(66.929-mean(x.p))^2/sum((x.p-mean(x.p))^2))) ######################################## ### (11/30/16) Nonlinear Regression ### ######################################## # 1. 
Exponential Regression Models # Example of Type I curve(5*exp(x/2),0,10) x=sample(1:10,100,replace=T) y=5*exp(x/2)+rnorm(100,mean=0,sd=20) plot(x,y) nls(y~b*exp(a*x),start=list(a=1,b=3)) curve(4.8003*exp(x*0.5049),0,10,add=T,col="darkred",lwd=2) data=read.csv("InjuredPatients.csv",header=T) attach(data) Y=index; X=days results=nls(Y~b*exp(a*X),start=list(a=0,b=50)) plot(X,Y,xlab="Days Hospitalized",ylab="Prognostic Index") curve(58.60653*exp(-0.03959*x),0,70,add=T,col="darkred",lwd=2) Y.fitted=58.60653*exp(-0.03959*X) residuals=Y-Y.fitted plot(Y.fitted,residuals,main="Residual Plot") # Or you can also use the 'glm' function log.lin <- glm(Y~X, family=quasi(link="log", variance="constant")) coef(log.lin) # (Intercept) X # 4.07084672 -0.03958645 # Note: exp(4.07084672)=58.60656 # Example of Type II curve(100-50*exp(-2*x),0,2) x2=runif(100,0,2) y2=100-50*exp(-2*x2)+rnorm(100,mean=0,sd=2) plot(x2,y2) results2=nls(y2~a+b*exp(c*x2),start=list(a=1,b=-1,c=-1)) round(coef(results2),2) # a b c # 100.32 -50.05 -1.98 curve(100.32-50.05*exp(x*(-1.98)),0,2,add=T,col="darkred",lwd=2) # 2. Logistic Regression Models curve(10/(1+20*exp(-2*x)),0,4,lwd=2) x=sample(0:5,100,replace=T) y=10/(1+20*exp(-2*x))+rnorm(100,mean=0,sd=.5) plot(x,y) nls(y~c/(1+b*exp(a*x)),start=list(a=-2,b=20,c=10)) curve(10.131/(1+16.623*exp(-1.855*x)),0,5,col="darkred",lwd=2,add=T) # Problem 13.10: Enzyme Kinetics data.kinetics=read.csv("data_kinetics.csv",header=T) attach(data.kinetics) y=velocity; x=concentration y.p=1/y x.p=1/x results=lm(y.p~x.p) #bo=0.03376 and b1=0.45401 nls(y~a*x/(b+x),start=list(a=1/0.03376,b=0.454/0.03376)) plot(concentration,velocity) curve(28.14*x/(12.57+x),0,40,add=T,lwd=2) y.fitted=28.14*x/(12.57+x) residuals=y-y.fitted plot(y.fitted,residuals) ############################################ ### (11/25/12) Generalized Linear Models ### ############################################ # 1. 
#    Probit Mean Response Function
curve(pnorm(0+1*x),-3,3,lwd=2)
curve(pnorm(0+2*x),-3,3,lwd=2,col="red",add=T)
curve(dnorm(x,0,1),-3,3)
curve(exp(x)/(1+exp(x))^2,-3,3,add=T,col="red")
curve(dnorm(x,0,sd=pi/sqrt(3)),-3,3,add=T,col="blue")

# Simulation example:
n=1000
x=sample(0:10,n,replace=T)
line=0.8*x-4
pies=pnorm(line)            # True success probabilities
plot(x,pies)
success=(runif(n)<pies)*1   # Simulated 0/1 responses
# (Assumed step, so the classification lines below are self-contained:
#  fit a probit GLM and call the fitted object 'results'.)
results=glm(success~x,family=binomial(link="probit"))
prediction=as.numeric(results$fitted>.7) # Using 0.7 as cut off
error=success-prediction
data.frame(success,predicted=results$fitted,prediction,error)
percent.error=sum(abs(error))/length(success)
percent.error
false.pos=sum((success==0)*(prediction==1))
false.neg=sum((success==1)*(prediction==0))

# Finding the best cut off value
cutoff=seq(0,1,by=.05)
m=length(cutoff)
n=length(success)
False.pos=array(99,m)
False.neg=array(99,m)
Errors=array(99,m)
for(i in 1:m) {
  prediction=as.numeric(results$fitted>cutoff[i])
  error=success-prediction
  False.pos[i]=sum((success==0)*(prediction==1))
  False.neg[i]=sum((success==1)*(prediction==0))
  Errors[i]=False.pos[i]+False.neg[i] }
Percent.error=Errors/n
data.frame(CutOff=cutoff,FalsePositive=False.pos,FalseNegative=False.neg,ErrorRate=Percent.error)

# Hosmer-Lemeshow Goodness of Fit Test
library(ResourceSelection)
hoslem.test(success,results$fitted)

###############################################################
### (11/27/12) Multiple logistic regression and Diagnostics ###
###############################################################

# Multiple logistic regression
dystrophy=read.csv("Dystrophy.csv",header=T)
attach(dystrophy)
dystrophy[!complete.cases(dystrophy),] # lists rows with missing values
dys.naomit=na.omit(dystrophy)          # creates new data set without missing data
temp=which(is.na(PK)=="TRUE")
results.dys <- glm(carrier ~ AGE + M + CK + H + PK + LD, data = dystrophy, family = binomial(link="logit"))
results.dys <- glm(carrier ~ AGE + M + CK + H + PK + LD, data = dys.naomit, family = binomial(link="logit"))
carrier.naomit=dys.naomit[,10]
hoslem.test(carrier.naomit,results.dys$fit)
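# (Sketch) Interpreting the fitted logistic model on the odds scale, assuming
# 'carrier' is coded 0/1: exp(b_k) is the multiplicative change in the odds of
# being a carrier for a one-unit increase in predictor k.
round(exp(coef(results.dys)),4)
round(exp(confint.default(results.dys)),4)  # Wald-type C.I.'s on the odds-ratio scale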