########################################### hi.
## Section 5 notes
## October 15, 2009
## gov2k
## prepared by msen :)
##
## On the agenda:
## Simple linear regression using lm(),
## sampling distributions for OLS estimates (marginal and joint),
## confidence and predictive bands,
## detecting outliers and violations of OLS assumptions
###########################################

########################################### Example 1

## We'll use some data in the car library.
## If you don't have the car library yet, install the package with
## install.packages("car")

## Once the car package is installed, load the car library with
library(car)

## We'll play around with the Burt dataset within the car library.
## This data frame contains the following columns:
## IQbio     IQ of twin raised by biological parents
## IQfoster  IQ of twin raised by foster parents
## class     A factor with levels (note: out of order): high; low; medium.
##           (Let's ignore this last one for now.)

head(Burt) ## to get a feel for the data

## In the simple OLS case (i.e., one independent variable) we can plot
## the dependent variable against the independent variable to gather some
## insight-by-inspection about how the variables relate.
plot(Burt$IQbio, Burt$IQfoster,
     xlab = "IQ, Raised by Biological Family",
     ylab = "IQ, Raised by Foster Family", pch = 19)

## You know the analytic formulas for the values lm() returns,
## but lm() can save time (and lets you double-check your results
## when we won't let you use lm() for your final answer on a problem set).
## The '~' separates your dependent variable from the independent variables.
## You can store the output of lm() to access other useful functions
## and methods.
my.lm <- lm(Burt$IQfoster ~ Burt$IQbio)
my.lm
summary(my.lm)

## pause

## let's examine the following on the board
fost <- c(97, 85, 115)
bio <- c(95, 83, 108)
plot(bio, fost)
lm(fost ~ bio) ## this is what we calculate by hand

## returning to our example from above:
## To see what we can extract from our lm object 'my.lm', we use
names(my.lm)
coefficients(my.lm)
## so the equation for the line is Burt$IQfoster = 13.21 + 0.86*Burt$IQbio
coefficients(my.lm)[1]
coefficients(my.lm)["(Intercept)"]

## let's take a look at the fitted values
names(my.lm)
my.lm$fitted.values

## let's take a look at the residuals,
## which R calculates as the actual values minus the fitted values.
my.lm$residuals

## you can also sum the squares of the residuals
sum(my.lm$residuals^2)

## and extract the R-squared value:
summary(my.lm)
## note -- this is a very high R^2 value, which led people to
## attack Burt and say he was fabricating his data!

## let's plot our regression line onto the plot
plot(Burt$IQbio, Burt$IQfoster,
     xlab = "IQ, Raised by Biological Family",
     ylab = "IQ, Raised by Foster Family")
abline(my.lm, col = "darkred")

## let's add the residual lines onto the plot
plot(Burt$IQbio, Burt$IQfoster,
     xlab = "IQ, Raised by Biological Family",
     ylab = "IQ, Raised by Foster Family", pch = 19)
abline(my.lm, col = "darkred")
segments(x0 = Burt$IQbio, x1 = Burt$IQbio,
         y0 = Burt$IQfoster, y1 = my.lm$fitted, col = "red")
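
## A quick sketch of the "by hand" calculation mentioned above, using the
## standard analytic OLS formulas (slope = sum of cross-deviations over sum of
## squared deviations; intercept = ybar - slope*xbar) on the small bio/fost
## example. The object names b1.hat and b0.hat are just illustrative; this is
## only a check against lm(), not the required way to do it on a problem set.
b1.hat <- sum((bio - mean(bio)) * (fost - mean(fost))) / sum((bio - mean(bio))^2)
b0.hat <- mean(fost) - b1.hat * mean(bio)
b0.hat
b1.hat
coefficients(lm(fost ~ bio)) ## should match the two values above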

########################################### Sampling

## This came up in lecture and we will go over this on the pset.
## Let's generate some data and pretend that it's the true population.
## We can do this with the rnorm function.
x <- rnorm(5000, mean = 7, sd = 1.56) ## just some normally distributed data
y <- 12 - .4*x + rnorm(5000, mean = 0, sd = 1)
## we're establishing here a linear relationship,
## so that y = 12 - .4x + some residual value

## to check out the histograms separately
par(mfrow = c(1,2))
hist(x, col = "cyan")
hist(y, col = "pink")
## you know they are normal because??

## plotting them together:
par(mfrow = c(1,1))
plot(x, y) ## you see they are roughly linear
fake.lm <- lm(y ~ x)
fake.lm
plot(x, y)
abline(fake.lm, col = "coral")
## and if you really want, you can add the residuals
## segments(x0=x, x1=x, y0=y, y1=fake.lm$fitted, col="cadetblue")

## let's sample from this the way we did in the
## univariate statistics part of the course:
data <- data.frame(x, y)
size <- 500
my.samp <- data[sample(nrow(data), size), ]
lm(my.samp$y ~ my.samp$x)

## so we can set up a for loop to do this thousands of times,
## and store the info
sims <- 1000
size <- 500
holder <- matrix(data = NA, ncol = 2, nrow = sims)
colnames(holder) <- c("intercept", "slope")
for(i in 1:sims){
  my.samp <- data[sample(nrow(data), size), ]
  samp.lm <- lm(my.samp$y ~ my.samp$x)
  holder[i,1] <- samp.lm$coefficients[1]
  holder[i,2] <- samp.lm$coefficients[2]
}
## this could take a while depending
## on the number of simulations you're performing!

## and now we can plot the marginal distributions
## of the slope and intercept:
par(mfrow = c(1,2))
plot(density(holder[,1]), main = "intercept")
abline(v = mean(holder[,1]))
plot(density(holder[,2]), main = "slope")
abline(v = mean(holder[,2]))

## two questions:
## 1. why do they look normal? (and why does that make sense?)
## 2. what's the mean of these two distributions? (and why does that make sense?)

## let's look at the joint sampling distribution
par(mfrow = c(1,1))
plot(holder[,1], holder[,2], xlab = "intercept", ylab = "slope",
     main = "Joint Sampling Distribution of Slope and Intercept")

## and also
plot(x, y, main = "Sampling Distribution of Regression Lines")
segments(x0 = 0, x1 = 100, y0 = holder[,1],
         y1 = (holder[,1] + 100*holder[,2]), col = "red")

## another way of doing the same plot
plot(x, y, main = "Sampling Distribution of Regression Lines")
for (i in 1:nrow(holder)) {
  abline(a = holder[i,1], b = holder[i,2], col = "purple")
}

## you can also use the predict (predict.lm) function to draw the
## confidence bands and predictive bands that Adam did in lecture:
ruler <- data.frame(x = c(1:12))
predict(lm(y ~ x))
predict(lm(y ~ x), ruler, se.fit = TRUE) ## setting things up
pred.pred <- predict(lm(y ~ x), ruler, interval = "prediction")
pred.conf <- predict(lm(y ~ x), ruler, interval = "confidence") ## calculating the values

## and now to create plots like the ones Adam has:
## for the predictive bands
matplot(ruler, pred.pred, type = "l")
## matplot plots the columns of one matrix against the columns of another
points(x, y) ## if you want to overlay the observations

## for the confidence bands
matplot(ruler, pred.conf, type = "l")
points(x, y)
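
## Before moving on, a small sketch that follows up on the two questions above:
## compare the means and spreads of the simulated sampling distributions to the
## true values used to generate the population (intercept 12, slope -0.4), and
## to the standard errors lm() reports for a single sample of the same size.
## The object one.samp is just an illustrative name; because we sample 500 of
## 5000 without replacement, expect the spreads to be only roughly comparable.
mean(holder[,1]); sd(holder[,1]) ## mean should be close to the true intercept, 12
mean(holder[,2]); sd(holder[,2]) ## mean should be close to the true slope, -0.4
one.samp <- data[sample(nrow(data), size), ]
summary(lm(one.samp$y ~ one.samp$x))$coefficients[, "Std. Error"]
## these reported standard errors should be in the same ballpark as the
## sd's of the simulated intercepts and slopes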

########################################### Diagnostics

## You can also perform some of the diagnostics we discussed in lecture
## very simply.
## Let's do this with the Leinhardt data, also from the car library.
## This data frame contains the following columns:
## income:  Per-capita income in U.S. dollars.
## infant:  Infant-mortality rate per 1000 live births.
## region:  A factor with levels: Africa; Americas; Asia, Asia and Oceania; Europe.
## oil:     Oil-exporting country. A factor with levels: no, yes.
## we will disregard the last two variables

summary(Leinhardt)
head(Leinhardt)
Leinhardt <- na.omit(Leinhardt)
income <- Leinhardt$income
infant <- Leinhardt$infant
plot(income, infant) ## linear?

my.lm <- lm(infant ~ income) ## fitting a linear regression model
plot(income, infant)
abline(my.lm, col = "darkred") ## plotting the two together

## let's do some diagnostics

######################## leverage

## The "hat" values are a measure of whether the observation is far from
## the center of the distribution of the explanatory variable (i.e., has
## leverage). Large hat values are an indication that the observation is
## potentially influential.
## To calculate them, we can use the hatvalues() function.
hatvalues(my.lm)

## more useful is to plot these to get a sense of what's going on
plot(income, hatvalues(my.lm))
abline(h = 4/length(income), col = "darkred")
## looks like there might be some problems
identify(income, hatvalues(my.lm), tolerance = 10)

######################## influence points

stud.res <- rstudent(my.lm) ## calculates studentized residuals
plot(hatvalues(my.lm), rstudent(my.lm)) ## plots hat values against studentized residuals
abline(h = 2, col = "darkred")
abline(v = 4/length(income), col = "darkred")

######################## non-linearity

plot(income, infant)
plot(my.lm, 1)
## here, you are looking for the red line to be relatively flat

######################## non-constant error variance

plot(my.lm, 3)
## again, you want the line to be more or less flat across the plot;
## if it's sloped, then we might have non-constant error variance

######################## non-normality

## we can look at a histogram of the residuals
hist(my.lm$residuals) ## ugh

## or, better yet, we can look at a QQ plot of the residuals
## following the lecture notes, we
## 1. extract the residuals from the lm output
## 2. sort them
## 3. normalize them
## 4. plot them in a qqplot against a standard normal
my.lm <- lm(infant ~ income)
residuals <- sort(my.lm$residuals)
residuals <- (residuals - mean(residuals))/sd(residuals)
theoretical <- rnorm(length(residuals))
qqplot(theoretical, residuals) ## ouch.

########################################### Functions that return matrices

## There is nothing special about returning a matrix with a function.
## Here is a function (unrelated to this week's problem set)
## which shows the mechanics.
my.function <- function(x, y){ ## x and y are vectors
  mat <- matrix(data = NA, nrow = 2, ncol = 3)
  x.sum <- sum(x)
  mat[1,1] <- x.sum
  y.sum <- sum(y)
  mat[1,2] <- y.sum
  xy.sum <- sum(x.sum, y.sum)
  mat[1,3] <- xy.sum
  ## You can also fill whole rows (or columns) at once with a vector:
  mat[2,] <- c(mean(x), mean(y), mean(c(x,y)))
  colnames(mat) <- c("x calculations", "y calculations", "x and y calculations")
  rownames(mat) <- c("sum", "mean")
  return(mat)
}
my.function(x = c(1:10), y = c(23:40))
fun.out <- my.function(x = c(1:10), y = c(23:40))
fun.out[1,2]
fun.out["sum", "y calculations"]
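
## An alternative sketch of the same mechanics, if you prefer building the
## matrix in one shot with rbind() instead of filling cells one at a time.
## my.function2 is just an illustrative name; it should return the same
## matrix as my.function above.
my.function2 <- function(x, y){ ## x and y are vectors
  mat <- rbind(sum  = c(sum(x),  sum(y),  sum(x, y)),
               mean = c(mean(x), mean(y), mean(c(x, y))))
  colnames(mat) <- c("x calculations", "y calculations", "x and y calculations")
  return(mat)
}
my.function2(x = c(1:10), y = c(23:40))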