## ## section7.R - interaction term goodness (msot code by mb) ## oct 29, 2009 ## discrete case ## note that the continous case is the same thing, but ## you use continous var instead of a discrete var. ## the end result there is that you fit a lot of different models ## as opposed to just two. ## note: i'm not going to go over this in class, really. ## this is stuff you've had practice on already. ############################################################# ## generate some data (ignore this) beers <- rpois(n=100,3) water <- rbinom(n=100, 1,.4) hangovers <- 4 + .5*beers - .4*water + (-.55)*beers*water + rnorm(100,0,.5) ## hangover length ## Run some OLS regressions alcohol.model <- lm(hangovers ~ beers) summary(alcohol.model) water.model <- lm(hangovers ~ beers + water) summary(water.model) ## Let's say that we have some reason to think that there ## is an interaction between beers and water. We can specify ## the interaction model in R in two ways. ## First is the colon method ":". You can add an interaction ## term by including var1:var2 in the regression model. ## Note that you have to manually add the lower-order terms ## (var1 and var2) seperately. int.model <- lm(hangovers ~ beers + water + beers:water) summary(int.model) ## Second is the asterick method. You can add an interaction ## term by including var1*var2 in the lm function call. This ## will add the interaction term and all of the lower order terms ## as well. int.model2 <- lm(hangovers ~ beers*water) summary(int.model2) ## Now we'll do some plotting with these regressions to ## demonstrate the interaction between the two variables plot(x=beers, y=hangovers) ## ok, but this needs some labels plot(x=beers, y=hangovers, xlab="beers consumed", ylab="length of hangovers", main="self-study of alcohol consumption") ## hm, well, we still don't know which points are water-drinking nights ## and which are not water drinking nights. so let's separate them by ## color: plot(x=beers[water==0], y=hangovers[water==0], xlab="beers consumed", ylab="length of hangovers", main="self-study of alcohol consumption", col="orange") points(x=beers[water==1], y=hangovers[water==1], col="olivedrab") ## ok, awesome, but that's a little hard to see, especially if ## someone prints this in b&w. let's add different plotting characters ## with the "pch" option: plot(x=beers[water==0], y=hangovers[water==0], xlab="beers consumed", ylab="length of hangovers", main="self-study of alcohol consumption", col="orange", pch=19) points(x=beers[water==1], y=hangovers[water==1], col="olivedrab", pch=17) ## hm, i think the bottom of the graph is being cut off because ## we're plotting the no-water points first. we need to specify ## the range of y-axis. in order to do this smartly, we can ## use the range() function: range(hangovers) ## this gives us the vector c(min(hangovers), max(hangovers)). If we ## put this in the "ylim" option in the plot() commmand, the window ## will contain the full range of hangovers: plot(x=beers[water==0], y=hangovers[water==0], xlab="beers consumed", ylab="length of hangovers", main="self-study of alcohol consumption", col="orange", pch=19, ylim=range(hangovers)) points(x=beers[water==1], y=hangovers[water==1], col="olivedrab", pch=17) ## now that we have nicely plotted all the points, we need to add ## the regression lines. remember our model: summary(int.model) ## we're going to have to compute the slopes using the coefficients ## from the model, but we should do this in a smart way, without ## using hard-coded numbers. for this, we'll use the coef() function, ## which we can run on the output from an lm() call: coef(int.model) coef(int.model)["(Intercept)"] coef(int.model)["beers:water"] ## thus, if water == 0, we can easily draw the line: abline(a=coef(int.model)["(Intercept)"], b=coef(int.model)["beers"]) ## if water==1, then the slope is slightly harder, so i'll break ## it up into 2 steps: h2o.slope <- coef(int.model)["beers"] + coef(int.model)["beers:water"] h2o.int <- coef(int.model)["(Intercept)"] + coef(int.model)["water"] ## now we'll plot that line: abline(a=h2o.int, b=h2o.slope) ## whew, that was hard, but now we have a problem with the ## color and style of the lines. someone reading in b&w ## won't be able to tell the difference between them. so, ## we'll use the "col" and "lyt" (line type) options in ## abline(): plot(x=beers[water==0], y=hangovers[water==0], xlab="beers consumed", ylab="length of hangovers", main="self-study of alcohol consumption", col="orange", pch=19, ylim=range(hangovers)) points(x=beers[water==1], y=hangovers[water==1], col="olivedrab", pch=17) abline(a=coef(int.model)["(Intercept)"], b=coef(int.model)["beers"], col="orange", lty=1) abline(a=h2o.int, b=h2o.slope, col="olivedrab", lty=2) ## you can also make a nice legend: legend(x="topleft", legend=c("water before bed", "no water"), col=c("olivedrab","orange"), pch=c(17,19), lty=c(2,1), bty="n") ## maybe you're thinking that two plotting commands for the ## points is too much. we can do it in one step. first, let's ## review how R handles the plotting options. note that if pch ## and col are vectors, R matches the first element of pch to the ## first plotting point and so on. plot(x=c(1,2,3,4), y=c(1,2,3,4), pch=c(15,16,17,18,19), col=c("red","blue"), cex=3) ## thus, we could use the "ifelse" command to create a vector that ## has the same size as the data and contains the plotting options ## for that observation. ifelse(water==1,"olivedrab","orange") head(cbind(water,ifelse(water==1,"olivedrab","orange"))) head(cbind(beers,water,ifelse(water==1,"olivedrab","orange"))) ## using these commands in the "col" and "pch" options will save us lines ## of code: plot(x=beers, y=hangovers, xlab="beers consumed", ylab="length of hangovers", main="self-study of alcohol consumption", col=ifelse(water==1,"olivedrab","orange"), pch=ifelse(water==1,17,19))