## Section 3 R Notes
## Gov2000 Section Notes
## September 24, 2009
## Maya Sen
## Confidence Intervals and Hypothesis Testing
## Thanks to Matt Blackwell, last year's TF, for data and code

#####################################
## converting to a standard normal ##
#####################################

## a key concept at work in everything we talked about this week is
## that you can transform any normal back to a standard normal,
## which has a mean of 0 and a standard deviation of 1
## (see explanation on the board).
## you do this by subtracting the mean and dividing by the standard deviation:

samp <- rnorm(1000, mean = 69.1, sd = 2.9) ## men's heights in the US
hist(samp)

transformed.samp <- (samp - 69.1)/2.9 ## subtracting mean, dividing by sd
hist(transformed.samp)

## putting those side by side (one row, two columns)
par(mfrow = c(1,2))
hist(samp)
hist(transformed.samp)

#################################
## Let's play around with data ##
#################################

load("fulton.RData")

## Whenever you load up a dataset, be sure to play around
## with it for a bit. You should get to know how big it is,
## what the columns are, and so on.

## get the column names of the data:
names(precincts)

## get the first six rows of the data:
head(precincts)

## get the dimensions of the data:
dim(precincts)

## This is election data from Georgia, aggregated to the precinct
## level (the data frame is called "precincts"). It measures the
## turnout rate, percent Black, percent female, mean age, turnout in
## the Democratic primary, turnout in the Republican primary, whether
## the precinct is in Atlanta, and location dummies for the polling
## stations.

## Our goal in section today will be to estimate the percent
## Black in Fulton County.

## this is what that variable looks like:
plot(density(precincts$black))

## anything weird about this??

###########################
## Calculating True Mean ##
###########################

## To do this, we'll take the mean of
## percent Black across the precincts.

## Note: a really quick and easy way to calculate proportions
## is to use the mean function:

true.mean <- mean(precincts$black)
true.mean ## this is the TRUE population mean

## we can also use the mean function to calculate other
## statistics, even those involving binary variables

mean(precincts$school) ## proportion of precincts where people vote at a school
mean(precincts$church) ## at a church
mean(precincts$firest) ## at a fire station

##############
## Sampling ##
##############

## remember that we can use indexing to get certain rows
## of the data. If I wanted to get just rows 10, 3, and 200,
## in that order, I would say...

precincts[c(10,3,200),]

## What if we want to take a random sample of the dataset?
## We'll use the "sample" command from previous weeks to give
## us a random sample of the row numbers.

## First, ALWAYS check if you have to set the seed!
set.seed(12345) ## setting seed to 12345
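## A quick aside (my addition, not in the original notes): setting the
## seed makes the "random" draws reproducible. If we set the same seed
## twice, sample() returns the same values both times. The object names
## first.try and second.try are just for illustration.

set.seed(12345)
first.try <- sample(10, size = 3)   ## three draws from 1:10
set.seed(12345)
second.try <- sample(10, size = 3)  ## same seed, same draws
identical(first.try, second.try)    ## TRUE

set.seed(12345) ## re-set the seed so the sampling below is unaffected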
## Now...
n.precincts <- nrow(precincts) ## number of rows
N <- 40 ## sample size
rand.rows <- sample(n.precincts, size = N, replace = FALSE)
rand.rows

## Now that we have the random rows, we can grab those rows
## from the actual data and put them into a new matrix:

mypoll <- precincts[rand.rows,]

## or we can put everything on one line
mypoll <- precincts[sample(nrow(precincts), size = N, replace = FALSE),]

## and we can treat this as a nice little subset of the data
summary(mypoll)
head(mypoll)
dim(mypoll)

####################
## Conf Intervals ##
####################

## recall the formula for the 95% CI
## let's calculate the mean, standard deviation, and 95% CI for this sample

mu.b <- mean(mypoll$black)
sd.b <- sd(mypoll$black)
mu.b
sd.b

## we'll just use the formulas for confidence intervals
mu.b + 1.96*sd.b/sqrt(N)
mu.b - 1.96*sd.b/sqrt(N)

## where does the 1.96 come from?
## (draw on board.)

par(mfrow = c(1,1))
ruler <- seq(-4,4,.01)
plot(ruler, dnorm(ruler), type = "l", xlab = "standard normal")

(1-.95)/2 ## because this is two-tailed
qnorm(1 - .025)
qnorm(0 + .025)

ruler <- seq(-4,4,.01)
plot(ruler, dnorm(ruler), type = "l", xlab = "standard normal")
abline(v = 1.96)
abline(v = -1.96)

## KEY QUESTIONS:
## 1. WHAT HAPPENS TO THE CONFIDENCE INTERVAL WHEN N GOES UP?
##    (RELATED: WHAT HAPPENS WHEN N < 32?)
## 2. WHAT HAPPENS TO THE CONFIDENCE INTERVAL WHEN THE STANDARD
##    DEVIATION GOES UP?
## 3. WHAT HAPPENS TO THE CONFIDENCE INTERVAL WHEN THE CONFIDENCE
##    LEVEL GOES UP?

## once the quantities are observed, nothing is random!
## explain.

###########################
## Example from the news ##
###########################

## WorldPublicOpinion.org (WPO) conducted a poll of
## 1,003 Iranians across Iran between Aug. 27 and Sept. 10, 2009.
## 642 said they had confidence in the president
## 211 said they did not have confidence in the president
## (150 said they had no opinion; let's ignore these)

## calculate the 95% conf interval of support
## among those with an opinion

iran <- matrix(data = NA, nrow = (642+211), ncol = 1)
iran[1:642] <- 1
iran[643:length(iran)] <- 0

mean(iran)
sd(iran)

mean(iran) + 1.96*(sd(iran)/sqrt(length(iran))) ## upper bound
mean(iran) - 1.96*(sd(iran)/sqrt(length(iran))) ## lower bound
## (1.96*sd(iran)/sqrt(length(iran)) is the margin of error)

#########################################
## Conf Intervals w/ repeated sampling ##
#########################################

## We're going to pretend we have 20 independent researchers out in
## the field polling precincts in Fulton. Each of the 20 will
## poll 40 precincts and then calculate the mean, sd, and 95% CI.

set.seed(12345) ## again, remember seed setting!
N <- 40 ## sample size
pollsters <- 20 ## number of pollsters
poll.means <- matrix(data = NA, ncol = 1, nrow = pollsters)
poll.sds <- matrix(data = NA, ncol = 1, nrow = pollsters) ## holder matrices

for (i in 1:pollsters) {
  poll.surveyed <- precincts[sample(nrow(precincts), size = N, replace = FALSE),]
  poll.means[i] <- mean(poll.surveyed$black)
  poll.sds[i] <- sd(poll.surveyed$black)
}

## Note that we didn't save the CI in each iteration. That's because
## we can create the CIs outside the loop much more quickly

ci.upper <- poll.means + 1.96*poll.sds/sqrt(N)
ci.lower <- poll.means - 1.96*poll.sds/sqrt(N)

## Let's plot the sampling distribution of the means
plot(density(poll.means))
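## An aside (my addition, not part of the original notes): instead of
## hard-coding 1.96, we can pull the critical value from qnorm() and wrap
## the CI formula in a small helper function. The name "normal.ci" is
## just illustrative.

normal.ci <- function(x, level = 0.95) {
  crit <- qnorm(1 - (1 - level)/2)  ## approx. 1.96 when level = .95
  se <- sd(x)/sqrt(length(x))       ## standard error of the mean
  c(lower = mean(x) - crit*se, upper = mean(x) + crit*se)
}

normal.ci(mypoll$black)       ## should match the hand calculation above
normal.ci(mypoll$black, .99)  ## a wider, 99% interval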
## Next, let's plot the CIs themselves...
## this is tricky, but we'll get through it.

## First, we want to get comfortable with the "plot" command

x <- c(1:10) ## just 1-10
y <- rnorm(length(x)) ## draws from a standard normal
x
y

plot(x,y)
plot(x,y, type = "l") ## lines
plot(x,y, type = "b") ## both lines and dots
plot(x,y, type = "s") ## stair steps

## back to the task at hand...
## we need to create a ruler that will denote
## the vertical positions of the CIs

ruler <- 1:pollsters

## Now, let's open up a plotting window and plot the means first.
## I'm using "pch=19" to get solid dots and xlim=c(0,1) because this
## is a proportion, so I know things can't be bigger than that.

plot(x=poll.means, y=ruler, pch=19, col="orange", xlim=c(0,1))

## all "ruler" is at this point is an index indicating
## which pollster reported which poll.mean

## The next plotting command is "segments", which will draw lines
## on the plot. We'll use this to plot the CIs. Note that the
## arguments are segments(x0, y0, x1, y1, ...), where
## (x0,y0) is the beginning point of the line and
## (x1,y1) is the ending point of the line.

## Let's do a test:
segments(x0=.2, y0=3, x1=.8, y1=20, col="orange")
## the starting point here was (.2, 3)
## and the ending point was (.8, 20)

## But that doesn't look too good, so let's start over:
plot(x=poll.means, y=ruler, pch=19, col="orange", xlim=c(0,1))

## each of x0, x1, y0, y1 can be a vector, and segments draws one line
## for each set of values with the same index. So we have:
segments(x0 = ci.lower, x1 = ci.upper, y0 = ruler, y1 = ruler, col = "orange")

## In addition, let's draw a vertical line at the true mean. We'll use
## the abline(v=) command.
abline(v=mean(precincts$black), col="red")

## we might want to see what fraction of the CIs contain the true mean
mean(ci.upper > mean(precincts$black) & ci.lower < mean(precincts$black))

########################
## hypothesis testing ##
########################

## ok, let's say a fancy pollster thinks that there's
## no way that the turnout rate was more than 35%.
## Let's use hypothesis testing to test his claim.

## STEP 1: take a sample
## why? otherwise, you would just know the truth

hypo.samp <- precincts[sample(nrow(precincts), 200),]

## STEP 2: take the sample mean (i.e., the estimator you are
## interested in) and the sample standard deviation

samp.mean <- mean(hypo.samp$turnout)
samp.sd <- sd(hypo.samp$turnout)

## STEP 3: set up a test statistic
## it can be a lot of things, but generally people
## normalize the estimate to the standard normal and then
## check to see how unusual (or not) that test statistic would be.
## note that we divide by the standard error based on this sample's
## size (200 precincts), not the earlier N of 40:

test.stat <- (samp.mean - .35)/(samp.sd/sqrt(nrow(hypo.samp)))

## STEP 4: calculate rejection regions
## so for alpha = 0.05 and 0.01:

qnorm(1 - .025) ## for alpha = 0.05 (why??)
qnorm(0 + .025) ## for alpha = 0.05
qnorm(1 - .005) ## for alpha = 0.01 (why??)
qnorm(0 + .005) ## for alpha = 0.01

## let's plot these against a standard normal to see what they look like

ruler <- seq(-6,6,.01)
plot(ruler, dnorm(ruler), type = "l", xlab = "standard normal")
abline(v = c(qnorm(1 - .025), qnorm(0 + .025)), col = "red")
abline(v = c(qnorm(1 - .005), qnorm(0 + .005)), col = "blue")
abline(v = test.stat)
legend(x = "topleft", legend = c("alpha = .05", "alpha = .01", "test stat"),
       lwd = 2, col = c("red", "blue", "black"))

## cool!
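## A possible follow-up (my addition, not part of the original notes):
## we can convert the test statistic into a two-sided p-value with
## pnorm(), and compare our by-hand z test against R's built-in t.test()
## with the null value set to .35. With 200 observations the
## normal-based and t-based answers should be very close.

2 * (1 - pnorm(abs(test.stat)))      ## two-sided p-value for the z test
t.test(hypo.samp$turnout, mu = .35)  ## built-in t test of the same null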