## Section 3 R Notes
## Gov2000 Section Notes
## September 24, 2009
## Maya Sen
## Confidence Intervals and Hypothesis Testing
## Thanks to Matt Blackwell, last year's TF, for data and code

#####################################
## converting to a standard normal ##
#####################################

## a key concept at work in everything we talked about this week is
## that you can transform any normal back to a standard normal,
## which has a mean of 0 and a standard deviation of 1
## (see explanation on the board).
## you do this by subtracting the mean and dividing by the standard deviation:

samp <- rnorm(1000, mean = 69.1, sd = 2.9) ## men's heights in the US
hist(samp)

transformed.samp <- (samp - 69.1)/2.9 ## subtracting mean, dividing by sd
hist(transformed.samp)

## putting those side by side (one row, two columns)
par(mfrow = c(1,2))
hist(samp)
hist(transformed.samp)

#################################
## Let's play around with data ##
#################################

load("fulton.RData")

## Whenever you load up a dataset, be sure to play around
## with it for a bit. You should get to know how big it is,
## what the columns are, and so on.

## get the column names of the data:
names(precincts)

## get the first six rows of the data:
head(precincts)

## get the dimensions of the data:
dim(precincts)

## This is election data from Georgia, aggregated to the precinct
## level (the data frame is called "precincts"). It measures the
## turnout rate, percent Black, percent female, mean age, turnout in
## the Democratic primary, turnout in the Republican primary, whether
## the precinct is in Atlanta, and location dummies for the polling
## stations.

## Our goal in section today will be to estimate the percent
## Black in Fulton County.

## this is what that variable looks like:
plot(density(precincts$black))

## anything weird about this??

###########################
## Calculating True Mean ##
###########################

## To do this, we'll take the mean of
## percent Black across the precincts.

## Note: a really quick and easy way to calculate proportions
## is to use the mean function:

true.mean <- mean(precincts$black)
true.mean ## this is the TRUE population mean

## we can also use the mean function to calculate other
## statistics, even those involving binary variables

mean(precincts$school) ## proportion of precincts where people vote at a school
mean(precincts$church) ## at a church
mean(precincts$firest) ## at a fire station

##############
## Sampling ##
##############

## remember that we can use indexing to get certain rows
## of the data. If I wanted to get just rows 10, 3, and 200,
## in that order, I would say...

precincts[c(10,3,200),]

## What if we want to take a random sample of the dataset?
## We'll use the "sample" command from previous weeks to give
## us a random sample of the row numbers.

## First, ALWAYS check if you have to set the seed!
set.seed(12345) ## setting seed to 12345
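## A quick aside (my addition, not in the original notes): setting the
## seed makes the "random" draws reproducible. If we set the same seed
## twice, sample() returns the same values both times. The object names
## first.try and second.try are just for illustration.

set.seed(12345)
first.try <- sample(10, size = 3)   ## three draws from 1:10
set.seed(12345)
second.try <- sample(10, size = 3)  ## same seed, same draws
identical(first.try, second.try)    ## TRUE

set.seed(12345) ## re-set the seed so the sampling below is unaffected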
## Now...
n.precincts <- nrow(precincts) ## number of rows
N <- 40 ## sample size
rand.rows <- sample(n.precincts, size = N, replace = FALSE)
rand.rows

## Now that we have the random rows, we can grab those rows
## from the actual data and put them into a new matrix:

mypoll <- precincts[rand.rows,]

## or we can put everything on one line
mypoll <- precincts[sample(nrow(precincts), size = N, replace = FALSE),]

## and we can treat this as a nice little subset of the data
summary(mypoll)
head(mypoll)
dim(mypoll)

####################
## Conf Intervals ##
####################

## recall the formula for the 95% CI
## let's calculate the mean, standard deviation, and 95% CI for this sample

mu.b <- mean(mypoll$black)
sd.b <- sd(mypoll$black)
mu.b
sd.b

## we'll just use the formulas for confidence intervals
mu.b + 1.96*sd.b/sqrt(N)
mu.b - 1.96*sd.b/sqrt(N)

## where does the 1.96 come from?
## (draw on board.)

par(mfrow = c(1,1))
ruler <- seq(-4,4,.01)
plot(ruler, dnorm(ruler), type = "l", xlab = "standard normal")

(1-.95)/2 ## because this is two-tailed
qnorm(1 - .025)
qnorm(0 + .025)

ruler <- seq(-4,4,.01)
plot(ruler, dnorm(ruler), type = "l", xlab = "standard normal")
abline(v = 1.96)
abline(v = -1.96)

## KEY QUESTIONS:
## 1. WHAT HAPPENS TO THE CONFIDENCE INTERVAL WHEN N GOES UP?
##    (RELATED: WHAT HAPPENS WHEN N < 32?)
## 2. WHAT HAPPENS TO THE CONFIDENCE INTERVAL WHEN THE STANDARD
##    DEVIATION GOES UP?
## 3. WHAT HAPPENS TO THE CONFIDENCE INTERVAL WHEN THE CONFIDENCE
##    LEVEL GOES UP?

## once the quantities are observed, nothing is random!
## explain.

###########################
## Example from the news ##
###########################

## WorldPublicOpinion.org (WPO) conducted a poll of
## 1,003 Iranians across Iran between Aug. 27 and Sept. 10, 2009.
## 642 said they had confidence in the president
## 211 said they did not have confidence in the president
## (150 said they had no opinion; let's ignore these)

## calculate the 95% conf interval of support
## among those with an opinion

iran <- matrix(data = NA, nrow = (642+211), ncol = 1)
iran[1:642] <- 1
iran[643:length(iran)] <- 0

mean(iran)
sd(iran)

mean(iran) + 1.96*(sd(iran)/sqrt(length(iran))) ## upper bound
mean(iran) - 1.96*(sd(iran)/sqrt(length(iran))) ## lower bound
## (1.96*sd(iran)/sqrt(length(iran)) is the margin of error)

#########################################
## Conf Intervals w/ repeated sampling ##
#########################################

## We're going to pretend we have 20 independent researchers out in
## the field polling precincts in Fulton. Each of the 20 will
## poll 40 precincts and then calculate the mean, sd, and 95% CI.

set.seed(12345) ## again, remember seed setting!
N <- 40 ## sample size
pollsters <- 20 ## number of pollsters
poll.means <- matrix(data = NA, ncol = 1, nrow = pollsters)
poll.sds <- matrix(data = NA, ncol = 1, nrow = pollsters) ## holder matrices

for (i in 1:pollsters) {
  poll.surveyed <- precincts[sample(nrow(precincts), size = N, replace = FALSE),]
  poll.means[i] <- mean(poll.surveyed$black)
  poll.sds[i] <- sd(poll.surveyed$black)
}

## Note that we didn't save the CI in each iteration. That's because
## we can create the CIs outside the loop much more quickly

ci.upper <- poll.means + 1.96*poll.sds/sqrt(N)
ci.lower <- poll.means - 1.96*poll.sds/sqrt(N)

## Let's plot the sampling distribution of the means
plot(density(poll.means))
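## An aside (my addition, not part of the original notes): instead of
## hard-coding 1.96, we can pull the critical value from qnorm() and wrap
## the CI formula in a small helper function. The name "normal.ci" is
## just illustrative.

normal.ci <- function(x, level = 0.95) {
  crit <- qnorm(1 - (1 - level)/2)  ## approx. 1.96 when level = .95
  se <- sd(x)/sqrt(length(x))       ## standard error of the mean
  c(lower = mean(x) - crit*se, upper = mean(x) + crit*se)
}

normal.ci(mypoll$black)       ## should match the hand calculation above
normal.ci(mypoll$black, .99)  ## a wider, 99% interval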
## Next, let's plot the CIs themselves...
## this is tricky, but we'll get through it.

## First, we want to get comfortable with the "plot" command

x <- c(1:10) ## just 1-10
y <- rnorm(length(x)) ## draws from a standard normal
x
y

plot(x,y)
plot(x,y, type = "l") ## lines
plot(x,y, type = "b") ## both lines and dots
plot(x,y, type = "s") ## stair steps

## back to the task at hand...
## we need to create a ruler that will denote
## the vertical positions of the CIs

ruler <- 1:pollsters

## Now, let's open up a plotting window and plot the means first.
## I'm using "pch=19" to get solid dots and xlim=c(0,1) because this
## is a proportion, so I know things can't be bigger than that.

plot(x=poll.means, y=ruler, pch=19, col="orange", xlim=c(0,1))

## all "ruler" is at this point is an index indicating
## which pollster reported which poll.mean

## The next plotting command is "segments", which will draw lines
## on the plot. We'll use this to plot the CIs. Note that the
## arguments are segments(x0, y0, x1, y1, ...), where
## (x0,y0) is the beginning point of the line and
## (x1,y1) is the ending point of the line.

## Let's do a test:
segments(x0=.2, y0=3, x1=.8, y1=20, col="orange")
## the starting point here was (.2, 3)
## and the ending point was (.8, 20)

## But that doesn't look too good, so let's start over:
plot(x=poll.means, y=ruler, pch=19, col="orange", xlim=c(0,1))

## each of x0, x1, y0, y1 can be a vector, and segments draws one line
## for each set of values with the same index. So we have:
segments(x0 = ci.lower, x1 = ci.upper, y0 = ruler, y1 = ruler, col = "orange")

## In addition, let's draw a vertical line at the true mean. We'll use
## the abline(v=) command.
abline(v=mean(precincts$black), col="red")

## we might want to see what fraction of the CIs contain the true mean
mean(ci.upper > mean(precincts$black) & ci.lower < mean(precincts$black))

########################
## hypothesis testing ##
########################

## ok, let's say a fancy pollster thinks that there's
## no way that the turnout rate was more than 35%.
## Let's use hypothesis testing to test his claim.

## STEP 1: take a sample
## why? otherwise, you would just know the truth

hypo.samp <- precincts[sample(nrow(precincts), 200),]

## STEP 2: take the sample mean (i.e., the estimator you are
## interested in) and the sample standard deviation

samp.mean <- mean(hypo.samp$turnout)
samp.sd <- sd(hypo.samp$turnout)

## STEP 3: set up a test statistic
## it can be a lot of things, but generally people
## normalize the estimate to the standard normal and then
## check to see how unusual (or not) that test statistic would be.
## note that we divide by the standard error based on this sample's
## size (200 precincts), not the earlier N of 40:

test.stat <- (samp.mean - .35)/(samp.sd/sqrt(nrow(hypo.samp)))

## STEP 4: calculate rejection regions
## so for alpha = 0.05 and 0.01:

qnorm(1 - .025) ## for alpha = 0.05 (why??)
qnorm(0 + .025) ## for alpha = 0.05
qnorm(1 - .005) ## for alpha = 0.01 (why??)
qnorm(0 + .005) ## for alpha = 0.01

## let's plot these against a standard normal to see what they look like

ruler <- seq(-6,6,.01)
plot(ruler, dnorm(ruler), type = "l", xlab = "standard normal")
abline(v = c(qnorm(1 - .025), qnorm(0 + .025)), col = "red")
abline(v = c(qnorm(1 - .005), qnorm(0 + .005)), col = "blue")
abline(v = test.stat)
legend(x = "topleft", legend = c("alpha = .05", "alpha = .01", "test stat"),
       lwd = 2, col = c("red", "blue", "black"))

## cool!
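## A possible follow-up (my addition, not part of the original notes):
## we can convert the test statistic into a two-sided p-value with
## pnorm(), and compare our by-hand z test against R's built-in t.test()
## with the null value set to .35. With 200 observations the
## normal-based and t-based answers should be very close.

2 * (1 - pnorm(abs(test.stat)))      ## two-sided p-value for the z test
t.test(hypo.samp$turnout, mu = .35)  ## built-in t test of the same null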