this solution is recommended, credits to Jessica Lydia Moore

# Set-Up Data
candidate = c("Smith","Jones","Martinelli","Wagner","Others")
votes = c(442,208,460,180,205)
t(data.frame(candidate=candidate,votes=votes))

##           [,1]    [,2]    [,3]         [,4]     [,5]    
## candidate "Smith" "Jones" "Martinelli" "Wagner" "Others"
## votes     "442"   "208"   "460"        "180"    "205"

# Get Estimated Proportions
pm <- votes[3]/sum(votes)
ps <- votes[1]/sum(votes)

# Construct CI
point.est <- pm - ps
se <- sqrt((pm+ps-(pm-ps)^2)/sum(votes))
halfinterval <- qnorm(.975)*se
lb <- point.est - halfinterval
ub <- point.est + halfinterval
c(lb,ub)

## [1] -0.02732919  0.05140946

Since 0 is in the confidence interval, there is not a statistically significant difference between the two front-runners at the 0.05 level.

Exercise 1

We establish 95% confidence intervals for the top two candidates (Martinalli and Smith, respectively) by using bootstrap sampling. We then create density plots with lower and upper bounds to see how the voting distribution compares for the two candidates.

candidate <- c("Smith","Jones","Martinelli","Wagner","Others") 
votes <- c(442,208,460,180,205)
voting.data <- data.frame(candidate=candidate,votes=votes)
top.inds <- c(which(votes == max(votes)), which(votes == sort(votes, decreasing = TRUE)[2]))
top.runners <- voting.data[top.inds,]
top.runners <- cbind(top.runners, p = c(top.runners[,"votes"]/sum(votes)))

votes.boot <- c(rep(1, top.runners[1,"votes"]), rep(2, top.runners[2,"votes"]), rep(3, sum(votes) - sum(top.runners[,"votes"])) )
B <- 10000
vote.bootstrap.sample <- function() {
  s <- sample(votes.boot, replace = TRUE)
  s.1 <- length(which(s == 1)) / length(s)
  s.2 <- length(which(s == 2)) / length(s)
  return(c(s.1, s.2))
}
theta.bootstrap <- replicate(B, vote.bootstrap.sample())

m <- 0.05/2*B
CI.M.lower <- sort(theta.bootstrap[1,])[m] 
CI.M.upper <- sort(theta.bootstrap[1,])[B-m]
CI.S.lower <- sort(theta.bootstrap[2,])[m] 
CI.S.upper <- sort(theta.bootstrap[2,])[B-m]

Martinelli <- data.frame(theta.bs = theta.bootstrap[1,], name = "Martinelli", CI.lower = CI.M.lower, CI.upper = CI.M.upper)
Smith <- data.frame(theta.bs = theta.bootstrap[2,], name = "Smith", CI.lower = CI.S.lower, CI.upper = CI.S.upper)
votes.bs <- rbind(Martinelli, Smith)

library(ggplot2)
ggplot(votes.bs, aes(theta.bs, fill = name)) + geom_density(alpha = 0.2) + geom_vline(xintercept = c(CI.M.lower, CI.M.upper), col = "red") + geom_vline(xintercept = c(CI.S.lower, CI.S.upper), col = "blue")

We see from the density plots and the bounds for each candidate that there is considerable overlap of the two distributions, which means that there is no significant difference between the two front runners.

Exercise 2

n <- 500
set.seed(0)
x <- rpois(n, 3)
x[x >= 8] <- 7
hist(x, breaks = seq(-0.5,7.5,by=1))

range <- 0:7
oc <- apply(array(range), 1, FUN = function(y) length(which(x == y)))
p.hat <- sum(range*oc) / (n * 7)
pmf <- dbinom(range, size = 7, prob = p.hat)
rbind(range, round(pmf, 3))

##       [,1]  [,2]  [,3]  [,4] [,5]  [,6]  [,7]  [,8]
## range 0.00 1.000 2.000 3.000 4.00 5.000 6.000 7.000
##       0.02 0.105 0.235 0.294 0.22 0.099 0.025 0.003

test.result <- chisq.test(oc, p = pmf)

## Warning in chisq.test(oc, p = pmf): Chi-squared approximation may be
## incorrect

pchisq(test.result$statistic, df = 6, lower.tail = FALSE)

##    X-squared 
## 1.258542e-47

With a p-value of 1.26e-47, we reject the null hypothesis that the sample has a binomial distribution with n=7.

Exercise 3

CI.independence <- function(C, diff = FALSE, alpha = 0.05) {
  n <- sum(C)
  p1 <- C[1,1] / n
  df <- (ncol(C) - 1) * (nrow(C) - 1)
  z.alpha <- qnorm(1 - alpha/2)
  if(diff) {
    p2 <- C[1,2] / n
    p12 <- p1 - p2
    diff <- z.alpha * sqrt((p1 + p2 - (p12)^2) / n)
    return(c(p12 - diff, p12 + diff))
  } else {
    diff <- z.alpha * sqrt((p1 * (1 - p1)) / n)
    return(c(p1 - diff, p1 + diff))
  }
}

CI.homogeneity <- function(C, diff = FALSE, alpha = 0.05) {
  n <- sum(C)
  n1 <- sum(C[1,])
  p1 <- C[1,1] / n1
  df <- (ncol(C) - 1) * (nrow(C) - 1)
  z.alpha <- qnorm(1 - alpha/2)
  if(diff) {
    p2 <- C[1,2] / n1
    p12 <- p1 - p2
    diff <- z.alpha * sqrt(p1*(1-p1)/n1 + p2*(1-p2)/n1)
    return(c((p12 - diff)*n1/n, (p12 + diff)*n1/n))
  } else {
    diff <- z.alpha * sqrt((p1 * (1 - p1)) / n1)
    return(c((p1 - diff)*n1/n, (p1 + diff)*n1/n))
  }
}

test.contingency <- matrix(round(runif(21, min=1, max=20)), nrow=7, ncol=3)

# homogeneity
CI.homogeneity(test.contingency, diff = FALSE)

## [1] -0.002706394  0.019032925

# independence
CI.independence(test.contingency, diff = FALSE)

## [1] -0.003103963  0.019430494

We see that the test for homogeneity has a slightly more confined 95% confidence interval for p11 than the test for independence.

# homogeneity
CI.homogeneity(test.contingency, diff = TRUE)

## [1] -0.050814429 -0.006328429

# independence
CI.independence(test.contingency, diff = TRUE)

## [1] -0.054861628 -0.002281229

Again, we see that the test for homogeneity has a slightly more confined 95% confidence interval for p11-p12 than the test for independence.

Exercise 4

library(languageR) 
library(vcd)

## Loading required package: grid

library(ca)
if(!file.exists('wordcounts.RData')) {
  data(alice, moby, oz) 
  words <- unique(c(unique(alice), unique(moby), unique(oz)))
  word.counts <- apply(array(words), 1, FUN = function(x) {
    c1 <- length(which(alice == x))
    c2 <- length(which(moby == x))
    c3 <- length(which(oz == x))
    return(c(c1, c2, c3))
  })
  save(words, file = 'words.RData')
  save(word.counts, file = 'wordcounts.RData')
} else {
  load('words.RData')
  load('wordcounts.RData')
}

c.lim <- 200
low.ind <- unique(c(which(word.counts[1,] < c.lim), which(word.counts[2,] < c.lim), which(word.counts[3,] < c.lim)))
wc.final <- word.counts[,-low.ind]
words.final <- words[-low.ind]
rownames(wc.final) <- c("alice", "moby", "oz")
colnames(wc.final) <- words.final
wc.final

##          I   the  was   to   of her  and   it   in    a   as that   at
## alice  545  1522  352  721  497 243  796  527  354  614  246  275  202
## moby  2122 13717 1632 4514 6512 329 6008 2209 3908 4551 1620 2982 1230
## oz     647  2745  499 1098  820 402 1592  353  463  795  303  366  233
##       said you
## alice  456 345
## moby   302 839
## oz     332 448

mosaic(wc.final, shade = TRUE, legend = TRUE)

assoc(wc.final, shade = TRUE, legend = TRUE)

fit <- ca(wc.final)
plot(fit) # symmetric map

plot(fit, mass = TRUE, contrib = "absolute", map = "rowgreen", arrows = c(FALSE, TRUE)) # asymmetric map

We see that the words that contribute the most to inhomogeneity between the books are: ‘said’, ‘her’, ‘you’, ‘of’, and ‘in’. We see this represented in the dimensions and Pearson residuals for the rectangles corresponding with these words on the mosaic and association plots, as well as the placement of these words, especially along the x-axis (which accounts for 89.4% of the variation), on the two correspondence analysis graphs.

STATS 205 - HW3

Jonathan Fisk

April 18, 2016

Exercise 1

this solution is recommended, credits to Jessica Lydia Moore

Exercise 1

Exercise 2

Exercise 3

Exercise 4