# Set-Up Data
candidate = c("Smith","Jones","Martinelli","Wagner","Others")
votes = c(442,208,460,180,205)
t(data.frame(candidate=candidate,votes=votes))
## [,1] [,2] [,3] [,4] [,5]
## candidate "Smith" "Jones" "Martinelli" "Wagner" "Others"
## votes "442" "208" "460" "180" "205"
# Get Estimated Proportions
pm <- votes[3]/sum(votes)
ps <- votes[1]/sum(votes)
# Construct CI
point.est <- pm - ps
se <- sqrt((pm+ps-(pm-ps)^2)/sum(votes))
halfinterval <- qnorm(.975)*se
lb <- point.est - halfinterval
ub <- point.est + halfinterval
c(lb,ub)
## [1] -0.02732919 0.05140946
Since 0 is in the confidence interval, there is not a statistically significant difference between the two front-runners at the 0.05 level.
We establish 95% confidence intervals for the top two candidates (Martinalli and Smith, respectively) by using bootstrap sampling. We then create density plots with lower and upper bounds to see how the voting distribution compares for the two candidates.
candidate <- c("Smith","Jones","Martinelli","Wagner","Others")
votes <- c(442,208,460,180,205)
voting.data <- data.frame(candidate=candidate,votes=votes)
top.inds <- c(which(votes == max(votes)), which(votes == sort(votes, decreasing = TRUE)[2]))
top.runners <- voting.data[top.inds,]
top.runners <- cbind(top.runners, p = c(top.runners[,"votes"]/sum(votes)))
votes.boot <- c(rep(1, top.runners[1,"votes"]), rep(2, top.runners[2,"votes"]), rep(3, sum(votes) - sum(top.runners[,"votes"])) )
B <- 10000
vote.bootstrap.sample <- function() {
s <- sample(votes.boot, replace = TRUE)
s.1 <- length(which(s == 1)) / length(s)
s.2 <- length(which(s == 2)) / length(s)
return(c(s.1, s.2))
}
theta.bootstrap <- replicate(B, vote.bootstrap.sample())
m <- 0.05/2*B
CI.M.lower <- sort(theta.bootstrap[1,])[m]
CI.M.upper <- sort(theta.bootstrap[1,])[B-m]
CI.S.lower <- sort(theta.bootstrap[2,])[m]
CI.S.upper <- sort(theta.bootstrap[2,])[B-m]
Martinelli <- data.frame(theta.bs = theta.bootstrap[1,], name = "Martinelli", CI.lower = CI.M.lower, CI.upper = CI.M.upper)
Smith <- data.frame(theta.bs = theta.bootstrap[2,], name = "Smith", CI.lower = CI.S.lower, CI.upper = CI.S.upper)
votes.bs <- rbind(Martinelli, Smith)
library(ggplot2)
ggplot(votes.bs, aes(theta.bs, fill = name)) + geom_density(alpha = 0.2) + geom_vline(xintercept = c(CI.M.lower, CI.M.upper), col = "red") + geom_vline(xintercept = c(CI.S.lower, CI.S.upper), col = "blue")
We see from the density plots and the bounds for each candidate that there is considerable overlap of the two distributions, which means that there is no significant difference between the two front runners.
n <- 500
set.seed(0)
x <- rpois(n, 3)
x[x >= 8] <- 7
hist(x, breaks = seq(-0.5,7.5,by=1))
range <- 0:7
oc <- apply(array(range), 1, FUN = function(y) length(which(x == y)))
p.hat <- sum(range*oc) / (n * 7)
pmf <- dbinom(range, size = 7, prob = p.hat)
rbind(range, round(pmf, 3))
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## range 0.00 1.000 2.000 3.000 4.00 5.000 6.000 7.000
## 0.02 0.105 0.235 0.294 0.22 0.099 0.025 0.003
test.result <- chisq.test(oc, p = pmf)
## Warning in chisq.test(oc, p = pmf): Chi-squared approximation may be
## incorrect
pchisq(test.result$statistic, df = 6, lower.tail = FALSE)
## X-squared
## 1.258542e-47
With a p-value of 1.26e-47, we reject the null hypothesis that the sample has a binomial distribution with n=7.
CI.independence <- function(C, diff = FALSE, alpha = 0.05) {
n <- sum(C)
p1 <- C[1,1] / n
df <- (ncol(C) - 1) * (nrow(C) - 1)
z.alpha <- qnorm(1 - alpha/2)
if(diff) {
p2 <- C[1,2] / n
p12 <- p1 - p2
diff <- z.alpha * sqrt((p1 + p2 - (p12)^2) / n)
return(c(p12 - diff, p12 + diff))
} else {
diff <- z.alpha * sqrt((p1 * (1 - p1)) / n)
return(c(p1 - diff, p1 + diff))
}
}
CI.homogeneity <- function(C, diff = FALSE, alpha = 0.05) {
n <- sum(C)
n1 <- sum(C[1,])
p1 <- C[1,1] / n1
df <- (ncol(C) - 1) * (nrow(C) - 1)
z.alpha <- qnorm(1 - alpha/2)
if(diff) {
p2 <- C[1,2] / n1
p12 <- p1 - p2
diff <- z.alpha * sqrt(p1*(1-p1)/n1 + p2*(1-p2)/n1)
return(c((p12 - diff)*n1/n, (p12 + diff)*n1/n))
} else {
diff <- z.alpha * sqrt((p1 * (1 - p1)) / n1)
return(c((p1 - diff)*n1/n, (p1 + diff)*n1/n))
}
}
test.contingency <- matrix(round(runif(21, min=1, max=20)), nrow=7, ncol=3)
# homogeneity
CI.homogeneity(test.contingency, diff = FALSE)
## [1] -0.002706394 0.019032925
# independence
CI.independence(test.contingency, diff = FALSE)
## [1] -0.003103963 0.019430494
We see that the test for homogeneity has a slightly more confined 95% confidence interval for p11 than the test for independence.
# homogeneity
CI.homogeneity(test.contingency, diff = TRUE)
## [1] -0.050814429 -0.006328429
# independence
CI.independence(test.contingency, diff = TRUE)
## [1] -0.054861628 -0.002281229
Again, we see that the test for homogeneity has a slightly more confined 95% confidence interval for p11-p12 than the test for independence.
library(languageR)
library(vcd)
## Loading required package: grid
library(ca)
if(!file.exists('wordcounts.RData')) {
data(alice, moby, oz)
words <- unique(c(unique(alice), unique(moby), unique(oz)))
word.counts <- apply(array(words), 1, FUN = function(x) {
c1 <- length(which(alice == x))
c2 <- length(which(moby == x))
c3 <- length(which(oz == x))
return(c(c1, c2, c3))
})
save(words, file = 'words.RData')
save(word.counts, file = 'wordcounts.RData')
} else {
load('words.RData')
load('wordcounts.RData')
}
c.lim <- 200
low.ind <- unique(c(which(word.counts[1,] < c.lim), which(word.counts[2,] < c.lim), which(word.counts[3,] < c.lim)))
wc.final <- word.counts[,-low.ind]
words.final <- words[-low.ind]
rownames(wc.final) <- c("alice", "moby", "oz")
colnames(wc.final) <- words.final
wc.final
## I the was to of her and it in a as that at
## alice 545 1522 352 721 497 243 796 527 354 614 246 275 202
## moby 2122 13717 1632 4514 6512 329 6008 2209 3908 4551 1620 2982 1230
## oz 647 2745 499 1098 820 402 1592 353 463 795 303 366 233
## said you
## alice 456 345
## moby 302 839
## oz 332 448
mosaic(wc.final, shade = TRUE, legend = TRUE)
assoc(wc.final, shade = TRUE, legend = TRUE)
fit <- ca(wc.final)
plot(fit) # symmetric map
plot(fit, mass = TRUE, contrib = "absolute", map = "rowgreen", arrows = c(FALSE, TRUE)) # asymmetric map
We see that the words that contribute the most to inhomogeneity between the books are: ‘said’, ‘her’, ‘you’, ‘of’, and ‘in’. We see this represented in the dimensions and Pearson residuals for the rectangles corresponding with these words on the mosaic and association plots, as well as the placement of these words, especially along the x-axis (which accounts for 89.4% of the variation), on the two correspondence analysis graphs.