We are going to look at a modern application where we encounter two-sample problems very frequently. In neuroscience, we often compare two populations, such as autistic individuals and healthy controls, in the hope of finding morphological, connectivity, and functional differences between the two groups that could inform new therapies and cures.
Our running example works directly on preprocessed neuroimaging data from the Autism Brain Imaging Data Exchange (ABIDE). The data is openly available on the ABIDE website.
ABIDE is a collaboration of 16 international imaging sites that have aggregated and are openly sharing neuroimaging data from 539 individuals diagnosed with ASD and 573 typical controls. For now, we will only use a subset of their data to speed up our analysis and focus on 40 participants.
First, we load all images and store them in a single matrix with one row per participant and one column per voxel.
library(SimpleITK)
library(oro.nifti)
## oro.nifti 0.5.2
# convert a flat voxel vector back into a SimpleITK image that shares the
# geometry (size, spacing, origin, direction) of a reference image
makeImage = function(vecImg, refImg) {
  arrImg = array(vecImg, dim = refImg$GetSize())
  newImg = as.image(arr = arrImg, spacing = refImg$GetSpacing(), origin = refImg$GetOrigin())
  newImg$CopyInformation(refImg)
  return(newImg)
}
fileNames = list.files(pattern = "_anat_thickness.nii.gz")
nBrains = length(fileNames)
img = ReadImage(fileNames[1])
orthographic(as.array(img))
# flatten each participant's image into a vector (one row per participant after rbind)
brainList = lapply(1:nBrains, function(i) c(as.array(ReadImage(fileNames[i]))))
brainMatrix = do.call(rbind, brainList)
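As a quick sanity check (a sketch, not part of the pipeline), makeImage should invert the flattening we just used to build brainMatrix:
# reconstruct the first participant's image from its row in brainMatrix;
# the round trip should match the original voxel for voxel
roundTrip = makeImage(brainMatrix[1,], img)
stopifnot(all(as.array(roundTrip) == as.array(img)))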
Then, we compute the image mask. In this example, the mask consists of the voxel positions whose thickness is greater than zero for all participants.
minThickness = apply(brainMatrix, 2, min)
mask = minThickness > 0
maskImg = makeImage(mask,img)
WriteImage(maskImg,"mask.nii.gz")
## NULL
orthographic(as.array(maskImg))
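Before moving on, a quick sketch shows how restrictive the mask is; the number of in-mask voxels equals the number of voxelwise tests we will run below:
sum(mask)   # number of in-mask voxels, i.e. the number of tests
mean(mask)  # fraction of the volume retained by the mask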
Split the data into two groups: autistic participants (DX_GROUP 1) and healthy controls (DX_GROUP 2).
ParticipantAndImageInformation <- read.csv("Phenotypic_V1_0b_preprocessed1.csv", header=TRUE)
# the first 16 characters of each file name correspond to the FILE_ID in the phenotype table
copiedFilesInd = ParticipantAndImageInformation$FILE_ID %in% substr(fileNames,1,16)
copiedFilesGroup = ParticipantAndImageInformation$DX_GROUP[copiedFilesInd]
ThicknessAutistic = brainMatrix[copiedFilesGroup==1,mask==1]
ThicknessHealthy = brainMatrix[copiedFilesGroup==2,mask==1]
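To verify the split, a one-line check shows the group sizes:
table(copiedFilesGroup)  # 1 = autistic, 2 = healthy controls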
Perform a voxelwise nonparametric Wilcoxon two-sample rank test.
library(ggplot2)
uncorrectedPValues = sapply(1:dim(ThicknessAutistic)[2],
  function(i) wilcox.test(ThicknessAutistic[,i],ThicknessHealthy[,i],alternative = "two.sided")$p.value)
uncorrectedPValuesVec = rep(1,length(mask))
uncorrectedPValuesVec[mask==1] = uncorrectedPValues
uncorrectedPValuesImg = makeImage(uncorrectedPValuesVec,img)
WriteImage(uncorrectedPValuesImg,"uncorrectedPValueImage.nii.gz")
## NULL
orthographic(as.array(uncorrectedPValuesImg))
# significance levels: primary is the cluster-forming threshold, alpha the family-wise error rate
primary = 0.001
primary
## [1] 0.001
alpha = 0.05
alpha
## [1] 0.05
# maximum cluster size
thresholdedImg = BinaryThreshold(uncorrectedPValuesImg,
                                 lowerThreshold=0,
                                 upperThreshold=primary,
                                 insideValue=1,outsideValue=0)
# label connected components; RelabelComponent orders labels by decreasing size
thresholdedSegmentedImg = RelabelComponent(ConnectedComponent(thresholdedImg))
thresholdedSegmentedArr = as.array(thresholdedSegmentedImg)
# observed cluster sizes: one row per cluster label, Freq = voxel count
obsvComps = data.frame(Count=table(thresholdedSegmentedArr[thresholdedSegmentedArr > 0]))
head(obsvComps)
## Count.Var1 Count.Freq
## 1 1 77
## 2 2 72
## 3 3 60
## 4 4 44
## 5 5 39
## 6 6 39
orthographic(as.array(thresholdedImg))
labels = floor(seq(1,length(obsvComps$Count.Var1),length.out=20))
ggplot(obsvComps, aes(x = Count.Var1, y = Count.Freq)) +
  geom_bar(stat="identity") +
  scale_x_discrete(breaks=labels, labels=as.character(labels))
The problem is that we performed many statistical tests: if we declare every voxel below the usual significance level as significant, we will report many purely random results as significant. We can handle this with another nonparametric approach, permutation testing; the underlying theory will be treated in the next lecture.
This method is described here.
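Before turning to permutations, a minimal sketch (reusing mask, alpha, and uncorrectedPValues from above) shows the scale of the problem; the Bonferroni correction used here controls the family-wise error rate but is typically far too conservative for imaging data, which motivates the cluster-based permutation approach below:
nVoxels = sum(mask)                       # number of tests performed
sum(uncorrectedPValues < alpha)           # voxels significant without any correction
sum(uncorrectedPValues < alpha/nVoxels)   # voxels surviving a Bonferroni correction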
library(foreach)
library(doMC)
## Loading required package: iterators
## Loading required package: parallel
cores = 16
registerDoMC(cores)
seed = 1234
set.seed(seed)
nperm = 1000
ThicknessCombined = brainMatrix[,mask==1]
maxClusterFileName = paste0("maxCluster.seed",seed,".Rdata")
if(!file.exists(maxClusterFileName)) {
  ptm = proc.time()
  sink("log-permutations.txt")
  maxValues = foreach(i = 1:nperm, .combine = rbind) %dopar% {
    print(paste0("GroupAssignments no. ",i))
    # permute the group labels to simulate the null hypothesis of no group difference
    GroupAssignments = sample(copiedFilesGroup)
    ThicknessGroupA = ThicknessCombined[GroupAssignments==1,]
    ThicknessGroupB = ThicknessCombined[GroupAssignments==2,]
    uncorrectedPValues = sapply(1:dim(ThicknessGroupA)[2],
      function(j) wilcox.test(ThicknessGroupA[,j],ThicknessGroupB[,j],alternative = "two.sided")$p.value)
    # maximum cluster size for this permutation
    uncorrectedPValuesVec = rep(1,length(mask))
    uncorrectedPValuesVec[mask==1] = uncorrectedPValues
    uncorrectedPValuesImg = makeImage(uncorrectedPValuesVec,img)
    thresholdedImg = BinaryThreshold(uncorrectedPValuesImg,
                                     lowerThreshold=0,
                                     upperThreshold=primary,
                                     insideValue=1,outsideValue=0)
    thresholdedSegmentedImg = RelabelComponent(ConnectedComponent(thresholdedImg))
    thresholdedSegmentedArr = as.array(thresholdedSegmentedImg)
    permComps = data.frame(Count=table(thresholdedSegmentedArr[thresholdedSegmentedArr > 0]))
    max(permComps$Count.Freq)
  }
  maxCluster = simplify2array(maxValues)
  save(maxCluster,file = maxClusterFileName)
  sink()
  proc.time() - ptm
} else {
  load(maxClusterFileName)
}
Plot the null distribution of maximum cluster sizes; the red line marks the critical cluster size at level alpha.
alphaLargestInd = ceiling(alpha*nperm)
alphaLargest = sort(maxCluster,decreasing = TRUE)[alphaLargestInd]
maxCluster2 = data.frame(maxCluster=maxCluster)
ggplot(maxCluster2, aes(maxCluster)) +
  geom_histogram(binwidth = 5) +
  geom_vline(xintercept = alphaLargest,colour = "red",size = 1)
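The same null distribution also yields an empirical corrected p-value for any observed cluster; a short sketch for the largest one:
# fraction of permutations whose maximum cluster size is at least as large
# as the largest observed cluster
largestObserved = max(obsvComps$Count.Freq)
mean(maxCluster >= largestObserved)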
labels = floor(seq(1,length(obsvComps$Count.Var1),length.out=20))
ggplot(obsvComps, aes(x = Count.Var1, y = Count.Freq)) +
  geom_bar(stat="identity") +
  scale_x_discrete(breaks=labels, labels=as.character(labels)) +
  geom_hline(yintercept = alphaLargest,colour = "red",size = 1)
All clusters above the red horizontal line in the plot above are declared significant. Now we can go back to the image and color all significant voxel clusters.
nSignificantClusters = sum(obsvComps$Count.Freq >= alphaLargest)
nSignificantClusters
## [1] 0
# only keep significant clusters: RelabelComponent assigns labels in decreasing
# order of size, so labels 1..nSignificantClusters are exactly the clusters at
# least as large as the critical value
thresholdedSegmentedArrSignificant = thresholdedSegmentedArr
thresholdedSegmentedArrSignificant[nSignificantClusters < thresholdedSegmentedArr] = 0
WriteImage(makeImage(thresholdedSegmentedArrSignificant,img),"significant.clusters.nii.gz")
## NULL
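As a final one-line check, we can count the voxels that fall inside significant clusters; consistent with nSignificantClusters above, the count here is zero:
sum(thresholdedSegmentedArrSignificant > 0)  # voxels inside significant clusters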