Introduction

This document contains code used for the Flow-Through Centrality project, to test whether which method of computing centrality is the most stable. The R package coin is used to to obtain permutation-based tests of equality of variance. We also need the R package ``tidyverse’’ to massage the data into the appropriate format for hypothesis testing. The references are given below.

Torsten Hothorn, Kurt Hornik, Mark A. van de Wiel, Achim Zeileis (2008). Implementing a Class of Permutation Tests: The coin Package. Journal of Statistical Software 28(8), 1-23. URL http://www.jstatsoft.org/v28/i08/

R Core Team (2018). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/ (version 3.5.2).

Hadley Wickham (2017). tidyverse: Easily Install and Load the ‘Tidyverse’. R package version 1.2.1. https://CRAN.R-project.org/package=tidyverse

The functions we use from the coin package are fligner_test for the main hypothesis test. The data need to be in long format and balanced across the groups.

The simulated data are in “~/Dropbox/2019 Spring/FlowThroughCentrality/Simulations”. This file combines the results of where the number of nodes is 50, the average degree is 6, and the perturbations are either 3 percent, 6 percent, 9 percent, or 12 percent. A table containing all results can be found at the end of this document.

setwd("~/Dropbox/2019 Spring/FlowThroughCentrality/Simulations/Nodes50Deg6Pert3")
#install.packages("coin") - use if you have not installed the package.
library(coin) # load the package in R 
## Loading required package: survival
suppressPackageStartupMessages(library(tidyverse)) # for tidying the data
library(xtable) # for obtaining tables in LaTex format

50 Nodes with Six degrees and 3% perturbation

# Create variable names for all data sets
varNames <- c("Network",paste0(c("Cent","Tier"), rep(1:50,each=2)))
dataBW <- read.csv(file="Nodes50_AvgDeg06_UW_01_Pert03_Betweenness.csv",skip=5,nrows=100,header=FALSE)
colnames(dataBW) <- varNames
dataBW$Measure <- rep("BW",100)
dataCL <- read.csv(file="Nodes50_AvgDeg06_UW_01_Pert03_Closeness.csv",skip=5,nrows=100,header=FALSE)
colnames(dataCL) <- varNames
dataCL$Measure <- rep("CL",100)
dataFB <- read.csv(file="Nodes50_AvgDeg06_UW_01_Pert03_FlowBetween.csv",skip=5,nrows=100,header=FALSE)
colnames(dataFB) <- varNames
dataFB$Measure <- rep("FB",100)
dataFT <- read.csv(file="Nodes50_AvgDeg06_UW_01_Pert03_Flowthrough.csv",skip=5,nrows=100,header=FALSE)
colnames(dataFT) <- varNames
dataFT$Measure <- rep("FT",100)
dataSB <- read.csv(file="Nodes50_AvgDeg06_UW_01_Pert03_StableBetween.csv",skip=5,nrows=100,header=FALSE)
colnames(dataSB) <- varNames
dataSB$Measure <- rep("SB",100)

## Create tibbles with first six columns of each file
dataBWfirst6 <- dplyr::select(dataBW,c(Network:Tier6,Measure))
dataCLfirst6 <- dplyr::select(dataCL,c(Network:Tier6,Measure))
dataFBfirst6 <- dplyr::select(dataFB,c(Network:Tier6,Measure))
dataFTfirst6 <- dplyr::select(dataFT,c(Network:Tier6,Measure))
dataSBfirst6 <- dplyr::select(dataSB,c(Network:Tier6,Measure))
dataAllFirst6 <- bind_rows(dataBWfirst6,dataCLfirst6,dataFBfirst6,dataFTfirst6, dataSBfirst6)
dataAllFirst6$Measure <- as.factor(dataAllFirst6$Measure)

## Data in long format
tier6n50deg6pert3a <- gather(dataAllFirst6,key=CentLevel,value=Centrality,Cent1,Cent2,Cent3,Cent4,Cent5,Cent6,factor_key=TRUE) 
tier6n50deg6pert3b <- gather(dataAllFirst6,key=Level,value=Tier, Tier1,Tier2,Tier3,Tier4,Tier5,Tier6, factor_key=TRUE)
tier6n50deg6pert3 <-merge(tier6n50deg6pert3a,tier6n50deg6pert3b,by=c("Network","Measure"))
tier6n50deg6pert3 <- separate(tier6n50deg6pert3,Network,into=c("Trash","Sample"),sep=" ")
tier6n50deg6pert3$Sample <- as.integer(tier6n50deg6pert3$Sample)
tier6n50deg6pert3 <- tier6n50deg6pert3[,-1]

pdf(file="tier6n50deg6pert3Lines.pdf")
ggplot(tier6n50deg6pert3,aes(x=Sample, y=Centrality, colour=CentLevel)) + geom_line() + facet_grid(Measure~.,scales="free")
dev.off()
## quartz_off_screen 
##                 2

50 Nodes with Six degrees and 6% perturbation

Although there are many more tiers than 6, we stop at 6 for the sake of brevity. Furthermore, results become unpredictable after 6 tiers.

setwd("../Nodes50Deg6Pert6")
# Create variable names for all data sets
varNames <- c("Network",paste0(c("Cent","Tier"), rep(1:50,each=2)))
dataBW <- read.csv(file="Nodes50_AvgDeg06_UW_01_Pert06_Betweenness.csv",skip=5,nrows=100,header=FALSE)
colnames(dataBW) <- varNames
dataBW$Measure <- rep("BW",100)
dataCL <- read.csv(file="Nodes50_AvgDeg06_UW_01_Pert06_Closeness.csv",skip=5,nrows=100,header=FALSE)
colnames(dataCL) <- varNames
dataCL$Measure <- rep("CL",100)
dataFB <- read.csv(file="Nodes50_AvgDeg06_UW_01_Pert06_FlowBetween.csv",skip=5,nrows=100,header=FALSE)
colnames(dataFB) <- varNames
dataFB$Measure <- rep("FB",100)
dataFT <- read.csv(file="Nodes50_AvgDeg06_UW_01_Pert06_Flowthrough.csv",skip=5,nrows=100,header=FALSE)
colnames(dataFT) <- varNames
dataFT$Measure <- rep("FT",100)
dataSB <- read.csv(file="Nodes50_AvgDeg06_UW_01_Pert06_StableBetween.csv",skip=5,nrows=100,header=FALSE)
colnames(dataSB) <- varNames
dataSB$Measure <- rep("SB",100)

## Create tibbles with first six columns of each file
dataBWfirst6 <- dplyr::select(dataBW,c(Network:Tier6,Measure))
dataCLfirst6 <- dplyr::select(dataCL,c(Network:Tier6,Measure))
dataFBfirst6 <- dplyr::select(dataFB,c(Network:Tier6,Measure))
dataFTfirst6 <- dplyr::select(dataFT,c(Network:Tier6,Measure))
dataSBfirst6 <- dplyr::select(dataSB,c(Network:Tier6,Measure))
dataAllFirst6 <- bind_rows(dataBWfirst6,dataCLfirst6,dataFBfirst6,dataFTfirst6, dataSBfirst6)
dataAllFirst6$Measure <- as.factor(dataAllFirst6$Measure)

## Data in long format
tier6n50deg6pert6a <- gather(dataAllFirst6,key=CentLevel,value=Centrality,Cent1,Cent2,Cent3,Cent4,Cent5,Cent6,factor_key=TRUE) 
tier6n50deg6pert6b <- gather(dataAllFirst6,key=Level,value=Tier, Tier1,Tier2,Tier3,Tier4,Tier5,Tier6, factor_key=TRUE)
tier6n50deg6pert6 <-merge(tier6n50deg6pert6a,tier6n50deg6pert6b,by=c("Network","Measure"))
tier6n50deg6pert6 <- separate(tier6n50deg6pert6,Network,into=c("Trash","Sample"),sep=" ")
tier6n50deg6pert6$Sample <- as.integer(tier6n50deg6pert6$Sample)
tier6n50deg6pert6 <- tier6n50deg6pert6[,-1]

pdf(file="tier6n50deg6pert6Lines.pdf")
ggplot(tier6n50deg6pert6,aes(x=Sample, y=Centrality, colour=CentLevel)) + geom_line() + facet_grid(Measure~.,scales="free")
dev.off()
## quartz_off_screen 
##                 2
tier6n50deg6pert3$Perturbation <- "3%"
tier6n50deg6pert6$Perturbation <- "6%"
n50deg6all <- bind_rows(tier6n50deg6pert3,tier6n50deg6pert6)
n50deg6all$Perturbation <- as.factor(n50deg6all$Perturbation)

# Combined data frame
centSummary <- n50deg6all %>% group_by(Perturbation,CentLevel,Measure) %>% summarise(Mean=round(mean(Centrality),3),SD=round(sd(Centrality),3),CV=round(100*(sd(Centrality)/mean(Centrality)),3))

Hypothesis Testing for equality of variances

The Fligner-Kileen test is a test for homogeneity of variance in multiple independent samples. Here, we treat each measure of centrality as a different sample, and test to determine which measure is the most stable; in other words, has the lowest variance.

The modified FK test jointly ranks the absolute differences of each observation from the median observation for that group. It assigns increasing scores a quantile of the Normal distribution function, based on the ranks of all observations. Its test statistic has an asymptotic chi-square distribution with k-1 degrees of freedom, where k is the number of independent samples.

# Need to do this test by Levels
# Multiple testing correction?
flTest <- fligner_test(Centrality~Measure|CentLevel,data=tier6n50deg6pert3)
flTest
## 
##  Asymptotic K-Sample Fligner-Killeen Test
## 
## data:  Centrality by
##   Measure (BW, CL, FB, FT, SB) 
##   stratified by CentLevel
## chi-squared = 14566, df = 4, p-value < 2.2e-16
class(flTest)
## [1] "QuadTypeIndependenceTest"
## attr(,"package")
## [1] "coin"
## Approximative (Monte Carlo) Lepage test
## Hollander and Wolfe (1999, p. 172)
# lepage_trafo <- function(y) cbind("Location" = rank_trafo(y), "Scale" = ansari_trafo(y))

## Why was the null hypothesis rejected?
## Note: maximum statistic instead of quadratic form
ltm <- independence_test(Centrality ~ Measure, data = tier6n50deg6pert3,
                  teststat="quad", distribution = approximate(B = 10000),
                  alternative="two.sided", ytrafo=fligner_trafo)
ltm
## 
##  Approximative General Independence Test
## 
## data:  Centrality by Measure (BW, CL, FB, FT, SB)
## chi-squared = 15330, p-value < 2.2e-16
class(ltm)
## [1] "QuadTypeIndependenceTest"
## attr(,"package")
## [1] "coin"
class(tier6n50deg6pert3$Measure)
## [1] "factor"
pvalue(ltm, method="step-down")
## [1] 0
## 99 percent confidence interval:
##  0.0000000000 0.0005296914
pvalue(ltm, method="single-step")
## [1] 0
## 99 percent confidence interval:
##  0.0000000000 0.0005296914
pvalue(ltm, method="discrete")
## [1] 0
## 99 percent confidence interval:
##  0.0000000000 0.0005296914
# Plot the FKscores
tier6n50deg6pert3$FK <- fligner_trafo(tier6n50deg6pert3$Centrality)
ggplot(tier6n50deg6pert3,aes(x=Sample, y=FK, colour=Measure)) + geom_line() + facet_grid(CentLevel~.,scales="free")