Okan OYMAK has not received any gifts yet
o.kruskal.test <- function(data, response, group, alpha=0.05){
# ------------------------------------------------------------------------------------------------------
# Author: Okan OYMAK
# MS in Operations Research at the Naval Postgraduate School, Monterey, CA, USA
# Date : 11 October 2016
# DESCRIPTION
# This function performs a Kruskal-Wallis rank sum test and prints a report.
# When the null hypothesis is rejected, pairwise post-hoc comparisons are
# made automatically using Conover's procedure.
# ARGUMENTS
# data : A data frame.
# response : Column name for the response variable.
# group : Column name, which defines the grouping for each element of response.
# alpha : Significance level to test the hypothesis. Must lie strictly
# between 0 and 1. Default is 0.05
# VALUE
# Invisibly, a list with elements:
# statistic : the test statistic T
# df : degrees of freedom (number of groups - 1)
# p.value : upper-tail chi-squared p-value of T
# critical.value : chi-squared critical value of size alpha
# ties : TRUE when tied observations are present
# reject : TRUE when p.value < alpha
# comparisons : data frame of the pairwise comparisons (group1, group2,
# test.1, test.2, different), or NULL when Ho is not rejected
# DETAILS
# Observations with a missing response or a missing group are dropped, and
# unused factor levels of the grouping variable are ignored.
# An error is raised when fewer than two non-empty groups remain, or when
# all observations are tied (the statistic is then undefined).
# USAGE
# o.kruskal.test(data, response, group, alpha)
# e.g.
# o.kruskal.test(data=OrchardSprays, response="decrease", group="treatment")
# or simply
# o.kruskal.test(OrchardSprays, "decrease", "treatment")
# ASSUMPTIONS
# 1. All samples are random samples from their respective populations.
# 2. In addition to independence within each sample, there is mutual
# independence among the various samples.
# 3. The measurement scale is at least ordinal.
# 4. Either the k population distribution functions are identical,
# or else some of the populations tend to yield larger values
# than other populations do.
# HYPOTHESES
# Ho: All of the k population distribution functions are identical.
# Ha: At least one of the populations tend to yield larger observations
# than at least one of the other populations.
# or
# Ha: The k-populations do not all have identical means.
# ------------------------------------------------------------------------------------------------------
  if (!is.numeric(alpha) || length(alpha) != 1L || is.na(alpha) ||
      alpha <= 0 || alpha >= 1) {
    stop("'alpha' must be a single number strictly between 0 and 1",
         call. = FALSE)
  }

  y <- unlist(data[response], use.names = FALSE)
  g <- unlist(data[group], use.names = FALSE)
  if (length(y) != length(g)) {
    stop("'response' and 'group' must select columns of equal length",
         call. = FALSE)
  }

  # Drop incomplete observations; an NA would otherwise propagate through the
  # ranks and make the final p-value comparison fail.
  keep <- !(is.na(y) | is.na(g))
  y <- y[keep]
  # Drop unused factor levels so that no empty group (n = 0) enters the
  # rank sums or the pairwise comparisons.
  g <- droplevels(as.factor(g[keep]))

  group.names <- levels(g)
  k <- length(group.names)             # Number of random samples
  if (k < 2L) {
    stop("at least two groups with non-missing observations are required",
         call. = FALSE)
  }

  ranks <- rank(y)                     # Mid-ranks over the pooled sample
  ranks.by.group <- split(ranks, g)    # Same order as levels(g)
  n <- vapply(ranks.by.group, length, integer(1))  # Observations per sample
  R <- vapply(ranks.by.group, sum, numeric(1))     # Rank sum per sample
  N <- sum(n)                          # Total number of observations

  df <- k - 1
  cr <- qchisq(1 - alpha, df)

  has.ties <- anyDuplicated(ranks) > 0
  if (has.ties) {
    # Tie-corrected variance of the ranks and the matching statistic.
    S2 <- (1/(N-1))*(sum(ranks^2)-(N*(N+1)^2)/4)
    T.stat <- (1/S2)*(sum((R^2)/n)-(N*(N+1)^2)/4)
    header <- "Ties exist. Test statistic T="
  } else {
    # Without ties the rank variance has the closed form N(N+1)/12.
    S2 <- N*(N+1)/12
    T.stat <- (1/S2)*sum((R^2)/n)-3*(N+1)
    header <- "No ties exist. Test statistic T="
  }
  if (!is.finite(T.stat)) {
    # Happens when every observation is tied (S2 = 0).
    stop("all observations are tied; the test statistic is undefined",
         call. = FALSE)
  }

  # Upper tail computed directly: 1 - pchisq() loses precision for small p.
  p.value <- pchisq(T.stat, df, lower.tail = FALSE)
  reject <- p.value < alpha
  comparisons <- NULL

  cat(header, round(T.stat, 4), "\n")
  cat("Critical region of size", alpha, "for", df, "df", "\n")
  cat("corresponds to all values of T greater than", round(cr, 3),"\n", "\n")

  if (reject) {
    cat("p.value = ", p.value, "<", "alpha = ", alpha, "\n")
    cat("Therefore, reject the null hypothesis", "\n", "\n")
    cat("INFERENCE: At least one of the populations tend to yield larger observations than at least one of the other populations.", "\n", "\n")
    cat("Multiple comparisons are as follows", "\n")

    # Conover's procedure: two samples differ when the absolute difference
    # of their mean ranks (test.1) is not below the critical bound (test.2).
    t.stat <- qt(1 - alpha/2, N - k)
    constant <- sqrt((S2 * (N - 1 - T.stat))/(N - k))
    R.over.n <- unname(R/n)
    one.over.n <- unname(1/n)

    pairs <- combn(seq_len(k), 2)
    first <- pairs[1, ]
    second <- pairs[2, ]
    test.1 <- abs(R.over.n[first] - R.over.n[second])
    test.2 <- t.stat*constant*sqrt(one.over.n[first] + one.over.n[second])
    different <- !(test.1 < test.2)

    for (i in seq_along(first)) {
      cat("Populations:", group.names[first[i]], "-", group.names[second[i]], "\n")
      cat("test.1=", test.1[i], "test.2=", test.2[i])
      if (different[i]) {
        cat(" ***Different***", "\n", "\n")
      } else {
        cat(" No Difference", "\n", "\n")
      }
    }

    comparisons <- data.frame(
      group1 = group.names[first],
      group2 = group.names[second],
      test.1 = test.1,
      test.2 = test.2,
      different = different,
      stringsAsFactors = FALSE
    )
  } else {
    cat("p.value = ", p.value, ">", "alpha = ", alpha, "\n")
    cat("Therefore, do not reject the null hypothesis", "\n")
    cat("INFERENCE: All of the k population distribution functions are identical.", "\n")
  }

  invisible(list(
    statistic = T.stat,
    df = df,
    p.value = p.value,
    critical.value = cr,
    ties = has.ties,
    reject = reject,
    comparisons = comparisons
  ))
}
© 2018 Data Science Central™ Powered by
Badges | Report an Issue | Privacy Policy | Terms of Service