Okan OYMAK has not received any gifts yet

- Company:
- Turkish Land Forces Command

- Job Title:
- Director of Research

- Seniority:
- Consultant

- Job Function:
- Data Science, Machine Learning, AI, Business Analytics

- Industry:
- Military / Science

- Short Bio:
- Currently on active duty at HQ Turkish Land Forces Command as an Operations Research Analyst.

Got my MS degree in Operations Analysis at the Naval Postgraduate School, Monterey, CA.

- Interests:
- Finding a new position

o.kruskal.test <- function(data, response, group, alpha=0.05){
  # ---------------------------------------------------------------------------
  # Author: Okan OYMAK
  #         MS in Operations Research at the Naval Postgraduate School,
  #         Monterey, CA, USA
  # Date  : 11 October 2016
  #
  # DESCRIPTION
  #   Performs a Kruskal-Wallis rank sum test and prints the result.
  #   When the null hypothesis is rejected, post-hoc pairwise comparisons are
  #   made automatically using Conover's procedure.
  #
  # ARGUMENTS
  #   data     : A data frame.
  #   response : Column name for the response variable.
  #   group    : Column name, which defines the grouping for each element of
  #              response.
  #   alpha    : Significance level to test the hypothesis. Default is 0.05.
  #
  # VALUE
  #   Called for its printed output; returns NULL invisibly.
  #
  # DETAILS
  #   Rows with a missing response or a missing group are removed before
  #   ranking (a message reports how many). Groups without any remaining
  #   observation (e.g. unused factor levels) are ignored.
  #   An error is raised when fewer than two groups remain or when all
  #   observations are identical (the statistic is undefined in that case).
  #
  # USAGE
  #   o.kruskal.test(data, response, group, alpha)
  #   e.g.
  #   o.kruskal.test(data=OrchardSprays, response="decrease", group="treatment")
  #   or simply
  #   o.kruskal.test(OrchardSprays, "decrease", "treatment")
  #
  # ASSUMPTIONS
  #   1. All samples are random samples from their respective populations.
  #   2. In addition to independence within each sample, there is mutual
  #      independence among the various samples.
  #   3. The measurement scale is at least ordinal.
  #   4. Either the k population distribution functions are identical,
  #      or else some of the populations tend to yield larger values
  #      than other populations do.
  #
  # HYPOTHESES
  #   Ho: All of the k population distribution functions are identical.
  #   Ha: At least one of the populations tends to yield larger observations
  #       than at least one of the other populations.
  #   or
  #   Ha: The k populations do not all have identical means.
  # ---------------------------------------------------------------------------

  # ---- Input validation ----
  if (!is.data.frame(data)) {
    stop("'data' must be a data frame.", call. = FALSE)
  }
  for (column in list(response, group)) {
    if (length(column) != 1L ||
        (is.character(column) && !(column %in% names(data)))) {
      stop("'response' and 'group' must each identify one column of 'data'.",
           call. = FALSE)
    }
  }
  if (!is.numeric(alpha) || length(alpha) != 1L || is.na(alpha) ||
      alpha <= 0 || alpha >= 1) {
    stop("'alpha' must be a single number strictly between 0 and 1.",
         call. = FALSE)
  }

  # ---- Prepare the samples ----
  y <- data[[response]]
  g <- data[[group]]

  # rank() keeps NA as NA, which would poison every rank sum, so incomplete
  # rows are dropped up front.
  keep <- !(is.na(y) | is.na(g))
  if (!all(keep)) {
    message("Removed ", sum(!keep),
            " observation(s) with a missing response or group.")
  }
  y <- y[keep]
  g <- factor(g[keep])           # re-factoring drops levels with no data

  n <- as.vector(table(g))       # Number of observations in each sample
  N <- length(y)                 # Total number of observations
  k <- nlevels(g)                # Number of random samples

  if (k < 2L) {
    stop("At least two groups with observations are required.", call. = FALSE)
  }

  ranks <- rank(y)
  if (all(ranks == ranks[1L])) {
    stop("All observations are identical; the test statistic is undefined.",
         call. = FALSE)
  }

  # Rank sum of each sample, in the order of levels(g).
  R <- as.vector(tapply(ranks, g, sum))

  df <- k - 1
  cr <- qchisq(1 - alpha, df)

  # ---- Test statistic ----
  has.ties <- anyDuplicated(ranks) > 0
  if (has.ties) {
    # General form of the statistic, valid in the presence of ties.
    S2 <- (1/(N-1))*(sum(ranks^2)-(N*(N+1)^2)/4)
    T.stat <- (1/S2)*(sum((R^2)/n)-(N*(N+1)^2)/4)
    headline <- "Ties exist. Test statistic T="
  } else {
    # Without ties the variance of the ranks has a closed form.
    S2 <- N*(N+1)/12
    T.stat <- (1/S2)*sum((R^2)/n)-3*(N+1)
    headline <- "No ties exist. Test statistic T="
  }

  # Upper tail computed directly: 1 - pchisq() underflows to 0 for large T.
  p.value <- pchisq(T.stat, df, lower.tail = FALSE)

  cat(headline, round(T.stat, 4), "\n")
  cat("Critical region of size", alpha, "for", df, "df", "\n")
  cat("corresponds to all values of T greater than", round(cr, 3),"\n", "\n")

  if (p.value < alpha) {
    cat("p.value = ", p.value, "<", "alpha = ", alpha, "\n")
    cat("Therefore, reject the null hypothesis", "\n", "\n")
    cat("INFERENCE: At least one of the populations tend to yield larger observations than at least one of the other populations.", "\n", "\n")

    if (N <= k) {
      # Conover's procedure uses a t quantile with N - k degrees of freedom.
      cat("Multiple comparisons cannot be computed: no residual degrees of freedom (N = k).", "\n")
      return(invisible(NULL))
    }

    cat("Multiple comparisons are as follows", "\n")

    t.stat <- qt(1 - alpha/2, N - k)
    # N - 1 - T is non-negative in theory; max() guards against a tiny
    # negative value caused by rounding when the samples separate perfectly.
    constant <- sqrt((S2 * max(0, N - 1 - T.stat))/(N - k))
    R.over.n <- R/n
    group.names <- levels(g)
    pairs <- combn(k, 2)

    for (j in seq_len(ncol(pairs))) {
      a <- pairs[1, j]
      b <- pairs[2, j]
      test.1 <- abs(R.over.n[a] - R.over.n[b])
      test.2 <- t.stat*constant*sqrt(1/n[a] + 1/n[b])
      cat("Populations:", group.names[a], "-", group.names[b], "\n")
      cat("test.1=", test.1, "test.2=", test.2)
      if (test.1 < test.2) {
        cat(" No Difference", "\n", "\n")
      } else {
        cat(" ***Different***", "\n", "\n")
      }
    }
  } else {
    cat("p.value = ", p.value, ">", "alpha = ", alpha, "\n")
    cat("Therefore, do not reject the null hypothesis", "\n")
    cat("INFERENCE: All of the k population distribution functions are identical.", "\n")
  }

  invisible(NULL)
}

Posted on July 29, 2018 at 3:00am 0 Comments 1 Like

Below is R code for the Cox & Stuart Test for Trend Analysis. Simply copy and paste the code into the R workspace and use it. Unlike cox.stuart.test in the R package "randtests", this version of the test does not return a p-value greater than one. That problem occurs when the test statistic, T, is half of the number of untied pairs, N.

Here is a simple example that reveals the situation:

> x

[1] 1 4 6 7 9 7 1 6

> cox.stuart.test(x)

Cox Stuart…

Continue

Posted on July 24, 2018 at 8:00am 0 Comments 1 Like

Below is R code for the Friedman Test, which also includes post-hoc tests in case the null hypothesis is rejected.

Feel free to use the code after copying and pasting it into R workspace.

friedman.test <- function(data, alpha=0.05) {

#-----------------------------------------------------------------------------

# Author : Okan OYMAK, MS in Operations Research at the NPS Monterey, CA, USA

# Date : 12 March 2015

#

# Data:

# The…

© 2020 Data Science Central ® Powered by

Badges | Report an Issue | Privacy Policy | Terms of Service

**Upcoming DSC Webinar**

- How to Accelerate and Scale Your Data Science Workflows - June 11

Data scientists are faced with requests for information on a regular basis—from our colleagues, our bosses, or our clients. At the beginning of the analytic process, it's rare that requests are made or organized in a way that makes them easy to fulfill. It is our job to systematically unpack these requests and organize them in a way that allows us to take action and build analyses that accurately address the initial request—and to do it quickly, with the least amount of back-and-forth possible. In this latest DSC webinar, learn practical techniques to optimize your workflow, as well as the logic behind this methodology. Register today.

**Most Popular Content on DSC**

To not miss this type of content in the future, subscribe to our newsletter.

- Book: Statistics -- New Foundations, Toolbox, and Machine Learning Recipes
- Book: Classification and Regression In a Weekend - With Python
- Book: Applied Stochastic Processes
- Long-range Correlations in Time Series: Modeling, Testing, Case Study
- How to Automatically Determine the Number of Clusters in your Data
- New Machine Learning Cheat Sheet | Old one
- Confidence Intervals Without Pain - With Resampling
- Advanced Machine Learning with Basic Excel
- New Perspectives on Statistical Distributions and Deep Learning
- Fascinating New Results in the Theory of Randomness
- Fast Combinatorial Feature Selection

**Other popular resources**

- Comprehensive Repository of Data Science and ML Resources
- Statistical Concepts Explained in Simple English
- Machine Learning Concepts Explained in One Picture
- 100 Data Science Interview Questions and Answers
- Cheat Sheets | Curated Articles | Search | Jobs | Courses
- Post a Blog | Forum Questions | Books | Salaries | News

**Archives:** 2008-2014 |
2015-2016 |
2017-2019 |
Book 1 |
Book 2 |
More

**Upcoming DSC Webinar**

- How to Accelerate and Scale Your Data Science Workflows - June 11

Data scientists are faced with requests for information on a regular basis—from our colleagues, our bosses, or our clients. At the beginning of the analytic process, it's rare that requests are made or organized in a way that makes them easy to fulfill. It is our job to systematically unpack these requests and organize them in a way that allows us to take action and build analyses that accurately address the initial request—and to do it quickly, with the least amount of back-and-forth possible. In this latest DSC webinar, learn practical techniques to optimize your workflow, as well as the logic behind this methodology. Register today.

**Most popular articles**

- Free Book and Resources for DSC Members
- New Perspectives on Statistical Distributions and Deep Learning
- Time series, Growth Modeling and Data Science Wizardy
- Statistical Concepts Explained in Simple English
- Machine Learning Concepts Explained in One Picture
- Comprehensive Repository of Data Science and ML Resources
- Advanced Machine Learning with Basic Excel
- Difference between ML, Data Science, AI, Deep Learning, and Statistics
- Selected Business Analytics, Data Science and ML articles
- How to Automatically Determine the Number of Clusters in your Data
- Fascinating New Results in the Theory of Randomness
- Hire a Data Scientist | Search DSC | Find a Job
- Post a Blog | Forum Questions

## Comment Wall

## You need to be a member of Data Science Central to add comments!

Join Data Science Central