source("http://www.math.jmu.edu/~garrenst/math300.dir/Rmacros") # Math 324 ci.binom.test <- function( x, p = 0.5, conf.level = 0.95 ) { # Produces an exact confidence interval on the 100*p_th percentile, # based on the binomial test, where ties are excluded. # `x' is the vector of observations. # `p' is the hypothesized probability of success. # `conf.level' is the confidence level for the returned confidence interval. delta <- (max(x)-min(x))/1e10 xgrid <- c(x, x+delta, x-delta) value.in.c.i. <- rep(NA, length(xgrid)) for (iii in 1:length(xgrid)) { x1 <- c( sum( xxgrid[iii] ) ); n <- sum(x1) value.in.c.i.[iii] <- binom.test(x1, n, p, alternative = "two.sided", conf.level)$p.value>=1-conf.level } c.i. <- c( min( xgrid[value.in.c.i.] ), max( xgrid[value.in.c.i.] ) ) c.i. } # Garren, 2005 # Math 324 power.binom.test <- function(n, alpha=0.05, alternative=c("two.sided", "less", "greater"), null.median, alt.pdist, ...) { # Computes the power of the binomial test (i.e., sign test). # This power function is based on binomial probabilities, # not the normal approximation to the binomial distribution, # and produces exact solutions. # n is the sample size. # alpha is the size of the test; i.e., P(type I error). # `alternative' is a character string specifying the alternative hypothesis, and # must be one of `"two.sided"' (default), `"greater"' or `"less"'. # Only the initial letter needs to be specified. # null.median is the population median under the null hypothesis. # `alt.pdist' is the cdf under the alternative distribution, and some choices are # pnorm, pexp, pcauchy, plaplace, pt, pchisq, pf, punif, pbinom, pgeo, ppois. # ...: optional arguments to `alt.pdist', EXCLUDING the first argument. alternative <- abbreviation(alternative, c("two.sided", "less", "greater")) if (alternative=="two.sided") { q <- qbinom(alpha/2, n, 0.5) ; q <- q - (pbinom(q,n,0.5)>alpha/2) prob <- pbinom( q, n, 1-match.fun(alt.pdist)(null.median, ...) ) + pbinom( q, n, match.fun(alt.pdist)(null.median, ...) ) } if (alternative=="less") { q <- qbinom(alpha, n, 0.5) ; q <- q - (pbinom(q,n,0.5)>alpha) prob <- pbinom( q, n, 1-match.fun(alt.pdist)(null.median, ...) ) } if (alternative=="greater") { q <- qbinom(alpha, n, 0.5) ; q <- q - (pbinom(q,n,0.5)>alpha) prob <- pbinom( q, n, match.fun(alt.pdist)(null.median, ...) ) } prob } # Garren, 2005 # Math 324 perm.test <- function(x, y=NULL, alternative=c("two.sided", "less", "greater"), mu=0, paired=FALSE, all.perms=TRUE, num.sim=1e4, plot=FALSE, stat=mean, ...) { # Performs one-sample and two-sample permutation tests on vectors of data. # The one-sample permutation test is based on stat(x-mu); # the two-sample paired permutation test is based on stat(x-y-mu); # and the two-sample unpaired permutation test is based on stat(x-mu)-stat(y). # `x' is a numeric vector of data values. # `y' is an optional numeric vector data values, and is used for two-sample tests only. # `alternative' is a character string specifying the alternative hypothesis, and # must be one of `"two.sided"' (default), `"greater"' or `"less"'. # Only the initial letter needs to be specified. # `mu' is a number indicating the null value of the location parameter # (or the difference in location parameters if performing a two-sample test). # `paired' indicates whether or not a two-sample test should be paired, # and is ignored for a one-sample test. # The exact p-value is attempted when `all.perms' (i.e., all permutations) is TRUE, # and is simulated when `all.perms' is FALSE or when # computing an exact p-value requires more than num.sim calculations. # `num.sim' is the upper limit on the number of permutations generated. # If `plot' is TRUE, then plot the histogram of the permutation distribution; # otherwise, list the p-value. # Setting `plot' to TRUE works best when the permutation distribution # is close to continuous. # Some reasonable choices for `stat' are mean and median. # ... consists of optional arguments to `stat'; # and is the second argument to `stat' when unspecified. # For example, if `stat' equals `mean', then the second argument # `trim' denotes the fraction (0 to 0.5) of observations to be trimmed # from each end of `x' and `y' before the mean is computed. alternative <- abbreviation(alternative, c("two.sided", "less", "greater")) if (!is.null(y) & !paired) { # Perform unpaired two-sample permutation test. max.loop <- 10; l1 <- min(length(x), length(y)); x <- x-mu l2 <- max(length(x), length(y)); l3 <- l1+l2 test.stat0 <- match.fun(stat)(x, ...)-match.fun(stat)(y, ...); test.stat <- NULL if (choose(l3,l1)>num.sim) all.perms <- FALSE output1 <- "Unpaired two-sample permutation test was performed." if (all.perms & l1<=max.loop) { output2 <- "Exact p-value was calculated." index <- rep(NA, max(l3, max.loop)) for (i1 in 1:(l2+1)) { for (i2 in (i1+1):max( i1+1, (l2+2)*(l1>=2) )) { for (i3 in (i2+1):max( i2+1, (l2+3)*(l1>=3) )) { for (i4 in (i3+1):max( i3+1, (l2+4)*(l1>=4) )) { for (i5 in (i4+1):max( i4+1, (l2+5)*(l1>=5) )) { for (i6 in (i5+1):max( i5+1, (l2+6)*(l1>=6) )) { for (i7 in (i6+1):max( i6+1, (l2+7)*(l1>=7) )) { for (i8 in (i7+1):max( i7+1, (l2+8)*(l1>=8) )) { for (i9 in (i8+1):max( i8+1, (l2+9)*(l1>=9) )) { for (i10 in (i9+1):max( i9+1, (l2+10)*(l1>=10) )) { index <- c(i1,i2,i3,i4,i5,i6,i7,i8,i9,i10) index <- c( index[1:l1], setdiff(c(1:l3), index[1:l1]) ) test.stat <- c(test.stat, match.fun(stat)(c(x,y)[index[1:l1]], ...) -match.fun(stat)(c(x,y)[index[(l1+1):l3]], ...) ) }}}}}}}}}} # Number of parentheses is max.loop } # End of if else { # Use simulations. output2 <- paste("p-value was estimated based on", num.sim, "simulations.") for (i in 1:num.sim) { index <- sample(l3) test.stat <- c(test.stat, match.fun(stat)(c(x,y)[index[1:l1]], ...) -match.fun(stat)(c(x,y)[index[(l1+1):l3]], ...) ) } } if (length(x)>length(y)) test.stat <- -test.stat if (!plot) { if (alternative=="less") p.value <- mean( test.stat <= test.stat0 ) if (alternative=="greater") p.value <- mean( test.stat >= test.stat0 ) if (alternative=="two.sided") p.value <- mean( abs(test.stat) >= abs(test.stat0) ) structure(list(output1, output2, alternative=alternative, mu=mu, p.value=p.value)) } else {hist(test.stat, main="Histogram of Permutation Distribution", xlab="Value of test statistic", ylab="Frequency")} } # END of `if (!is.null(y) & !paired)' else { # Perform a one-sample or a paired two-sample permutation test on mu. max.loop <- 20 if (is.null(y)) { output1 <- "One-sample permutation test was performed." } else { x <- x-y; output1 <- "Paired permutation test was performed." } x <- x-mu; lx <- length(x); test.stat0 <- match.fun(stat)(x,...); test.stat <- NULL if (lx>max.loop || 2^lx>num.sim) all.perms <- FALSE if (all.perms) { output2 <- "p-value was calculated based on all permutations." for (i1 in 0:ifelse(lx<1,0,1)) { for (i2 in 0:ifelse(lx<2,0,1)) { for (i3 in 0:ifelse(lx<3,0,1)) { for (i4 in 0:ifelse(lx<4,0,1)) { for (i5 in 0:ifelse(lx<5,0,1)) { for (i6 in 0:ifelse(lx<6,0,1)) { for (i7 in 0:ifelse(lx<7,0,1)) { for (i8 in 0:ifelse(lx<8,0,1)) { for (i9 in 0:ifelse(lx<9,0,1)) { for (i10 in 0:ifelse(lx<10,0,1)) { for (i11 in 0:ifelse(lx<11,0,1)) { for (i12 in 0:ifelse(lx<12,0,1)) { for (i13 in 0:ifelse(lx<13,0,1)) { for (i14 in 0:ifelse(lx<14,0,1)) { for (i15 in 0:ifelse(lx<15,0,1)) { for (i16 in 0:ifelse(lx<16,0,1)) { for (i17 in 0:ifelse(lx<17,0,1)) { for (i18 in 0:ifelse(lx<18,0,1)) { for (i19 in 0:ifelse(lx<19,0,1)) { for (i20 in 0:ifelse(lx<20,0,1)) { test.stat <- c(test.stat, match.fun(mean)(x*(2*(c(i1,i2,i3,i4,i5,i6,i7,i8,i9,i10, i11,i12,i13,i14,i15,i16,i17,i18,19,i20)[1:lx])-1),...)) }}}}}}}}}}}}}}}}}}}} # Number of parentheses is max.loop } # End of if else { # Use simulations. output2 <- paste("p-value was estimated based on", num.sim, "simulations.") for (i in 1:num.sim) test.stat <- c(test.stat, match.fun(stat)((x*sample(c(-1,1),lx,T)),...)) } if (!plot) { if (alternative=="less") { p.value <- mean( test.stat <= test.stat0 ) } if (alternative=="greater") { p.value <- mean( test.stat >= test.stat0 ) } if (alternative=="two.sided") { p.value <- mean( abs(test.stat) >= abs(test.stat0) ) } structure(list(output1, output2, alternative=alternative, mu=mu, p.value=p.value))} else {hist(test.stat, main="Histogram of Permutation Distribution", xlab="Value of test statistic", ylab="Frequency")} } # End of `else' } # End of perm.test Garren, 2005 # Math 324 score <- function(x, y=NULL, expon=FALSE) { # Generates van der Waerden scores (i.e., normal quantiles) or exponential # (similar to Savage) scores, for combined data `x' and `y'. # `x' is a positive integer equal to the number of desired scores when y is NULL, # or `x' is a vector of observations. # `y' is an optional vector of observations, typically used with two-sample tests. # When `expon' is FALSE, van der Waerden scores are computed, even for ties. # When `expon' is TRUE, Exponential scores are computed, # and interpolation is used for ties. # Output: scored values for `x', when y is NULL. # Output: `$x' are the scored values for `x', when y is not NULL. # Output: `$y' are the scored values for `y', when y is not NULL. if (is.null(y) & length(x)==1) { if (floor(x)==ceiling(x) & x>=1) {x <- 1:x} } n <- length(c(x,y)) ; if (!expon) {z <- qnorm(rank(c(x,y))/(n+1))} if (expon) { u <- cumsum( 1/(n+1-1:n) ) ; z <- u[ rank(c(x,y)) ] for (i in 1:n) { rank0 <- rank(c(x,y))[i] ; remainder <- rank0 - floor(rank0) if (remainder>0) {z[i] <- u[floor(rank0)]*(1-remainder)+u[ceiling(rank0)]*remainder}}} if (is.null(y)) { return(z) } if (!is.null(y)) { return(structure(list(x=z[1:length(x)], y=z[(length(x)+1):n]))) } } # Garren, 2005 # Math 324 siegel.test <- function(x, y, alternative=c("two.sided", "less", "greater"), reverse=FALSE, all.perms=TRUE, num.sim=1e4) { # Performs the Siegel-Tukey test, where ties are handled by averaging ranks, # not by asymptotic approximations. # `x' and `y' are numeric vectors of data values. # `alternative' is a character string specifying the alternative hypothesis, and # must be one of `"two.sided"' (default), `"greater"' or `"less"'. # Only the initial letter needs to be specified. # If `reverse' is FALSE, then assign rank 1 to the smallest observation. # If `reverse' is TRUE, then assign rank 1 to the largest observation. # The exact p-value is attempted when `all.perms' (i.e., all permutations) is TRUE, # and is simulated when `all.perms' is FALSE or when # computing an exact p-value requires more than num.sim calculations. # `num.sim' is the upper limit on the number of permutations generated. alternative <- abbreviation(alternative, c("two.sided", "less", "greater")) z <- sort(c(x,y)) ; lz <- length(z) lower <- 1; upper <- lz; j <- rep(NA, lz) for (i in 1:lz) { if (i-4*as.integer(i/4)<=1) { j[lower] <- i ; lower <- lower+1 } else {j[upper] <- i; upper <- upper-1} } if (reverse) { j <- rev(j) } z0 <- z[order(j)] rank.x.mat <- matrix(NA, length(x), lz) rank.y.mat <- matrix(NA, length(y), lz) for (k in 1:lz) { for (i in 1:length(x)) { if (x[i]==z0[k]) {rank.x.mat[i,k] <- k} } for (i in 1:length(y)) { if (y[i]==z0[k]) {rank.y.mat[i,k] <- k} } } rank.x <- apply(rank.x.mat,1,mean,na.rm=TRUE) rank.y <- apply(rank.y.mat,1,mean,na.rm=TRUE) if (alternative=="two.sided") {p.value <- perm.test(rank.x,rank.y,alternative="two.sided",all.perms=all.perms, num.sim=num.sim,stat=mean)$p.value} if (alternative=="less") {p.value <- perm.test(rank.x,rank.y,alternative="greater",all.perms=all.perms, num.sim=num.sim,stat=mean)$p.value} if (alternative=="greater") {p.value <- perm.test(rank.x,rank.y,alternative="less",all.perms=all.perms, num.sim=num.sim,stat=mean)$p.value} structure(list(" Siegel-Tukey test", alternative=alternative, rank.x=rank.x, rank.y=rank.y, p.value=p.value)) } # Garren, 2005 # Math 324 rmd.test <- function(x, y, alternative=c("two.sided", "less", "greater"), all.perms=TRUE, num.sim=1e4) { # A permutation test is performed based on the estimated rmd, # the ratio of the absolute mean deviances. # `x' and `y' are numeric vectors of data values. # `alternative' is a character string specifying the alternative hypothesis, and # must be one of `"two.sided"' (default), `"greater"' or `"less"'. # Only the initial letter needs to be specified. # The exact p-value is attempted when `all.perms' (i.e., all permutations) is TRUE, # and is simulated when `all.perms' is FALSE or when # computing an exact p-value requires more than num.sim calculations. # `num.sim' is the upper limit on the number of permutations generated. max.loop <- 10; l1 <- min(length(x), length(y)) ; tol <- 1e-10 alternative <- abbreviation(alternative, c("two.sided", "less", "greater")) l2 <- max(length(x), length(y)); l3 <- l1+l2 x <- x-median(x) ; y <- y-median(y) rmd <- function(x, y, alternative) { test <- mean(abs(x)) / mean(abs(y)) if (alternative=="two.sided") {test <- max(test, 1/test)} return( test ) } test.stat0 <- rmd(x, y, alternative) ; test.stat <- NULL if (choose(l3,l1)>num.sim) all.perms <- FALSE if (all.perms & l1<=max.loop) { output <- "Exact p-value was calculated." index <- rep(NA, max(l3, max.loop)) for (i1 in 1:(l2+1)) { for (i2 in (i1+1):max( i1+1, (l2+2)*(l1>=2) )) { for (i3 in (i2+1):max( i2+1, (l2+3)*(l1>=3) )) { for (i4 in (i3+1):max( i3+1, (l2+4)*(l1>=4) )) { for (i5 in (i4+1):max( i4+1, (l2+5)*(l1>=5) )) { for (i6 in (i5+1):max( i5+1, (l2+6)*(l1>=6) )) { for (i7 in (i6+1):max( i6+1, (l2+7)*(l1>=7) )) { for (i8 in (i7+1):max( i7+1, (l2+8)*(l1>=8) )) { for (i9 in (i8+1):max( i8+1, (l2+9)*(l1>=9) )) { for (i10 in (i9+1):max( i9+1, (l2+10)*(l1>=10) )) { index <- c(i1,i2,i3,i4,i5,i6,i7,i8,i9,i10) index <- c( index[1:l1], setdiff(c(1:l3), index[1:l1]) ) test.stat <- c(test.stat, rmd( c(x,y)[index[1:l1]], c(x,y)[index[(l1+1):l3]], alternative ) ) }}}}}}}}}} # Number of parentheses is max.loop } # End of if else { # Use simulations. output <- paste("p-value was estimated based on", num.sim, "simulations.") for (i in 1:num.sim) { index <- sample(l3) test.stat <- c(test.stat, rmd( c(x,y)[index[1:l1]], c(x,y)[index[(l1+1):l3]], alternative ) ) } } if (length(x)>length(y) & alternative!="two.sided") { test.stat <- 1 / test.stat } if (alternative=="less") { p.value <- mean( test.stat - test.stat0 <= tol ) } else { p.value <- mean( test.stat - test.stat0 >= -tol) } structure(list(output, alternative=alternative, rmd.hat=test.stat0, p.value=p.value)) } # End of rmd.test Garren, 2005 # Math 324 perm.f.test <- function(response, treatment=NULL, num.sim=1e4) { # A permutation F-test is performed, and a one-way analysis of variance F-test is performed. # Only approximated p-values are calculated for the permutation F-test. # `treatment' is the vector of treatments, and `response' is the vector of responses. # Alternatively, if `response' is NULL, then `treatment' must be an N by 2 matrix, # such that the first column represents treatment and the second column represents response. # `num.sim' is the number of permutations generated. # If `num.sim' is smaller than one, then the permutation test is not performed. # otherwise, list the p-value. if (is.null(treatment)) {treatment <- response[,2] ; response <- as.numeric(response[,1])} if (length(response)!=length(treatment)) stop("`response' and `treatment' must have the same length.") treatment <- factor(treatment) ; list1 <- c("One-way ANOVA", summary(aov(response~treatment))) if (num.sim>=1) { all.treatments <- union(treatment, NULL); k <- length(all.treatments); ni <- NULL for (i in 1:k) { ni <- c(ni, sum(treatment==all.treatments[i])) } one.way.anova <- function(treatment, response, all.treatments, ni, k) { # Returns $\sum_{i=1}^k n_i{\bar Y}_i^2$, which is monotone increasing with # the F-test statistic associated with one-way ANOVA; see Higgins, 2004, p. 84. yi.mean <- NULL for (i in 1:k) {yi.mean <- c(yi.mean, mean(response[treatment==all.treatments[i]])) } sum(ni*yi.mean^2) } test.stat0 <- one.way.anova(treatment, response, all.treatments, ni, k) ; test.stat <- NULL for (i in 1:num.sim) { test.stat <- c(test.stat, one.way.anova(treatment, sample(response), all.treatments, ni, k)) } list2 <- c( "The p-value from the permutation F-test based on", paste( num.sim, "simulations is", mean(test.stat>=test.stat0) ) ) list1 <- c(list1, list2) } return(list1) } # Garren, 2005 # Math 324 perm.cor.test <- function(x, y=NULL, alternative=c("two.sided", "less", "greater"), method=c("pearson", "spearman"), num.sim=1e4 ) { # A permutation correlation test is performed. # P-values are approximated based on randomly sampling the permutations. # `x' and `y' are the paired vectors of observations. # Alternatively, if `y' is NULL, then `x' must be an N by 2 matrix, # `alternative' is a character string specifying the alternative hypothesis, and # must be one of `"two.sided"' (default), `"greater"' or `"less"'. # Only the initial letter needs to be specified. # `method' is a character string specifying the type of correlation, and # must be one of `"pearson"' (default) or `"spearman"'. # Only the initial letter needs to be specified. # Choices for `method' are "pearson", "spearman", "kendall". # `num.sim' is the number of permutations generated. alternative <- abbreviation(alternative, c("two.sided", "less", "greater")) method <- abbreviation(method, c("pearson", "spearman")) if (is.null(y)) {y <- x[,2] ; x <- as.numeric(x[,1])} if (length(x)!=length(y)) stop("`x' and `y' must have the same length.") if (method=="spearman") {x <- rank(x); y <- rank(y)} if (method %in% c("pearson", "spearman")) { test.stat0 <- cor(x,y); test.stat <- NULL for (i in 1:num.sim) {test.stat <- c(test.stat, cor(x, sample(y)))} if (alternative=="two.sided") p.value <- mean(abs(test.stat)>=abs(test.stat0)) if (alternative=="less") p.value <- mean(test.stat<=test.stat0) if (alternative=="greater") p.value <- mean(test.stat>=test.stat0) } output1 <- paste("Permutation correlation test. Method is", method) output2 <- paste("p-value was estimated based on", num.sim, "simulations.") structure(list(output1, output2, alternative=alternative, p.value=p.value)) } # Garren, 2005 # Math 324 and 325 scan2 <- function(file.name, matrix.format=FALSE, character.rm=FALSE, disk.drive=NULL, course.num=324) { # Scans in a matrix or vector. # `file.name' is the name of the data set to be scanned. # If `file.name' is the complete path, then `disk.drive' is ignored. # Path is assumed to be complete if `file.name' does not begin with # `example', `exercise', `problem' or `table'. # If `matrix.format' is TRUE, then data are attempted to be saved as a rectangular matrix. # If `matrix.format' is FALSE, then data are saved as a vector. # If `character.rm' is TRUE, then remove all character (i.e., non-numeric) data. # `disk.drive' is the disk drive where the data are located, is NULL for web-based data, # and is "linux" for a disk drive on a linux-based computer. # Some choices for `disk.drive' are NULL, "linux", "a", "b", "c", "d", "e", .... # `course.num' is the course number and may be numeric or character. course.num <- as.numeric(course.num) if (is.null(disk.drive)) { full.name <- paste("http://www.math.jmu.edu/~garrenst/math", course.num, ".dir/datasets/", file.name, sep="") } else { if (disk.drive=="linux" & course.num==325) { full.name <- paste("/mnt/cdrom/SAS/DATASETS/", file.name, sep="") } if (disk.drive!="linux" & course.num==325) { full.name <- paste(disk.drive, ":/sas/datasets/", file.name, sep="") } } if ( is.na(charmatch("example", file.name)) & is.na(charmatch("exercise", file.name)) & is.na(charmatch("problem", file.name)) & is.na(charmatch("table", file.name)) ) full.name <- file.name data <- scan(full.name, what=character(), quiet=TRUE, comment.char="#") a.character <- FALSE ; options(warn = -1) if (course.num==324) { for (i in 1:length(data)) { if ( is.na(as.numeric(data[i])) ) {a.character <- TRUE} } } if (course.num==325) { for (i in 1:length(data)) { if ( is.na(as.numeric(data[i])) & data[i]!="." ) {a.character <- TRUE} if ( data[i]=="." ) data[i] <- NA } } # Missing data. options(warn = 0) if (a.character & character.rm) { options(warn = -1) ; data1 <- NULL for (i in 1:length(data)) { if ( !is.na(as.numeric(data[i])) || is.na(data[i]) ) data1 <- c(data1, data[i]) } options(warn = 0) ; data <- data1 } if (!a.character || character.rm) data <- as.numeric(data) if (matrix.format) { ncol <- 0 ; line.num <- 0 while (ncol==0 & line.num<=1000) { line.num <- line.num+1 first.line <- scan(full.name, what=character(), nlines=line.num, quiet=TRUE, comment.char="#") if (!a.character || !character.rm) {ncol <- length(first.line)} else { options(warn = -1) for (i in 1:length(first.line)) { if ( !is.na(as.numeric(first.line[i])) || is.na(first.line[i]) ) {ncol <- ncol+1} } options(warn = 0) } } if (ncol!=0) data <- matrix( data, ncol=ncol, byrow=TRUE ) } return(data) } # Garren, 2005