# R functions for Math 324, updated Spring 2016. source("http://educ.jmu.edu/~garrenst/math100.dir/Rmacros") # Math 324 scan2 = function(file.name, course.num=324, na.strings=".", ...) { # Reads in a table. # 'file.name' is the name of the data set to be scanned. # 'course.num' is the course number and may be numeric or character. # 'na.strings' is a vector of strings which are to be interpreted as `NA' values. # Blank fields are also considered to be missing values. # '...' are the optional arguments used in `read.table'. full.name <- paste("http://educ.jmu.edu/~garrenst/math", as.numeric(course.num), ".dir/datasets/", file.name, sep="") return(scan(full.name, na.strings=na.strings, comment.char="#", ...)) } # Garren16 # Math 324 read.table2 = function(file.name, course.num=324, na.strings=".", ...) { # Reads in a table. # 'file.name' is the name of the data set to be scanned. # 'course.num' is the course number and may be numeric or character. # 'na.strings' is a vector of strings which are to be interpreted as `NA' values. # Blank fields are also considered to be missing values. # '...' are the optional arguments used in `read.table'. full.name <- paste("http://educ.jmu.edu/~garrenst/math", as.numeric(course.num), ".dir/datasets/", file.name, sep="") return(read.table(full.name, na.strings=na.strings, ...)) } # Garren16 # Math 324 quantile.ci <- function( x, probs = 0.5, conf.level = 0.95 ) { # Produces exact confidence intervals on quantiles corresponding to the given probabilities, # based on the binomial test. # `x' is the numeric vector of observations. # `probs' is the numeric vector of cumulative probabilities. # `conf.level' is the confidence level for the returned confidence interval. c.i. = NULL for ( k in 1:length(probs) ) { delta <- (max(x)-min(x))/1e10 xgrid <- c(x, x+delta, x-delta) value.in.c.i. <- rep(NA, length(xgrid)) for (iii in 1:length(xgrid)) { x1 <- c( sum( xxgrid[iii] ) ); n <- sum(x1) value.in.c.i.[iii] <- binom.test(x1, n, probs[k], alternative = "two.sided", conf.level)$p.value>=1-conf.level } c.i. <- rbind( c.i., c( min( xgrid[value.in.c.i.] ), max( xgrid[value.in.c.i.] ) ) ) } colnames( c.i. ) = c("lower","upper") ; rownames( c.i. ) = probs return( c.i. ) } # Garren15 # Math 324 power.binom.test <- function(n, alpha=0.05, alternative=c("two.sided", "less", "greater"), null.median, alt.pdist, ...) { # Computes the power of the binomial test (i.e., sign test). # This power function is based on binomial probabilities, # not the normal approximation to the binomial distribution, # and produces exact solutions. # n is the sample size. # alpha is the size of the test; i.e., P(type I error). # `alternative' is a character string specifying the alternative hypothesis, and # must be one of `"two.sided"' (default), `"greater"' or `"less"'. # Only the initial letter needs to be specified. # null.median is the population median under the null hypothesis. # `alt.pdist' is the cdf under the alternative distribution, and some choices are # pnorm, pexp, pcauchy, plaplace, pt, pchisq, pf, punif, pbinom, pgeo, ppois. # ...: optional arguments to `alt.pdist', EXCLUDING the first argument. alternative <- abbreviation(alternative, c("two.sided", "less", "greater")) if (alternative=="two.sided") { q <- qbinom(alpha/2, n, 0.5) ; q <- q - (pbinom(q,n,0.5)>alpha/2) prob <- pbinom( q, n, 1-match.fun(alt.pdist)(null.median, ...) ) + pbinom( q, n, match.fun(alt.pdist)(null.median, ...) ) } if (alternative=="less") { q <- qbinom(alpha, n, 0.5) ; q <- q - (pbinom(q,n,0.5)>alpha) prob <- pbinom( q, n, 1-match.fun(alt.pdist)(null.median, ...) ) } if (alternative=="greater") { q <- qbinom(alpha, n, 0.5) ; q <- q - (pbinom(q,n,0.5)>alpha) prob <- pbinom( q, n, match.fun(alt.pdist)(null.median, ...) ) } prob } # Garren15 # Math 324 perm.test <- function(x, y=NULL, alternative=c("two.sided", "less", "greater"), mu=0, paired=FALSE, all.perms=TRUE, num.sim=2e4, plot=FALSE, stat=mean, ...) { # If this version does not work on Rweb, then use "perm.test.old" instead. # Performs one-sample and two-sample permutation tests on vectors of data. # The one-sample permutation test is based on stat(x-mu); # the two-sample paired permutation test is based on stat(x-y-mu); # and the two-sample unpaired permutation test is based on stat(x-mu)-stat(y). # `x' is a numeric vector of data values. # `y' is an optional numeric vector data values, and is used for two-sample tests only. # `alternative' is a character string specifying the alternative hypothesis, and # must be one of `"two.sided"' (default), `"greater"' or `"less"'. # Only the initial letter needs to be specified. # `mu' is a number indicating the null value of the location parameter # (or the difference in location parameters if performing a two-sample test). # `paired' indicates whether or not a two-sample test should be paired, # and is ignored for a one-sample test. # The exact p-value is attempted when `all.perms' (i.e., all permutations) is TRUE, # and is simulated when `all.perms' is FALSE or when # computing an exact p-value requires more than num.sim calculations. # `num.sim' is the upper limit on the number of permutations generated. # If `plot' is TRUE, then plot the histogram of the permutation distribution; # otherwise, list the p-value. # Setting `plot' to TRUE works best when the permutation distribution # is close to continuous. # Some reasonable choices for `stat' are mean and median. # ... consists of optional arguments to `stat'; # and is the second argument to `stat' when unspecified. # For example, if `stat' equals `mean', then the second argument # `trim' denotes the fraction (0 to 0.5) of observations to be trimmed # from each end of `x' and `y' before the mean is computed. alternative <- abbreviation(alternative, c("two.sided", "less", "greater")) if (!is.null(y) & !paired) { # Perform unpaired two-sample permutation test. l1 <- min(length(x), length(y)); x <- x-mu l2 <- max(length(x), length(y)); l3 <- l1+l2 test.stat0 <- match.fun(stat)(x, ...)-match.fun(stat)(y, ...); test.stat <- NULL if (choose(l3,l1)>num.sim) all.perms <- FALSE output1 <- "Unpaired two-sample permutation test was performed." if (all.perms) { output2 <- "Exact p-value was calculated." combos <- combn( 1:l3, l1 ) for (i in 1:dim(combos)[2]) { index <- c( combos[,i], setdiff( c(1:l3), combos[,i] ) ) test.stat <- c(test.stat, match.fun(stat)(c(x,y)[index[1:l1]], ...) -match.fun(stat)(c(x,y)[index[(l1+1):l3]], ...) ) } } # End of if else { # Use simulations. output2 <- paste("p-value was estimated based on", num.sim, "simulations.") for (i in 1:num.sim) { index <- sample(l3) test.stat <- c(test.stat, match.fun(stat)(c(x,y)[index[1:l1]], ...) -match.fun(stat)(c(x,y)[index[(l1+1):l3]], ...) ) } } if (length(x)>length(y)) test.stat <- -test.stat if (!plot) { if (alternative=="less") p.value <- mean( test.stat <= test.stat0 ) if (alternative=="greater") p.value <- mean( test.stat >= test.stat0 ) if (alternative=="two.sided") p.value <- mean( abs(test.stat) >= abs(test.stat0) ) structure(list(output1, output2, alternative=alternative, mu=mu, p.value=p.value)) } else {hist(test.stat, main="Histogram of Permutation Distribution", xlab="Value of test statistic", ylab="Frequency")} } # END of `if (!is.null(y) & !paired)' else { # Perform a one-sample or a paired two-sample permutation test on mu. if (is.null(y)) { output1 <- "One-sample permutation test was performed." } else { x <- x-y; output1 <- "Paired permutation test was performed." } x <- x-mu; lx <- length(x); test.stat0 <- match.fun(stat)(x,...); test.stat <- NULL if (2^lx>num.sim) all.perms <- FALSE if (all.perms) { output2 <- "p-value was calculated based on all permutations." plus.minus.matrix <- matrix(1, 1, lx) for (i in lx:1) { temp <- plus.minus.matrix ; temp[,i] <- -1 plus.minus.matrix <- rbind( plus.minus.matrix, temp ) } for (ii in 1:2^lx) { test.stat <- c(test.stat, match.fun(mean)(x*plus.minus.matrix[ii,],...)) } } # End of if else { # Use simulations. output2 <- paste("p-value was estimated based on", num.sim, "simulations.") for (i in 1:num.sim) test.stat <- c(test.stat, match.fun(stat)((x*sample(c(-1,1),lx,T)),...)) } if (!plot) { if (alternative=="less") { p.value <- mean( test.stat <= test.stat0 ) } if (alternative=="greater") { p.value <- mean( test.stat >= test.stat0 ) } if (alternative=="two.sided") { p.value <- mean( abs(test.stat) >= abs(test.stat0) ) } structure(list(output1, output2, alternative=alternative, mu=mu, p.value=p.value))} else {hist(test.stat, main="Histogram of Permutation Distribution", xlab="Value of test statistic", ylab="Frequency")} } # End of `else' } # End of perm.test Garren15 # Math 324 score <- function(x, y=NULL, expon=FALSE) { # Generates van der Waerden scores (i.e., normal quantiles) or exponential # (similar to Savage) scores, for combined data `x' and `y'. # `x' is a positive integer equal to the number of desired scores when y is NULL, # or `x' is a vector of observations. # `y' is an optional vector of observations, typically used with two-sample tests. # When `expon' is FALSE, van der Waerden scores are computed, even for ties. # When `expon' is TRUE, Exponential scores are computed, # and interpolation is used for ties. # Output: scored values for `x', when y is NULL. # Output: `$x' are the scored values for `x', when y is not NULL. # Output: `$y' are the scored values for `y', when y is not NULL. if (is.null(y) & length(x)==1) { if (floor(x)==ceiling(x) & x>=1) {x <- 1:x} } n <- length(c(x,y)) ; if (!expon) {z <- qnorm(rank(c(x,y))/(n+1))} if (expon) { u <- cumsum( 1/(n+1-1:n) ) ; z <- u[ rank(c(x,y)) ] for (i in 1:n) { rank0 <- rank(c(x,y))[i] ; remainder <- rank0 - floor(rank0) if (remainder>0) {z[i] <- u[floor(rank0)]*(1-remainder)+u[ceiling(rank0)]*remainder}}} if (is.null(y)) { return(z) } if (!is.null(y)) { return(structure(list(x=z[1:length(x)], y=z[(length(x)+1):n]))) } } # Garren15 # Math 324 siegel.test <- function(x, y, alternative=c("two.sided", "less", "greater"), reverse=FALSE, all.perms=TRUE, num.sim=2e4) { # Performs the Siegel-Tukey test, where ties are handled by averaging ranks, # not by asymptotic approximations. # `x' and `y' are numeric vectors of data values. # `alternative' is a character string specifying the alternative hypothesis, and # must be one of `"two.sided"' (default), `"greater"' or `"less"'. # Only the initial letter needs to be specified. # If `reverse' is FALSE, then assign rank 1 to the smallest observation. # If `reverse' is TRUE, then assign rank 1 to the largest observation. # The exact p-value is attempted when `all.perms' (i.e., all permutations) is TRUE, # and is simulated when `all.perms' is FALSE or when # computing an exact p-value requires more than num.sim calculations. # `num.sim' is the upper limit on the number of permutations generated. alternative <- abbreviation(alternative, c("two.sided", "less", "greater")) z <- sort(c(x,y)) ; lz <- length(z) lower <- 1; upper <- lz; j <- rep(NA, lz) for (i in 1:lz) { if (i-4*as.integer(i/4)<=1) { j[lower] <- i ; lower <- lower+1 } else {j[upper] <- i; upper <- upper-1} } if (reverse) { j <- rev(j) } z0 <- z[order(j)] rank.x.mat <- matrix(NA, length(x), lz) rank.y.mat <- matrix(NA, length(y), lz) for (k in 1:lz) { for (i in 1:length(x)) { if (x[i]==z0[k]) {rank.x.mat[i,k] <- k} } for (i in 1:length(y)) { if (y[i]==z0[k]) {rank.y.mat[i,k] <- k} } } rank.x <- apply(rank.x.mat,1,mean,na.rm=TRUE) rank.y <- apply(rank.y.mat,1,mean,na.rm=TRUE) if (alternative=="two.sided") {p.value <- perm.test(rank.x,rank.y,alternative="two.sided",all.perms=all.perms, num.sim=num.sim,stat=mean)$p.value} if (alternative=="less") {p.value <- perm.test(rank.x,rank.y,alternative="greater",all.perms=all.perms, num.sim=num.sim,stat=mean)$p.value} if (alternative=="greater") {p.value <- perm.test(rank.x,rank.y,alternative="less",all.perms=all.perms, num.sim=num.sim,stat=mean)$p.value} structure(list(" Siegel-Tukey test", alternative=alternative, rank.x=rank.x, rank.y=rank.y, p.value=p.value)) } # Garren15 # Math 324 rmd.test <- function(x, y, alternative=c("two.sided", "less", "greater"), all.perms=TRUE, num.sim=2e4) { # A permutation test is performed based on the estimated rmd, # the ratio of the absolute mean deviances. # `x' and `y' are numeric vectors of data values. # `alternative' is a character string specifying the alternative hypothesis, and # must be one of `"two.sided"' (default), `"greater"' or `"less"'. # Only the initial letter needs to be specified. # The exact p-value is attempted when `all.perms' (i.e., all permutations) is TRUE, # and is simulated when `all.perms' is FALSE or when # computing an exact p-value requires more than num.sim calculations. # `num.sim' is the upper limit on the number of permutations generated. max.loop <- 10; l1 <- min(length(x), length(y)) ; tol <- 1e-10 alternative <- abbreviation(alternative, c("two.sided", "less", "greater")) l2 <- max(length(x), length(y)); l3 <- l1+l2 x <- x-median(x) ; y <- y-median(y) rmd <- function(x, y, alternative) { test <- mean(abs(x)) / mean(abs(y)) if (alternative=="two.sided") {test <- max(test, 1/test)} return( test ) } test.stat0 <- rmd(x, y, alternative) ; test.stat <- NULL if (choose(l3,l1)>num.sim) all.perms <- FALSE if (all.perms & l1<=max.loop) { output <- "Exact p-value was calculated." index <- rep(NA, max(l3, max.loop)) for (i1 in 1:(l2+1)) { for (i2 in (i1+1):max( i1+1, (l2+2)*(l1>=2) )) { for (i3 in (i2+1):max( i2+1, (l2+3)*(l1>=3) )) { for (i4 in (i3+1):max( i3+1, (l2+4)*(l1>=4) )) { for (i5 in (i4+1):max( i4+1, (l2+5)*(l1>=5) )) { for (i6 in (i5+1):max( i5+1, (l2+6)*(l1>=6) )) { for (i7 in (i6+1):max( i6+1, (l2+7)*(l1>=7) )) { for (i8 in (i7+1):max( i7+1, (l2+8)*(l1>=8) )) { for (i9 in (i8+1):max( i8+1, (l2+9)*(l1>=9) )) { for (i10 in (i9+1):max( i9+1, (l2+10)*(l1>=10) )) { index <- c(i1,i2,i3,i4,i5,i6,i7,i8,i9,i10) index <- c( index[1:l1], setdiff(c(1:l3), index[1:l1]) ) test.stat <- c(test.stat, rmd( c(x,y)[index[1:l1]], c(x,y)[index[(l1+1):l3]], alternative ) ) }}}}}}}}}} # Number of parentheses is max.loop } # End of if else { # Use simulations. output <- paste("p-value was estimated based on", num.sim, "simulations.") for (i in 1:num.sim) { index <- sample(l3) test.stat <- c(test.stat, rmd( c(x,y)[index[1:l1]], c(x,y)[index[(l1+1):l3]], alternative ) ) } } if (length(x)>length(y) & alternative!="two.sided") { test.stat <- 1 / test.stat } if (alternative=="less") { p.value <- mean( test.stat - test.stat0 <= tol ) } else { p.value <- mean( test.stat - test.stat0 >= -tol) } structure(list(output, alternative=alternative, rmd.hat=test.stat0, p.value=p.value)) } # End of rmd.test Garren15 # Math 324 perm.f.test <- function(response, treatment=NULL, num.sim=2e4) { # A permutation F-test is performed, and a one-way analysis of variance F-test is performed. # Only approximated p-values are calculated for the permutation F-test. # `treatment' is the vector of treatments, and `response' is the vector of responses. # Alternatively, if `treatment' is NULL, then `response' must be an N by 2 matrix, # such that the first column represents response and the second column represents treatment. # `num.sim' is the number of permutations generated. # If `num.sim' is smaller than one, then the permutation test is not performed; # otherwise, list the p-value. if (is.null(treatment)) {treatment <- response[,2] ; response <- as.numeric(response[,1])} if (length(response)!=length(treatment)) stop("`response' and `treatment' must have the same length.") treatment <- factor(treatment) ; list1 <- c("One-way ANOVA", summary(aov(response~treatment))) if (num.sim>=1) { all.treatments <- union(treatment, NULL); k <- length(all.treatments); ni <- NULL for (i in 1:k) { ni <- c(ni, sum(treatment==all.treatments[i])) } one.way.anova <- function(treatment, response, all.treatments, ni, k) { # Returns $\sum_{i=1}^k n_i{\bar Y}_i^2$, which is monotone increasing with # the F-test statistic associated with one-way ANOVA; see Higgins, 2004, p. 84. yi.mean <- NULL for (i in 1:k) {yi.mean <- c(yi.mean, mean(response[treatment==all.treatments[i]])) } sum(ni*yi.mean^2) } test.stat0 <- one.way.anova(treatment, response, all.treatments, ni, k) ; test.stat <- NULL for (i in 1:num.sim) { test.stat <- c(test.stat, one.way.anova(treatment, sample(response), all.treatments, ni, k)) } list2 <- c( "The p-value from the permutation F-test based on", paste( num.sim, "simulations is", mean(test.stat>=test.stat0) ) ) list1 <- c(list1, list2) } return(list1) } # Garren15 # Math 324 perm.cor.test <- function(x, y=NULL, alternative=c("two.sided", "less", "greater"), method=c("pearson", "spearman"), num.sim=2e4 ) { # A permutation correlation test is performed. # P-values are approximated based on randomly sampling the permutations. # `x' and `y' are the paired vectors of observations. # Alternatively, if `y' is NULL, then `x' must be an N by 2 matrix, # `alternative' is a character string specifying the alternative hypothesis, and # must be one of `"two.sided"' (default), `"greater"' or `"less"'. # Only the initial letter needs to be specified. # `method' is a character string specifying the type of correlation, and # must be one of `"pearson"' (default) or `"spearman"'. # Only the initial letter needs to be specified. # Choices for `method' are "pearson", "spearman", "kendall". # `num.sim' is the number of permutations generated. alternative <- abbreviation(alternative, c("two.sided", "less", "greater")) method <- abbreviation(method, c("pearson", "spearman")) if (is.null(y)) {y <- x[,2] ; x <- as.numeric(x[,1])} if (length(x)!=length(y)) stop("`x' and `y' must have the same length.") if (method=="spearman") {x <- rank(x); y <- rank(y)} if (method %in% c("pearson", "spearman")) { test.stat0 <- cor(x,y); test.stat <- NULL for (i in 1:num.sim) {test.stat <- c(test.stat, cor(x, sample(y)))} if (alternative=="two.sided") p.value <- mean(abs(test.stat)>=abs(test.stat0)) if (alternative=="less") p.value <- mean(test.stat<=test.stat0) if (alternative=="greater") p.value <- mean(test.stat>=test.stat0) } output1 <- paste("Permutation correlation test. Method is", method) output2 <- paste("p-value was estimated based on", num.sim, "simulations.") structure(list(output1, output2, alternative=alternative, p.value=p.value)) } # Garren15 # Consider installing packages "fastGraph" and "jmuOutlier". the.answer <- readline("\nDo you want to install the packages\n'fastGraph' and 'jmuOutlier'? (y or n) ") if ( substring(the.answer,1,1) %in% c("y","Y") ) {cat("\n"); install.packages( c("fastGraph","jmuOutlier"), quiet=TRUE )} if ( suppressWarnings( library( fastGraph, logical.return=TRUE, warn.conflicts=FALSE ) ) ) { library( fastGraph, warn.conflicts=FALSE ) suppressWarnings( try( rm( plotDist, shadeDist, shadePhat, plotLine, getMinMax ), silent=TRUE ) ) } if ( suppressWarnings( library( jmuOutlier, logical.return=TRUE, warn.conflicts=FALSE ) ) ) { library( jmuOutlier, warn.conflicts=FALSE ) suppressWarnings( try( rm( perm.cor.test, perm.f.test, perm.test, rmd.test, siegel.test ), silent=TRUE ) ) suppressWarnings( try( rm( CI.t.test, quantileCI ), silent=TRUE ) ) suppressWarnings( try( rm( fourier, lineGraph, plotCI, plotEcdf, plotVector, truncHist ), silent=TRUE ) ) suppressWarnings( try( rm( dlaplace, plaplace, qlaplace, rlaplace, dtriang, ptriang, qtriang, rtriang ), silent=TRUE ) ) suppressWarnings( try( rm( abbreviation, power.binom.test, score ), silent=TRUE ) ) } if ( !suppressWarnings( library( fastGraph, logical.return=TRUE, warn.conflicts=FALSE ) ) ) cat("\nThe package 'fastGraph' is not installed on your computer.\n") if ( !suppressWarnings( library( jmuOutlier, logical.return=TRUE, warn.conflicts=FALSE ) ) ) cat("\nThe package 'jmuOutlier' is not installed on your computer.\n") cat("\nHelp is available via:\n") if ( suppressWarnings( library( fastGraph, logical.return=TRUE, warn.conflicts=FALSE ) ) ) cat("> ?fastGraph\n") if ( suppressWarnings( library( jmuOutlier, logical.return=TRUE, warn.conflicts=FALSE ) ) ) cat("> ?jmuOutlier\n") cat("> scan2\n> read.table2\n> ls(1)\n> ls(2) # etc.\n\n") suppressWarnings( try( rm( the.answer ), silent=TRUE ) )