# R functions for Math 324, updated Spring 2016.

source("http://educ.jmu.edu/~garrenst/math100.dir/Rmacros")

# Math 324
scan2 = function(file.name, course.num=324, na.strings=".", ...)  {
  # Reads in a table.
  # 'file.name' is the name of the data set to be scanned.
  # 'course.num' is the course number and may be numeric or character.
  # 'na.strings' is a vector of strings which are to be interpreted as `NA' values.
  # Blank fields are also considered to be missing values.
  # '...' are the optional arguments used in `read.table'.
  full.name <- paste("http://educ.jmu.edu/~garrenst/math", as.numeric(course.num),
                     ".dir/datasets/", file.name, sep="")
  return(scan(full.name, na.strings=na.strings, comment.char="#", ...))
} # Garren16


# Math 324
read.table2 = function(file.name, course.num=324, na.strings=".", ...)  {
  # Reads in a table.
  # 'file.name' is the name of the data set to be scanned.
  # 'course.num' is the course number and may be numeric or character.
  # 'na.strings' is a vector of strings which are to be interpreted as `NA' values.
  # Blank fields are also considered to be missing values.
  # '...' are the optional arguments used in `read.table'.
  full.name <- paste("http://educ.jmu.edu/~garrenst/math", as.numeric(course.num),
                     ".dir/datasets/", file.name, sep="")
  return(read.table(full.name, na.strings=na.strings, ...))
} # Garren16


# Math 324
quantile.ci <- function( x, probs = 0.5, conf.level = 0.95 ) { 
  # Produces exact confidence intervals on quantiles corresponding to the given probabilities,
  #    based on the binomial test. 
  # `x' is the numeric vector of observations.
  # `probs' is the numeric vector of cumulative probabilities.
  # `conf.level' is the confidence level for the returned confidence interval.
  c.i. = NULL
  for ( k in 1:length(probs) )   {
     delta <- (max(x)-min(x))/1e10
     xgrid <- c(x, x+delta, x-delta)
     value.in.c.i. <- rep(NA, length(xgrid))
     for (iii in 1:length(xgrid)) {
        x1 <- c( sum( x<xgrid[iii] ), sum( x>xgrid[iii] ) ); n <- sum(x1)
        value.in.c.i.[iii] <-
           binom.test(x1, n, probs[k], alternative = "two.sided", conf.level)$p.value>=1-conf.level
     }
     c.i. <- rbind( c.i.,  c( min( xgrid[value.in.c.i.] ), max( xgrid[value.in.c.i.] ) ) )
  }
  colnames( c.i. ) = c("lower","upper") ;  rownames( c.i. ) = probs 
  return( c.i. )
} # Garren15


# Math 324
power.binom.test <- function(n, alpha=0.05, alternative=c("two.sided", "less", "greater"), 
                                null.median, alt.pdist, ...) { 
  # Computes the power of the binomial test (i.e., sign test). 
  # This power function is based on binomial probabilities,
  #    not the normal approximation to the binomial distribution,
  #    and produces exact solutions.
  # n is the sample size.
  # alpha is the size of the test; i.e., P(type I error).
  # `alternative' is a character string specifying the alternative hypothesis, and
  #   must be one of `"two.sided"' (default), `"greater"' or `"less"'.  
  #   Only the initial letter needs to be specified.
  # null.median is the population median under the null hypothesis.
  # `alt.pdist' is the cdf under the alternative distribution, and some choices are
  #    pnorm, pexp, pcauchy, plaplace, pt, pchisq, pf, punif, pbinom, pgeo, ppois.
  # ...: optional arguments to `alt.pdist', EXCLUDING the first argument.
  alternative <- abbreviation(alternative, c("two.sided", "less", "greater"))
  if (alternative=="two.sided") {
     q <- qbinom(alpha/2, n, 0.5) ;    q <- q - (pbinom(q,n,0.5)>alpha/2)
     prob <- pbinom( q, n, 1-match.fun(alt.pdist)(null.median, ...) )  +
             pbinom( q, n, match.fun(alt.pdist)(null.median, ...) ) }
  if (alternative=="less") {
     q <- qbinom(alpha, n, 0.5) ;    q <- q - (pbinom(q,n,0.5)>alpha)
     prob <- pbinom( q, n, 1-match.fun(alt.pdist)(null.median, ...) ) }
  if (alternative=="greater") {
     q <- qbinom(alpha, n, 0.5) ;    q <- q - (pbinom(q,n,0.5)>alpha)
     prob <- pbinom( q, n, match.fun(alt.pdist)(null.median, ...) ) }
  prob
} # Garren15


# Math 324
perm.test <- function(x, y=NULL, alternative=c("two.sided", "less", "greater"), mu=0,
                      paired=FALSE, all.perms=TRUE, num.sim=2e4, plot=FALSE, stat=mean, ...) {
  # If this version does not work on Rweb, then use "perm.test.old" instead.
  # Performs one-sample and two-sample permutation tests on vectors of data.
  # The one-sample permutation test is based on stat(x-mu);
  #   the two-sample paired permutation test is based on stat(x-y-mu);
  #   and the two-sample unpaired permutation test is based on stat(x-mu)-stat(y).
  # `x' is a numeric vector of data values.
  # `y' is an optional numeric vector data values, and is used for two-sample tests only.
  # `alternative' is a character string specifying the alternative hypothesis, and
  #   must be one of `"two.sided"' (default), `"greater"' or `"less"'.  
  #   Only the initial letter needs to be specified.
  # `mu' is a number indicating the null value of the location parameter 
  #   (or the difference in location parameters if performing a two-sample test).
  # `paired' indicates whether or not a two-sample test should be paired,
  #   and is ignored for a one-sample test.
  # The exact p-value is attempted when `all.perms' (i.e., all permutations) is TRUE,
  #   and is simulated when `all.perms' is FALSE or when
  #   computing an exact p-value requires more than num.sim calculations.
  # `num.sim' is the upper limit on the number of permutations generated.
  # If `plot' is TRUE, then plot the histogram of the permutation distribution;
  #   otherwise, list the p-value.  
  #   Setting `plot' to TRUE works best when the permutation distribution
  #   is close to continuous.
  # Some reasonable choices for `stat' are mean and median.
  # ... consists of optional arguments to `stat'; 
  #     and is the second argument to `stat' when unspecified.
  #     For example, if `stat' equals `mean', then the second argument
  #       `trim' denotes the fraction (0 to 0.5) of observations to be trimmed 
  #       from each end of `x' and `y' before the mean is computed. 
  alternative <- abbreviation(alternative, c("two.sided", "less", "greater"))
  if (!is.null(y) & !paired)   {  # Perform unpaired two-sample permutation test.
    l1 <- min(length(x), length(y));   x <- x-mu
    l2 <- max(length(x), length(y));    l3 <- l1+l2
    test.stat0 <- match.fun(stat)(x, ...)-match.fun(stat)(y, ...);  test.stat <- NULL
    if (choose(l3,l1)>num.sim) all.perms <- FALSE
    output1 <- "Unpaired two-sample permutation test was performed."
    if (all.perms) {
      output2 <- "Exact p-value was calculated."
      combos <- combn( 1:l3, l1 )
      for (i in 1:dim(combos)[2])   {
        index <- c( combos[,i], setdiff( c(1:l3), combos[,i] ) )
        test.stat <- c(test.stat,
              match.fun(stat)(c(x,y)[index[1:l1]], ...)
             -match.fun(stat)(c(x,y)[index[(l1+1):l3]], ...) )      }
    } # End of if
    else { # Use simulations.
      output2 <- paste("p-value was estimated based on", num.sim, "simulations.")
      for (i in 1:num.sim) {
        index <- sample(l3)
        test.stat <- c(test.stat,
              match.fun(stat)(c(x,y)[index[1:l1]], ...)
             -match.fun(stat)(c(x,y)[index[(l1+1):l3]], ...) )    }      }
    if (length(x)>length(y)) test.stat <- -test.stat
    if (!plot) {
      if (alternative=="less") p.value <- mean( test.stat <= test.stat0 )
      if (alternative=="greater") p.value <- mean( test.stat >= test.stat0 )
      if (alternative=="two.sided") p.value <- mean( abs(test.stat) >= abs(test.stat0) )
      structure(list(output1, output2, alternative=alternative, mu=mu, p.value=p.value))   }
    else {hist(test.stat, main="Histogram of Permutation Distribution",
                xlab="Value of test statistic", ylab="Frequency")}
    }     # END of `if (!is.null(y) & !paired)'
  else { # Perform a one-sample or a paired two-sample permutation test on mu.
    if (is.null(y)) { output1 <- "One-sample permutation test was performed." }
    else { x <- x-y;  output1 <- "Paired permutation test was performed." }
    x <- x-mu;  lx <- length(x); test.stat0 <- match.fun(stat)(x,...); test.stat <- NULL
    if (2^lx>num.sim) all.perms <- FALSE
    if (all.perms) {
      output2 <- "p-value was calculated based on all permutations."
      plus.minus.matrix <- matrix(1, 1, lx)
         for (i in lx:1)  {
            temp <- plus.minus.matrix ;     temp[,i] <- -1
            plus.minus.matrix <- rbind( plus.minus.matrix, temp )       }
      for (ii in 1:2^lx)   {
        test.stat <- c(test.stat, match.fun(mean)(x*plus.minus.matrix[ii,],...))    }
    } # End of if
    else { # Use simulations.
      output2 <- paste("p-value was estimated based on", num.sim, "simulations.")
      for (i in 1:num.sim) test.stat <- c(test.stat, match.fun(stat)((x*sample(c(-1,1),lx,T)),...)) }
    if (!plot) {
      if (alternative=="less") { p.value <- mean( test.stat <= test.stat0 ) }
      if (alternative=="greater") { p.value <- mean( test.stat >= test.stat0 ) }
      if (alternative=="two.sided") { p.value <- mean( abs(test.stat) >= abs(test.stat0) ) }
      structure(list(output1, output2, alternative=alternative, mu=mu, p.value=p.value))}
    else {hist(test.stat, main="Histogram of Permutation Distribution",
                xlab="Value of test statistic", ylab="Frequency")}
  } # End of `else'
} # End of perm.test       Garren15


# Math 324
score <- function(x, y=NULL, expon=FALSE) {
  # Generates van der Waerden scores (i.e., normal quantiles) or exponential 
  #    (similar to Savage) scores, for combined data `x' and `y'.
  # `x' is a positive integer equal to the number of desired scores when y is NULL,
  #    or `x' is a vector of observations.
  # `y' is an optional vector of observations, typically used with two-sample tests.
  # When `expon' is FALSE, van der Waerden scores are computed, even for ties.
  # When `expon' is TRUE, Exponential scores are computed,
  #    and interpolation is used for ties.
  # Output: scored values for `x', when y is NULL.
  # Output: `$x' are the scored values for `x', when y is not NULL.
  # Output: `$y' are the scored values for `y', when y is not NULL.
  if (is.null(y) & length(x)==1) { if (floor(x)==ceiling(x) & x>=1) {x <- 1:x} }
  n <- length(c(x,y)) ;    if (!expon) {z <- qnorm(rank(c(x,y))/(n+1))}
  if (expon) { u <- cumsum( 1/(n+1-1:n) ) ;   z <- u[ rank(c(x,y)) ]
    for (i in 1:n) { rank0 <- rank(c(x,y))[i] ;   remainder <- rank0 - floor(rank0)
      if (remainder>0) {z[i] <- u[floor(rank0)]*(1-remainder)+u[ceiling(rank0)]*remainder}}}
  if (is.null(y)) { return(z) }
  if (!is.null(y)) { return(structure(list(x=z[1:length(x)], y=z[(length(x)+1):n]))) }
} # Garren15


# Math 324
siegel.test <- function(x, y, alternative=c("two.sided", "less", "greater"), 
                        reverse=FALSE, all.perms=TRUE, num.sim=2e4)   {
  # Performs the Siegel-Tukey test, where ties are handled by averaging ranks,
  #   not by asymptotic approximations.
  # `x' and `y' are numeric vectors of data values.
  # `alternative' is a character string specifying the alternative hypothesis, and
  #   must be one of `"two.sided"' (default), `"greater"' or `"less"'.  
  #   Only the initial letter needs to be specified.
  # If `reverse' is FALSE, then assign rank 1 to the smallest observation.
  # If `reverse' is TRUE, then assign rank 1 to the largest observation.
  # The exact p-value is attempted when `all.perms' (i.e., all permutations) is TRUE,
  #   and is simulated when `all.perms' is FALSE or when
  #   computing an exact p-value requires more than num.sim calculations.
  # `num.sim' is the upper limit on the number of permutations generated.
  alternative <- abbreviation(alternative, c("two.sided", "less", "greater"))
  z <- sort(c(x,y)) ;     lz <- length(z)
  lower <- 1;  upper <- lz;  j <- rep(NA, lz)
  for (i in 1:lz)   {
    if (i-4*as.integer(i/4)<=1)  { j[lower] <- i ;   lower <- lower+1 }
    else {j[upper] <- i;   upper <- upper-1}      }
  if (reverse) { j <- rev(j) }
  z0 <- z[order(j)]
  rank.x.mat <- matrix(NA, length(x), lz)
  rank.y.mat <- matrix(NA, length(y), lz)
  for (k in 1:lz)  {
    for (i in 1:length(x))  { if (x[i]==z0[k]) {rank.x.mat[i,k] <- k} }
    for (i in 1:length(y))  { if (y[i]==z0[k]) {rank.y.mat[i,k] <- k} }    }
  rank.x <- apply(rank.x.mat,1,mean,na.rm=TRUE)
  rank.y <- apply(rank.y.mat,1,mean,na.rm=TRUE)
  if (alternative=="two.sided")     
    {p.value <- perm.test(rank.x,rank.y,alternative="two.sided",all.perms=all.perms,
                          num.sim=num.sim,stat=mean)$p.value}
  if (alternative=="less")
    {p.value <- perm.test(rank.x,rank.y,alternative="greater",all.perms=all.perms,
                          num.sim=num.sim,stat=mean)$p.value}
  if (alternative=="greater")
    {p.value <- perm.test(rank.x,rank.y,alternative="less",all.perms=all.perms,
                          num.sim=num.sim,stat=mean)$p.value}
  structure(list("   Siegel-Tukey test", alternative=alternative,
                  rank.x=rank.x, rank.y=rank.y, p.value=p.value))
} # Garren15


# Math 324
rmd.test <- function(x, y, alternative=c("two.sided", "less", "greater"), all.perms=TRUE,
                     num.sim=2e4) {
  # A permutation test is performed based on the estimated rmd,
  #   the ratio of the absolute mean deviances.
  # `x' and `y' are numeric vectors of data values.
  # `alternative' is a character string specifying the alternative hypothesis, and
  #   must be one of `"two.sided"' (default), `"greater"' or `"less"'.  
  #   Only the initial letter needs to be specified.
  # The exact p-value is attempted when `all.perms' (i.e., all permutations) is TRUE,
  #   and is simulated when `all.perms' is FALSE or when
  #   computing an exact p-value requires more than num.sim calculations.
  # `num.sim' is the upper limit on the number of permutations generated.
  max.loop <- 10;        l1 <- min(length(x), length(y)) ;    tol <- 1e-10
  alternative <- abbreviation(alternative, c("two.sided", "less", "greater"))
  l2 <- max(length(x), length(y));    l3 <- l1+l2
  x <- x-median(x) ;   y <- y-median(y)
  rmd <- function(x, y, alternative) {
    test <- mean(abs(x)) / mean(abs(y))
    if (alternative=="two.sided") {test <- max(test, 1/test)}
    return( test )      }
  test.stat0 <- rmd(x, y, alternative) ;    test.stat <- NULL
  if (choose(l3,l1)>num.sim) all.perms <- FALSE
  if (all.perms & l1<=max.loop) {
    output <- "Exact p-value was calculated."
    index <- rep(NA, max(l3, max.loop))
    for (i1 in 1:(l2+1))  {
    for (i2 in (i1+1):max( i1+1, (l2+2)*(l1>=2) ))  {
    for (i3 in (i2+1):max( i2+1, (l2+3)*(l1>=3) ))  {
    for (i4 in (i3+1):max( i3+1, (l2+4)*(l1>=4) ))  {
    for (i5 in (i4+1):max( i4+1, (l2+5)*(l1>=5) ))  {
    for (i6 in (i5+1):max( i5+1, (l2+6)*(l1>=6) ))  {
    for (i7 in (i6+1):max( i6+1, (l2+7)*(l1>=7) ))  {
    for (i8 in (i7+1):max( i7+1, (l2+8)*(l1>=8) ))  {
    for (i9 in (i8+1):max( i8+1, (l2+9)*(l1>=9) ))  {
    for (i10 in (i9+1):max( i9+1, (l2+10)*(l1>=10) ))  {
      index <- c(i1,i2,i3,i4,i5,i6,i7,i8,i9,i10)
      index <- c( index[1:l1], setdiff(c(1:l3),  index[1:l1]) )
      test.stat <- c(test.stat, rmd( c(x,y)[index[1:l1]], c(x,y)[index[(l1+1):l3]], alternative ) )
    }}}}}}}}}} # Number of parentheses is max.loop
  } # End of if
  else { # Use simulations.
    output <- paste("p-value was estimated based on", num.sim, "simulations.")
    for (i in 1:num.sim) {
      index <- sample(l3)
      test.stat <- c(test.stat, rmd( c(x,y)[index[1:l1]], c(x,y)[index[(l1+1):l3]], alternative ) )
    }
  }
  if (length(x)>length(y) & alternative!="two.sided") { test.stat <- 1 / test.stat }
  if (alternative=="less") {
    p.value <- mean( test.stat - test.stat0 <= tol ) }
  else {
    p.value <- mean( test.stat - test.stat0 >= -tol) }
  structure(list(output, alternative=alternative, rmd.hat=test.stat0, p.value=p.value))
} # End of rmd.test       Garren15


# Math 324
perm.f.test <- function(response, treatment=NULL, num.sim=2e4)  {
  # A permutation F-test is performed, and a one-way analysis of variance F-test is performed.
  # Only approximated p-values are calculated for the permutation F-test.
  # `treatment' is the vector of treatments, and `response' is the vector of responses.
  # Alternatively, if `treatment' is NULL, then `response' must be an N by 2 matrix,
  #   such that the first column represents response and the second column represents treatment.
  # `num.sim' is the number of permutations generated.
  # If `num.sim' is smaller than one, then the permutation test is not performed;
  #   otherwise, list the p-value.
  if (is.null(treatment))   {treatment <- response[,2] ;  response <- as.numeric(response[,1])}
  if (length(response)!=length(treatment))
    stop("`response' and `treatment' must have the same length.")
  treatment <- factor(treatment) ;  list1 <- c("One-way ANOVA", summary(aov(response~treatment)))
  if (num.sim>=1) {
    all.treatments <- union(treatment, NULL); k <- length(all.treatments); ni <- NULL
    for (i in 1:k) { ni <- c(ni, sum(treatment==all.treatments[i])) }
    one.way.anova <- function(treatment, response, all.treatments, ni, k) {
         # Returns $\sum_{i=1}^k n_i{\bar Y}_i^2$, which is monotone increasing with          
         #   the F-test statistic associated with one-way ANOVA; see Higgins, 2004, p. 84.
         yi.mean <- NULL
         for (i in 1:k) {yi.mean <- c(yi.mean, mean(response[treatment==all.treatments[i]])) }
         sum(ni*yi.mean^2)   }
    test.stat0 <- one.way.anova(treatment, response, all.treatments, ni, k) ;   test.stat <- NULL
    for (i in 1:num.sim) {
      test.stat <- c(test.stat, one.way.anova(treatment, sample(response), all.treatments, ni, k)) }
    list2 <- c( "The p-value from the permutation F-test based on",
                paste( num.sim, "simulations is", mean(test.stat>=test.stat0) ) )
    list1 <- c(list1, list2) }
  return(list1) 
} # Garren15


# Math 324
perm.cor.test <- function(x, y=NULL, alternative=c("two.sided", "less", "greater"), 
                          method=c("pearson", "spearman"), num.sim=2e4 ) {
  # A permutation correlation test is performed.
  # P-values are approximated based on randomly sampling the permutations.
  #   `x' and `y' are the paired vectors of observations.
  # Alternatively, if `y' is NULL, then `x' must be an N by 2 matrix,
  # `alternative' is a character string specifying the alternative hypothesis, and
  #   must be one of `"two.sided"' (default), `"greater"' or `"less"'.  
  #   Only the initial letter needs to be specified.
  # `method' is a character string specifying the type of correlation, and
  #   must be one of `"pearson"' (default) or `"spearman"'.  
  #   Only the initial letter needs to be specified.
  # Choices for `method' are "pearson", "spearman", "kendall".
  # `num.sim' is the number of permutations generated.
  alternative <- abbreviation(alternative, c("two.sided", "less", "greater"))
  method <- abbreviation(method, c("pearson", "spearman"))
  if (is.null(y))   {y <- x[,2] ;  x <- as.numeric(x[,1])}
  if (length(x)!=length(y)) stop("`x' and `y' must have the same length.")
  if (method=="spearman")  {x <- rank(x);  y <- rank(y)}
  if (method %in% c("pearson", "spearman")) {
    test.stat0 <- cor(x,y);  test.stat <- NULL
    for (i in 1:num.sim)  {test.stat <- c(test.stat, cor(x, sample(y)))}
    if (alternative=="two.sided") p.value <- mean(abs(test.stat)>=abs(test.stat0))
    if (alternative=="less") p.value <- mean(test.stat<=test.stat0)
    if (alternative=="greater") p.value <- mean(test.stat>=test.stat0)   }
  output1 <- paste("Permutation correlation test.  Method is", method)
  output2 <- paste("p-value was estimated based on", num.sim, "simulations.")
  structure(list(output1, output2, alternative=alternative, p.value=p.value))
} # Garren15


# Consider installing packages "fastGraph" and "jmuOutlier".
the.answer <- readline("\nDo you want to install the packages\n'fastGraph' and 'jmuOutlier'? (y or n) ")
if ( substring(the.answer,1,1) %in% c("y","Y") ) {cat("\n");  install.packages( c("fastGraph","jmuOutlier"), quiet=TRUE )}
if ( suppressWarnings( library( fastGraph, logical.return=TRUE, warn.conflicts=FALSE ) ) )  {
   library( fastGraph, warn.conflicts=FALSE )
   suppressWarnings( try( rm( plotDist, shadeDist, shadePhat, plotLine, getMinMax ), silent=TRUE ) )  }
if ( suppressWarnings( library( jmuOutlier, logical.return=TRUE, warn.conflicts=FALSE ) ) )   {
   library( jmuOutlier, warn.conflicts=FALSE )
   suppressWarnings( try( rm( perm.cor.test, perm.f.test, perm.test, rmd.test, siegel.test ), silent=TRUE ) )
   suppressWarnings( try( rm( CI.t.test, quantileCI ), silent=TRUE ) )
   suppressWarnings( try( rm( fourier, lineGraph, plotCI, plotEcdf, plotVector, truncHist ), silent=TRUE ) )
   suppressWarnings( try( rm( dlaplace, plaplace, qlaplace, rlaplace, dtriang, ptriang, qtriang, rtriang ), silent=TRUE ) )
   suppressWarnings( try( rm( abbreviation, power.binom.test, score ), silent=TRUE ) )     }
if ( !suppressWarnings( library( fastGraph, logical.return=TRUE, warn.conflicts=FALSE ) ) )   
   cat("\nThe package 'fastGraph' is not installed on your computer.\n")
if ( !suppressWarnings( library( jmuOutlier, logical.return=TRUE, warn.conflicts=FALSE ) ) )   
   cat("\nThe package 'jmuOutlier' is not installed on your computer.\n")
cat("\nHelp is available via:\n")
if ( suppressWarnings( library( fastGraph, logical.return=TRUE, warn.conflicts=FALSE ) ) )   cat("> ?fastGraph\n")
if ( suppressWarnings( library( jmuOutlier, logical.return=TRUE, warn.conflicts=FALSE ) ) )  cat("> ?jmuOutlier\n")
cat("> scan2\n> read.table2\n> ls(1)\n> ls(2)  # etc.\n\n")
suppressWarnings( try( rm( the.answer ), silent=TRUE ) )