#$Author: sinnwell $
#$Date: 2007/02/23 14:48:04 $
#$Header: /people/biostat3/sinnwell/Projects/LDpairs/Make/RCS/ld.pairs.q,v 1.8 2007/02/23 14:48:04 sinnwell Exp $
#$Locker:  $
#$Log: ld.pairs.q,v $
#Revision 1.8  2007/02/23 14:48:04  sinnwell
#add r2 computation for snps only
#
#Revision 1.7  2007/02/21 21:38:04  sinnwell
#add r2 for SNP=TRUE
#
#Revision 1.6  2005/08/12 18:42:59  sinnwell
#class for R, error msg for df=0 in pchisq.
#
#Revision 1.5  2004/06/14 13:48:20  sinnwell
#control is em.control
#
#Revision 1.4  2004/06/11 16:29:23  sinnwell
#add em.control as parameter
#
#Revision 1.3  2004/01/27 20:57:29  sinnwell
#minor fix on comments
#
#Revision 1.2  2003/10/14 19:51:42  sinnwell
#second revision, added keywords
#

# Schaid, DJ  Sinnwell, JP    Mayo Clinic Rochester, 2003

ld.pairs <- function(geno,
                     locus.label=NA,
                     miss.val=c(0,NA),
                     r2=FALSE,
                     em.control=haplo.em.control()) {

   # Compute D' statistic for all pairs of loci according to
   # the following algorithm.
   #
   # 1) For loci i and j,  use EM algorithm to compute all possible
   #    haplotype probabilities. From these probabilities, compute
   #    delta_ij = hprob_ij - p_i * p_j, where hprob_ij is the haplotype prob
   #    and p_i, p_j, are the allele frequencies for alleles i and j at the two
   #    loci. 
   # 2) Compute dprime_ij = delta_ij / max(delta_ij)
   # 3) Compute average dprime, over all pairs of alleles from the two loci,
   #    using two different weights:
   #
   #    dprime.ave1 is weighted by haplotype probs when no LD (wt = p_i * p_j),
   #                as described by Hedrick 1987
   #    dprime.ave2 is weighted by the haplotype prob hprob_ij, as Hendrick
   #                refers to as a method used by Cavalli-Sforza.

   # r2 added by JP Sinnwell 2/2007
  
   # Checks on geno
   n.loci <- ncol(geno)/2
   if(n.loci != (floor(ncol(geno)/2)) )stop("Odd number of cols of geno")
   if(any(is.na(locus.label))) locus.label <- paste("loc",1:n.loci,sep="-")
   if(length(locus.label)!=n.loci) stop("Wrong length for locus.label")

   #recode alleles starting from 1, 2, etc; and all miss.val to NA
   geno <- geno.recode(geno,miss.val=miss.val)$grec

   odd.col <- seq(from=1,by=2, length=n.loci)
   even.col <- odd.col + 1

   loc1.label <- NULL
   loc2.label <- NULL
   dprime.ave <- NULL
   r2.vec <- NULL
   
   lr.stat.vec <- lr.df.vec <- NULL
   comp.stat.vec <- comp.df.vec <- NULL
   ivec <- NULL
   jvec <- NULL

   for(i in 1:(n.loci-1)){
      for(j in (i+1):n.loci){
         which.col <- c(odd.col[i],even.col[i],odd.col[j],even.col[j])
         tmp.geno <- geno[,which.col]

         # need to subset tmp.geno to remove rows with any missing values, but need to account
         # for how missing values are coded (using miss.val) - see old haplo.em code which
         # eliminated rows with any missing alleles. Since we do not use the actual labels of alleles,
         # we could do the recode of alleles here, then eliminate rows with any missing values, then 
         # pass this new recoded geno matrix (with 4 col's) to haplo.em, with miss.val=NA (the 
         # code for missing values in the recoded alleles)

         tmp <- haplo.em(tmp.geno, locus.label=locus.label[c(i,j)], miss.val=miss.val, control=em.control)

        # need to check for convergence - give warning if not, and report the pair of loci
        # that it failed on
         
        if(tmp$converge==0){
          stop(paste("haplo.em failed to converge for loci ",i," and ",j,sep=" "))
        }
         
         lr.stat.vec <- c(lr.stat.vec, tmp$lr)
         lr.df.vec <- c(lr.df.vec, tmp$df.lr)

         df1 <- data.frame(tmp$haplotype,tmp$hap.prob)
         names(df1) <- c("loc1","loc2","hprob")

         # compute marginal (allele) frequencies

         loc1<- tapply(df1$hprob,df1$loc1,sum)
         loc2 <- tapply(df1$hprob,df1$loc2,sum)

         # compute expected haplotype frequencies if no LD (product of allele freqs)

         tbl <- outer(loc1,loc2,"*")
         nallele.loc1 <- length(loc1)
         nallele.loc2 <- length(loc2)

         # create data frame of loc1, loc2, and the corresponding allele frequencies, when arranged as 
         # all possible pairs of alleles between the two loci

         df2 <- data.frame(loc1=rep(names(loc1),nallele.loc2),
                           loc2=rep(names(loc2),rep(nallele.loc1,nallele.loc2)),
                           p.1=rep(loc1,nallele.loc2),
                           p.2=rep(loc2,rep(nallele.loc1,nallele.loc2)))

         # now merge dataframe with haplotype probabilities (df1) with dataframe for allele frequencies
         # (df2), keeping only rows that reside in df1 (don't need expected haplotypes)

         df3 <- merge(df1,df2,by=1:2, all=TRUE)

         # haplo.em computes haplotype frequencies only for those haplotypes that are possible to occur in
         # geno. When haplotypes do not occur in geno, they are not listed. In contrast, df2 contains haplotypes
         # under the null of no LD, and lists all possible haplotypes. To compute delta = h - p*q, where
         # h is the estimated haplotype frequencies, p and q the allele frequencies, we need to set h = 0 if
         # the corresponding haplotype never occured.

         df3$hprob <- ifelse(is.na(df3$hprob),0,df3$hprob)

         delta.min <- apply( cbind( - df3$p.1 * df3$p.2, -(1-df3$p.1) * (1-df3$p.2) ), 1, max)
         delta.max <- apply( cbind( (1-df3$p.1)*df3$p.2,       df3$p.1*(1-df3$p.2) ), 1, min)
         delta <- df3$hprob - df3$p.1 * df3$p.2
         
         dprime <- ifelse(delta < 0, delta/delta.min, delta/delta.max)

         dprime.ave  <- c(dprime.ave, sum(dprime*df3$p.1*df3$p.2))

         # r-squared only applies if 2 alleles
         if(r2) {
           if(nallele.loc1 == 2 && nallele.loc2 == 2)
             r2.vec <- c(r2.vec, (delta[1]^2)/prod(c(loc1,loc2)))
           else
             stop("User requested r2, but not all loci have 2 alleles.\n")
         }
         
         # composite LD stat

         tmp <- compositeLD(tmp.geno[,1],tmp.geno[,2],tmp.geno[,3],tmp.geno[,4])
         comp.stat.vec <- c(comp.stat.vec,tmp$chistat.global)
         comp.df.vec <- c(comp.df.vec,tmp$df)

         loc1.label <- c(loc1.label,locus.label[i])
         loc2.label <- c(loc2.label,locus.label[j])
         ivec <- c(ivec,i)
         jvec <- c(jvec,j)

       }
    }

  # now create a dataframe of all pairs of loci. loc1 and loc2 are the locus indices, their labels
  # are loc1.label and loc2.label, and the average dprimes are dprime.ave1 and dprime.ave2
   if(any(c(lr.df.vec, comp.df.vec)==0))
     warning("One or more degrees of freedom have value zero (0). \n")

   # block warnings for when pchisq doesn't like 0 or negative deg-fred.
   options(warn = -1)
    lr.pval <- 1-pchisq(lr.stat.vec, lr.df.vec)
    comp.pval <- 1-pchisq(comp.stat.vec, comp.df.vec)
   options(warn = 0)

   df <- list(loc1=ivec,loc2=jvec,loc1.label=loc1.label,loc2.label=loc2.label,
              dprime.ave=dprime.ave, r2=r2.vec, lr.stat=lr.stat.vec,lr.df=lr.df.vec,lr.pval=lr.pval,
              comp.stat=comp.stat.vec, comp.df=comp.df.vec, comp.pval=comp.pval)

   class(df) <- "ld.pairs"


   return(df)

}
