"""

The goal is to have a computationally efficient and reasonably clean
way to handle genotype data. By "genotype data", I mean:

  (a) the result of an high resolution device acquisition, e.g., what
  you get from an Affy 6.0 and an Illumina Infinium;

  (b) the result of interpolation and extrapolation of genotyping data
  to obtain even higher densities datasets.

A genotype data object GDO is composed by:

  (i)   a list of N markers ids that indentifies each snp in the datasets;

  (ii)  a type(np.zeros((2, N), dtype=np.float32)) array with the
        probabilities of the AA and BB configurations;
  (iii) a type(np.zeros((N,), dtype=np.float32)) array with the call confidence.

We are using a floating point representation, rather than the usual
'two-bits' per snp  representation because we are trying to mantain an
uniform description for both the 'experimental' and the 'computed' snp values.

All GDO coming from the same acquisition technology (both real and
synthetic) share the same marker list.

For the purposes of this discussion, what the markers are is
irrelevant. However, their details are currently handled as records of
a pytable table (omero.grid.table) with many millions of rows.

The typical operations that we expect to perform on the data can be
divided in, at least, three flavours.

 *columnar, per snp, statistics*: that is, use call values for the
  same marker across all the GDO in a given GDO set to compute a snp
  specific value, e.g., minimum allele frequency (MAF) and Hardy
  Weinberg equilibrium tests (HWE).

 *row, per GDO, statistics*: that is, use call values for all the snp
  within a GDO, for instance to count the number of
  (homo/hetero)zygotes snps.

 *group of GDO statistics*: that is, use a set to GDO to compute
  quantities that require to look at the same time in both the row and
  col directions, e.g., computing the Hamming distance (on snps)
  between pairs of GDOs, and using imputation techniques (e.g., with
  Merlin) to convert inter individual relations (pedigree)
  to deduce missing snp values.

Ideally, one would like to have server-side scripts that compute
'per-column' and 'per-row' quantities, and maybe, extract blocks of
GDOs and convert them to the format needed by external tools.  For
instance, saving in an HDFS directory a file with, for each snp, a record
containing the relative genotype for all the individuals in the
selected set.

Things like saving a new GDO will be left to the client, as well
as recovering a GDO.

As an example, below is a simplistic, pytable based, implementation of
a genotypes storage facility and two examples of operations of the first class.

"""

import numpy as np
import tables as tb
import time
from optparse import OptionParser


N=2000000 # Just a reasonable test size.
class GDOAffy6_0(tb.IsDescription):
  """

  vid  is a unique id for this GDO
  act  is a unique id of the act that generated this GDO

  probs[0,:] contains prob_AA
  probs[1,:] contains prob_BB

  prob_AB = 1.0 - prob_AA - prob_BB

  confidence comes from the measurement/computation process.

  """
  vid          = tb.StringCol(itemsize=16)
  act          = tb.StringCol(itemsize=16)
  probs        = tb.Float32Col(shape=(2, N))
  confidence   = tb.Float32Col(shape=(N,))

def create_gdos(fname, n_recs=200, p_A=0.3):
  """Just random noise"""
  fh = tb.openFile(fname, mode="w")
  root = fh.root
  for gn in ("GDOs",):
    g = fh.createGroup(root, gn)
  table = fh.createTable('/GDOs', 'affy6_0',
                         GDOAffy6_0,
                         "GDOs Affy6_0")
  sample = table.row
  probs = np.zeros((2,N), dtype=np.float32)
  for i in range(n_recs):
    sample['vid'] = 'V9482948923%05d' % i
    sample['act'] = 'V9482948923%05d' % i
    probs[0,:] = np.random.normal(p_A**2, 0.01*p_A**2, N)
    probs[1,:] = np.random.normal((1.-p_A)**2, 0.01*(1.0-p_A)**2, N)
    sample['probs'] = np.clip(probs, 0, 1.0)
    sample['confidence'] = np.random.random(N)
    sample.append()
  table.flush()
  fh.close()

#------------------------------------------------------------------------------------------
def count_homozygotes(it):
  """
  Count (compute) the number of  N_AA, N_BB homozygotes.
  """
  setup_done = False
  for i, x in enumerate(it):
    if not setup_done:
      counts = np.zeros(x['probs'].shape, dtype=np.float32)
      setup_done = True
    counts += x['probs']
  return (i + 1), np.cast[np.int32](counts)

def maf(it, counts=None):
  """
  Compute the minor allele frequencies
  """
  if not counts:
    N, counts = count_homozygotes(it)
  else:
    N, counts = counts
  N_AB = N - counts.sum(axis=0)
  return (2*counts + N_AB)/(2.0*N)

def hwe_probabilites(n_a, n_ab, N):
  """
  Quick and dirty.
  """
  n_a = n_a if n_a <= N else 2*N - n_a
  n_b = 2*N - n_a
  N_ab = np.arange(n_a & 0x01, n_a , 2, dtype=np.float64)
  log_fact = np.log((n_a - N_ab) * (n_b - N_ab) / ((N_ab + 2.0) * (N_ab + 1.0)))
  weight = np.cumsum(log_fact)
  prob = np.exp(weight - weight.max())
  prob /= prob.sum()
  return (prob[N_ab == n_ab], prob)

def hwe_scalar(n_a, n_ab, N):
  p, probs = hwe_probabilites(n_a, n_ab, N)
  return probs[probs <= p].sum()

hwe_vector = np.vectorize(hwe_scalar, [np.float32])

def hwe(it, counts=None):
  if not counts:
    N, counts = count_homozygotes(it)
  else:
    N, counts = counts
  N_AB = N - counts.sum(axis=0)
  N_x = N_AB + 2*counts
  low_freq = N_x.min(axis=0)
  return hwe_vector(low_freq, N_AB, N)


def analize_gdos(fname):
  fh = tb.openFile(fname)
  table = fh.root.GDOs.affy6_0
  start = time.clock()
  counts = count_homozygotes(table.iterrows())
  print 'time spent in count_homozygotes = ', time.clock() - start
  start = time.clock()
  s = maf(None, counts)
  print 'time spent in  maf = ', time.clock() - start
  start = time.clock()
  h = hwe(None, counts)
  print 'time spent in hwe = ', time.clock() - start
  print s[:, 0:10]
  print h[0:10]
  print h.min(), h.max()
  print np.histogram(h, bins=20)


def main():
  p = OptionParser()
  p.add_option("-c", "--create",
               action="store_true", dest="create", default=False,
               help="create h5 file")
  (opts, args) = p.parse_args()
  fname = "gdos.h5"
  if opts.create:
    create_gdos(fname)
  analize_gdos(fname)

main()