Source code for kbbq.compare_reads

"""
A mish-mash of functions for recalibration. Should probably be renamed in the near future.
"""

import pysam
import numpy as np
import sklearn
from sklearn.linear_model import LogisticRegression as LR
from sklearn.isotonic import IsotonicRegression as IR
import os
import os.path
import sys
import re
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
import khmer
import scipy.stats
import kbbq.recaltable as recaltable
import kbbq.plot
import pandas as pd
import datetime
import kbbq.benchmark

[docs]def tstamp():
    """
    Return the current time up to the second in ISO format as a string.

    :return: current time with brackets
    :rtype: str
    """
    return '[ ' + datetime.datetime.today().isoformat(' ', 'seconds') + ' ]'

[docs]def print_info(*args, **kwargs):
    """
    Print input to stderr prepended by a timestamp.
    """
    print(tstamp(), *args, file = sys.stderr, **kwargs)

[docs]def load_positions(posfile):
    """
    Get positions that are covered by a non-gzipped BED file.

    Pass the file name; it will be opened with :func:`python.open`.
    This is slightly different than :func:`benchmark.get_bed_dict`
    because it doesn't require a refdict and only provides a list
    of 0-based positions, not a boolean array of all positions in
    the reference. It also doesn't support gzipped files.
    """
    d = dict()
    with open(posfile, 'r') as infh:
        for line in infh:
            # bed format: pos is 0 based and end is 1 based
            chrom, pos, end = line.rstrip().split()
            for i in range(int(pos), int(end)):
                d.setdefault(chrom, list()).append(i)
    return d

[docs]def get_var_sites(vcf):
    """
    Get positions covered by any record in a VCF file.

    Pass the file name. Each value in the returned dict will
    be a list of 0-based positions.
    """
    vcf = pysam.VariantFile(vcf)
    d = dict()
    for record in vcf:
        for i in range(record.start, record.stop, 1):
            #record.start is 0-based inclusive
            #record.stop is 0-based exclusive
            d.setdefault(record.chrom, list()).append(i)
    return d

[docs]def train_regression(rawquals, corrected, tol = 1e-8):
    print(tstamp(), "Doing Logit Regression", file=sys.stderr)
    lr = LR(tol = tol, solver = 'lbfgs')
    lr = lr.fit(rawquals.flatten().reshape(-1,1), corrected.flatten())
    return lr

[docs]def regression_recalibrate(lr, q):
    print(tstamp(), "Recalibrating Quality Scores . . .", file = sys.stderr)
    newprobs = lr.predict_proba(q.flatten().reshape(-1,1))[:,1]
    newq = p_to_q(newprobs)
    newq.reshape(q.shape)
    assert newq.shape == q.shape
    return newq.copy()

[docs]def find_read_errors(read, ref, variable):
    """
    Use the CIGAR to find errors in the read or sites to skip.

    Softclipped bases will be added to the skip array.
    Returns a tuple with the error array and the skip array.
    We don't consider indel errors.
    """
    # here's how gatk does it: https://github.com/broadinstitute/gatk/blob/78df6b2f6573b3cd2807a71ec8950d7dfbc9a65d/src/main/java/org/broadinstitute/hellbender/utils/recalibration/BaseRecalibrationEngine.java#L370
    seq = np.array(list(read.query_sequence), dtype = np.unicode)
    skips = np.zeros(seq.shape, dtype = np.bool)
    cigartuples = read.cigartuples #list of tuples [(operation, length)]
    cigarops, cigarlen = zip(*cigartuples)

    #reference length from CIGAR: https://github.com/samtools/htsjdk/blob/942e3d6b4c28a8e97c457dfc89625bb403bdf83c/src/main/java/htsjdk/samtools/Cigar.java#L76
    #sum lengths of MDN=X
    #reflen = np.sum(cigarlen[np.any([cigarops == 0, cigarops == 2, cigarops == 3, cigarops == 7, cigarops == 8], axis = 0)])
    subset_variable = variable[read.reference_name][read.reference_start : read.reference_end]
    refseq = np.array(list(ref[read.reference_name][read.reference_start : read.reference_end]), dtype = np.unicode)

    readerrors = np.zeros(seq.shape, dtype = np.bool)
    readidx = 0
    refidx = 0
    for op, l in cigartuples:
        if op == 0 or op == 7 or op == 8:
            #match
            readerrors[readidx : readidx + l] = (refseq[refidx : refidx + l] != seq[readidx : readidx + l])
            skips[readidx : readidx + l] = subset_variable[refidx : refidx + l]
            readidx = readidx + l
            refidx = refidx + l
        elif op == 1:
            #insertion in read
            #gatk counts insertions as aligning to the ref base to the left, unless
            #the insertion is the first op, when it will align to the right.
            #we skip if it's skipped on both sides since we don't implement 
            #variant sites as a range
            #if we're at the end, we can just look at the base to the left
            if refidx < len(subset_variable):
                skips[readidx:readidx+l] = np.logical_and(subset_variable[refidx-1], subset_variable[refidx])
            else:
                skips[readidx:readidx+l] = subset_variable[refidx-1]
            readidx = readidx + l
        elif op == 2 or op == 3:
            #deletion in read or N op
            # N is for introns in mRNA
            skips[readidx - 1] = np.logical_or(skips[readidx-1],np.any(subset_variable[refidx: refidx + l]))
            refidx = refidx + l
        elif op == 4:
            #soft clip, consumes query not ref
            skips[readidx : readidx + l] = True
            readidx = readidx + l
        elif op == 5 or op == 6:
            #hard clip or pad, do nothing
            # it is covered, it's just optimized out
            # see https://github.com/nedbat/coveragepy/issues/198
            continue # pragma: no cover
        else:
            #unrecognized
            # also covered
            raise ValueError("Unrecognized Cigar Operation " + \
                str(op) + " In Read\n" + str(read)) #pragma: no cover
    return readerrors, skips

[docs]class RescaledNormal:
    """
    A class to cache the rescaled normal prior used in the bayesian recalibration
    model. 

    Attributes

        * :attr:`maxscore` - max score supported
        * :attr:`prior_dist` - numpy array of prior probability 

    Methods

        * :meth:`prior` - get the prior probability for a given quality score difference

    Most of these attributes are nonsense; the "proper" way to interact
    with the class is via the :attr:`prior_dist` array. If that makes you
    uncomfortable, use the provided accessor function :meth:`prior`.

    Under no circumstances should you attempt to replace any of these attributes,
    it will most likely not have the desired effect. The caching mechanism here 
    only works because the class attributes are immediately instatiated when the class
    is created, so by the time you replace them it won't matter. 
    Reimplement it yourself if you want a different prior.
    """

    oldset = np.seterr(all = 'raise')

    maxscore = 93
    """
    The maximum quality score supported by this class.
    """

    possible_diffs = np.arange(maxscore+1, dtype = np.int_)
    prior_dist = np.zeros(possible_diffs.shape[0], dtype = np.longdouble)
    for i in range(possible_diffs.shape[0]):
        try:
            prior_dist[i] = np.log(.9 * np.exp(-((possible_diffs[i]/.5)**2)/2))
        except FloatingPointError:
            prior_dist[i] = np.NINF
    np.seterr(**oldset)

[docs]    @classmethod
    def prior(cls, difference):
        """
        Return the prior probability for a given difference in quality score.

        :param int difference: The difference in quality score
        :returns: the prior probability
        :rtype: np.longdouble
        """
        return cls.prior_dist[difference]

[docs]class Dinucleotide:
    """
    A class to cache dinucleotides and maintain a consistent dinuc -> int
    map throughout the module.
    """

    nucleotides = ['A','T','G','C']
    """
    List of valid nucleotides
    """

    complement = {'A' : 'T', 'T' : 'A', 'G' : 'C', 'C' : 'G'}
    """
    Dictionary for complementing nucleotides
    """

    #dinucs = [i + j for i in nucleotides for j in nucleotides]
    #the above won't work because list comprehensions ignore
    #class scope except the outermost variable. we can use
    # a temporary function, even though it looks bad :(
    dinucs = lambda d:[i + j for i in d for j in d]
    dinucs = dinucs(nucleotides)
    """
    List of valid dinucleotides
    """

    dinuc_to_int = dict(zip(dinucs, range(len(dinucs))))
    """
    Dictionary mapping dinuc -> int
    """

    vectorized_get = np.vectorize(dinuc_to_int.get, otypes = [np.int])
    vectorized_complement = np.vectorize(complement.get, otypes = [np.unicode])

[docs]    @classmethod
    def vecget(cls, *args, **kwargs):
        return cls.vectorized_get(*args, **kwargs)

[docs]    @classmethod
    def veccomplement(cls, *args, **kwargs):
        return cls.vectorized_complement(*args, **kwargs)

[docs]def gatk_delta_q(prior_q, numerrs, numtotal, maxscore = 42):
    """
    Calculate the shift in quality scores from the prior given
    data.

    This is achieved by finding the difference between the maximum
    a posteriori and the prior point estimate of Q.
    """
    assert prior_q.shape == numerrs.shape == numtotal.shape
    possible_q = np.arange(maxscore+1, dtype = np.int)
    diff = np.absolute(np.subtract.outer(possible_q, prior_q).astype(np.int))
    #1st dim is possible qs
    prior = RescaledNormal.prior_dist[diff]
    broadcast_errs = np.broadcast_to(numerrs, possible_q.shape + numerrs.shape).copy()
    broadcast_tot = np.broadcast_to(numtotal, possible_q.shape + numtotal.shape).copy()
    p = q_to_p(possible_q).astype(np.float)
    while len(p.shape) < len(broadcast_tot.shape):
        p = np.expand_dims(p, -1)
    broadcast_p = np.broadcast_to( p, broadcast_tot.shape ).copy()
    loglike = scipy.stats.binom.logpmf(broadcast_errs+1, broadcast_tot+2, broadcast_p)
    #loglike should now have same dims as prior
    assert loglike.shape == prior.shape
    posterior = prior + loglike
    posterior_q = np.argmax(posterior, axis = 0)
    assert posterior_q.shape == prior_q.shape
    return posterior_q - prior_q

[docs]def p_to_q(p, maxscore = 42):
    q = np.zeros(p.shape, dtype = np.int)
    q[p != 0] = (-10.0*np.log10(p[p != 0])).astype(np.int) #avoid divide by 0
    q[p == 0] = maxscore
    q = np.clip(q, 0, maxscore)
    return q.copy()

[docs]def q_to_p(q):
    p = np.array(np.power(10.0,-(q / 10.0)), dtype = np.longdouble, copy = True)
    return p

## Generic covariate functions

[docs]def generic_cycle_covariate(sequencelen, secondinpair = False):
    cycle = np.arange(sequencelen)
    if secondinpair:
        cycle = np.negative(cycle + 1)
    return cycle

[docs]def generic_dinuc_covariate(sequences, quals, minscore = 6):
    #sequences should be a numpy unicode character array, quals should be a numpy array of integer quality scores
    assert sequences.shape == quals.shape
    assert sequences.dtype == np.dtype('U1')
    dinuc = np.char.add(sequences[...,:-1], sequences[...,1:])
    dinuccov = np.zeros(sequences.shape, dtype = np.int)
    dinuccov[...,0] = -1
    is_n = (sequences[...,1:] == 'N')
    follows_n = (sequences[...,:-1] == 'N')
    invalid = np.logical_or(quals[...,1:] < minscore, np.logical_or(is_n, follows_n))
    dinuccov[...,1:][...,invalid] = -1
    dinuccov[...,1:][...,np.logical_not(invalid)] = Dinucleotide.vecget(dinuc[...,np.logical_not(invalid)])
    return dinuccov

## Recalibrating FASTQ reads

[docs]def fastq_cycle_covariates(read, secondinpair = False):
    return generic_cycle_covariate(len(read.sequence), secondinpair)

[docs]def fastq_dinuc_covariates(read, minscore = 6):
    quals = np.array(read.get_quality_array(), dtype = np.int)
    return generic_dinuc_covariate(np.array(list(read.sequence)), quals, minscore)

[docs]def fastq_infer_secondinpair(read):
    namestr = read.name.split(sep='_')[0]
    return namestr[-2:] == '/2'

[docs]def fastq_infer_rg(read):
    """
    Infer the read group from appended read information, such as produced by the
    samtools fastq command.

    Requires the read to be formatted such the rg tag is added to the end of the
    read name delimited by a ``_``. Returns the read group tag.
    """
    rgstr = read.name.split(sep='_')[1]
    assert rgstr[0:2] == 'RG'
    return rgstr.split(':')[-1]

[docs]def recalibrate_fastq(read, meanq, globaldeltaq, qscoredeltaq, positiondeltaq, dinucdeltaq, rg, dinuc_to_int, secondinpair = False, minscore = 6, maxscore = 42):
    qcov = np.array(read.get_quality_array(), dtype = np.int)
    recalibrated_quals = np.array(qcov, copy = True, dtype = np.int)
    valid_positions = (qcov >= minscore)
    qcov = qcov[valid_positions]
    cycle = fastq_cycle_covariates(read, secondinpair)[valid_positions]
    dinuccov = fastq_dinuc_covariates(read, minscore)[valid_positions]
    recalibrated_quals[valid_positions] = (meanq[rg] + globaldeltaq[rg] + qscoredeltaq[rg, qcov] + dinucdeltaq[rg, qcov, dinuccov] + positiondeltaq[rg, qcov, cycle]).astype(np.int)
    return recalibrated_quals

## Recalibrate reads from a BAM

[docs]def bamread_get_oq(read):
    #TODO: add an assert or logic to ensure OQ is present
    oq = np.array(list(read.get_tag('OQ')), dtype = np.unicode)
    #quals = pysam.qualitystring_to_array(read.get_tag('OQ')) #this may be faster
    quals = np.array(oq.view(np.uint32) - 33, dtype = np.uint32)
    return np.array(quals, dtype = np.int)

[docs]def get_rg_to_pu(bamfileobj):
    rg_to_pu = {rg['ID'] : rg['PU'] for rg in bamfileobj.header.as_dict()['RG']}
    return rg_to_pu

[docs]def nparray_to_qualitystring(a, offset = 33):
    return (a + offset).astype(np.uint32).tostring().decode('utf32')