Source code for kbbq.compare_reads

"""
A mish-mash of functions for recalibration. Should probably be renamed in the near future.
"""

import pysam
import numpy as np
import sklearn
from sklearn.linear_model import LogisticRegression as LR
from sklearn.isotonic import IsotonicRegression as IR
import os
import os.path
import sys
import re
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
import khmer
import scipy.stats
import kbbq.recaltable as recaltable
import kbbq.plot
import pandas as pd
import datetime
import kbbq.benchmark

[docs]def tstamp(): """ Return the current time up to the second in ISO format as a string. :return: current time with brackets :rtype: str """ return '[ ' + datetime.datetime.today().isoformat(' ', 'seconds') + ' ]'
[docs]def load_positions(posfile): """ Get positions that are covered by a non-gzipped BED file. Pass the file name; it will be opened with :func:`python.open`. This is slightly different than :func:`benchmark.get_bed_dict` because it doesn't require a refdict and only provides a list of 0-based positions, not a boolean array of all positions in the reference. It also doesn't support gzipped files. """ d = dict() with open(posfile, 'r') as infh: for line in infh: # bed format: pos is 0 based and end is 1 based chrom, pos, end = line.rstrip().split() for i in range(int(pos), int(end)): d.setdefault(chrom, list()).append(i) return d
[docs]def get_var_sites(vcf): """ Get positions covered by any record in a VCF file. Pass the file name. Each value in the returned dict will be a list of 0-based positions. """ vcf = pysam.VariantFile(vcf) d = dict() for record in vcf: for i in range(record.start, record.stop, 1): #record.start is 0-based inclusive #record.stop is 0-based exclusive d.setdefault(record.chrom, list()).append(i) return d
[docs]def train_regression(rawquals, corrected, tol = 1e-8): print(tstamp(), "Doing Logit Regression", file=sys.stderr) lr = LR(tol = tol, solver = 'lbfgs') lr = lr.fit(rawquals.flatten().reshape(-1,1), corrected.flatten()) return lr
[docs]def regression_recalibrate(lr, q): print(tstamp(), "Recalibrating Quality Scores . . .", file = sys.stderr) newprobs = lr.predict_proba(q.flatten().reshape(-1,1))[:,1] newq = p_to_q(newprobs) newq.reshape(q.shape) assert newq.shape == q.shape return newq.copy()
[docs]def find_read_errors(read, ref, variable): """ Use the CIGAR to find errors in the read or sites to skip. Softclipped bases will be added to the skip array. Returns a tuple with the error array and the skip array. We don't consider indel errors. """ # here's how gatk does it: https://github.com/broadinstitute/gatk/blob/78df6b2f6573b3cd2807a71ec8950d7dfbc9a65d/src/main/java/org/broadinstitute/hellbender/utils/recalibration/BaseRecalibrationEngine.java#L370 seq = np.array(list(read.query_sequence), dtype = np.unicode) skips = np.zeros(seq.shape, dtype = np.bool) cigartuples = read.cigartuples #list of tuples [(operation, length)] cigarops, cigarlen = zip(*cigartuples) #reference length from CIGAR: https://github.com/samtools/htsjdk/blob/942e3d6b4c28a8e97c457dfc89625bb403bdf83c/src/main/java/htsjdk/samtools/Cigar.java#L76 #sum lengths of MDN=X #reflen = np.sum(cigarlen[np.any([cigarops == 0, cigarops == 2, cigarops == 3, cigarops == 7, cigarops == 8], axis = 0)]) subset_variable = variable[read.reference_name][read.reference_start : read.reference_end] refseq = np.array(list(ref[read.reference_name][read.reference_start : read.reference_end]), dtype = np.unicode) readerrors = np.zeros(seq.shape, dtype = np.bool) readidx = 0 refidx = 0 for op, l in cigartuples: if op == 0 or op == 7 or op == 8: #match readerrors[readidx : readidx + l] = (refseq[refidx : refidx + l] != seq[readidx : readidx + l]) skips[readidx : readidx + l] = subset_variable[refidx : refidx + l] readidx = readidx + l refidx = refidx + l elif op == 1: #insertion in read #gatk counts insertions as aligning to the ref base to the left, unless #the insertion is the first op, when it will align to the right. #we skip if it's skipped on both sides since we don't implement #variant sites as a range #if we're at the end, we can just look at the base to the left if refidx < len(subset_variable): skips[readidx:readidx+l] = np.logical_and(subset_variable[refidx-1], subset_variable[refidx]) else: skips[readidx:readidx+l] = subset_variable[refidx-1] readidx = readidx + l elif op == 2 or op == 3: #deletion in read or N op # N is for introns in mRNA skips[readidx - 1] = np.logical_or(skips[readidx-1],np.any(subset_variable[refidx: refidx + l])) refidx = refidx + l elif op == 4: #soft clip, consumes query not ref skips[readidx : readidx + l] = True readidx = readidx + l elif op == 5 or op == 6: #hard clip or pad, do nothing # it is covered, it's just optimized out # see https://github.com/nedbat/coveragepy/issues/198 continue # pragma: no cover else: #unrecognized # also covered raise ValueError("Unrecognized Cigar Operation " + \ str(op) + " In Read\n" + str(read)) #pragma: no cover return readerrors, skips
[docs]class RescaledNormal: """ A class to cache the rescaled normal prior used in the bayesian recalibration model. Attributes * :attr:`maxscore` - max score supported * :attr:`prior_dist` - numpy array of prior probability Methods * :meth:`prior` - get the prior probability for a given quality score difference Most of these attributes are nonsense; the "proper" way to interact with the class is via the :attr:`prior_dist` array. If that makes you uncomfortable, use the provided accessor function :meth:`prior`. Under no circumstances should you attempt to replace any of these attributes, it will most likely not have the desired effect. The caching mechanism here only works because the class attributes are immediately instatiated when the class is created, so by the time you replace them it won't matter. Reimplement it yourself if you want a different prior. """ oldset = np.seterr(all = 'raise') maxscore = 93 """ The maximum quality score supported by this class. """ possible_diffs = np.arange(maxscore+1, dtype = np.int_) prior_dist = np.zeros(possible_diffs.shape[0], dtype = np.longdouble) for i in range(possible_diffs.shape[0]): try: prior_dist[i] = np.log(.9 * np.exp(-((possible_diffs[i]/.5)**2)/2)) except FloatingPointError: prior_dist[i] = np.NINF np.seterr(**oldset)
[docs] @classmethod def prior(cls, difference): """ Return the prior probability for a given difference in quality score. :param int difference: The difference in quality score :returns: the prior probability :rtype: np.longdouble """ return cls.prior_dist[difference]
[docs]class Dinucleotide: """ A class to cache dinucleotides and maintain a consistent dinuc -> int map throughout the module. """ nucleotides = ['A','T','G','C'] """ List of valid nucleotides """ complement = {'A' : 'T', 'T' : 'A', 'G' : 'C', 'C' : 'G'} """ Dictionary for complementing nucleotides """ #dinucs = [i + j for i in nucleotides for j in nucleotides] #the above won't work because list comprehensions ignore #class scope except the outermost variable. we can use # a temporary function, even though it looks bad :( dinucs = lambda d:[i + j for i in d for j in d] dinucs = dinucs(nucleotides) """ List of valid dinucleotides """ dinuc_to_int = dict(zip(dinucs, range(len(dinucs)))) """ Dictionary mapping dinuc -> int """ vectorized_get = np.vectorize(dinuc_to_int.get, otypes = [np.int]) vectorized_complement = np.vectorize(complement.get, otypes = [np.unicode])
[docs] @classmethod def vecget(cls, *args, **kwargs): return cls.vectorized_get(*args, **kwargs)
[docs] @classmethod def veccomplement(cls, *args, **kwargs): return cls.vectorized_complement(*args, **kwargs)
[docs]def gatk_delta_q(prior_q, numerrs, numtotal, maxscore = 42): """ Calculate the shift in quality scores from the prior given data. This is achieved by finding the difference between the maximum a posteriori and the prior point estimate of Q. """ assert prior_q.shape == numerrs.shape == numtotal.shape possible_q = np.arange(maxscore+1, dtype = np.int) diff = np.absolute(np.subtract.outer(possible_q, prior_q).astype(np.int)) #1st dim is possible qs prior = RescaledNormal.prior_dist[diff] broadcast_errs = np.broadcast_to(numerrs, possible_q.shape + numerrs.shape).copy() broadcast_tot = np.broadcast_to(numtotal, possible_q.shape + numtotal.shape).copy() p = q_to_p(possible_q).astype(np.float) while len(p.shape) < len(broadcast_tot.shape): p = np.expand_dims(p, -1) broadcast_p = np.broadcast_to( p, broadcast_tot.shape ).copy() loglike = scipy.stats.binom.logpmf(broadcast_errs+1, broadcast_tot+2, broadcast_p) #loglike should now have same dims as prior assert loglike.shape == prior.shape posterior = prior + loglike posterior_q = np.argmax(posterior, axis = 0) assert posterior_q.shape == prior_q.shape return posterior_q - prior_q
[docs]def p_to_q(p, maxscore = 42): q = np.zeros(p.shape, dtype = np.int) q[p != 0] = (-10.0*np.log10(p[p != 0])).astype(np.int) #avoid divide by 0 q[p == 0] = maxscore q = np.clip(q, 0, maxscore) return q.copy()
[docs]def q_to_p(q): p = np.array(np.power(10.0,-(q / 10.0)), dtype = np.longdouble, copy = True) return p
## Generic covariate functions
[docs]def generic_cycle_covariate(sequencelen, secondinpair = False): cycle = np.arange(sequencelen) if secondinpair: cycle = np.negative(cycle + 1) return cycle
[docs]def generic_dinuc_covariate(sequences, quals, minscore = 6): #sequences should be a numpy unicode character array, quals should be a numpy array of integer quality scores assert sequences.shape == quals.shape assert sequences.dtype == np.dtype('U1') dinuc = np.char.add(sequences[...,:-1], sequences[...,1:]) dinuccov = np.zeros(sequences.shape, dtype = np.int) dinuccov[...,0] = -1 is_n = (sequences[...,1:] == 'N') follows_n = (sequences[...,:-1] == 'N') invalid = np.logical_or(quals[...,1:] < minscore, np.logical_or(is_n, follows_n)) dinuccov[...,1:][...,invalid] = -1 dinuccov[...,1:][...,np.logical_not(invalid)] = Dinucleotide.vecget(dinuc[...,np.logical_not(invalid)]) return dinuccov
## Recalibrating FASTQ reads
[docs]def fastq_cycle_covariates(read, secondinpair = False): return generic_cycle_covariate(len(read.sequence), secondinpair)
[docs]def fastq_dinuc_covariates(read, minscore = 6): quals = np.array(read.get_quality_array(), dtype = np.int) return generic_dinuc_covariate(np.array(list(read.sequence)), quals, minscore)
[docs]def fastq_infer_secondinpair(read): namestr = read.name.split(sep='_')[0] return namestr[-2:] == '/2'
[docs]def fastq_infer_rg(read): """ Infer the read group from appended read information, such as produced by the samtools fastq command. Requires the read to be formatted such the rg tag is added to the end of the read name delimited by a ``_``. Returns the read group tag. """ rgstr = read.name.split(sep='_')[1] assert rgstr[0:2] == 'RG' return rgstr.split(':')[-1]
[docs]def recalibrate_fastq(read, meanq, globaldeltaq, qscoredeltaq, positiondeltaq, dinucdeltaq, rg, dinuc_to_int, secondinpair = False, minscore = 6, maxscore = 42): qcov = np.array(read.get_quality_array(), dtype = np.int) recalibrated_quals = np.array(qcov, copy = True, dtype = np.int) valid_positions = (qcov >= minscore) qcov = qcov[valid_positions] cycle = fastq_cycle_covariates(read, secondinpair)[valid_positions] dinuccov = fastq_dinuc_covariates(read, minscore)[valid_positions] recalibrated_quals[valid_positions] = (meanq[rg] + globaldeltaq[rg] + qscoredeltaq[rg, qcov] + dinucdeltaq[rg, qcov, dinuccov] + positiondeltaq[rg, qcov, cycle]).astype(np.int) return recalibrated_quals
## Recalibrate reads from a BAM
[docs]def bamread_get_oq(read): #TODO: add an assert or logic to ensure OQ is present oq = np.array(list(read.get_tag('OQ')), dtype = np.unicode) #quals = pysam.qualitystring_to_array(read.get_tag('OQ')) #this may be faster quals = np.array(oq.view(np.uint32) - 33, dtype = np.uint32) return np.array(quals, dtype = np.int)
[docs]def get_rg_to_pu(bamfileobj): rg_to_pu = {rg['ID'] : rg['PU'] for rg in bamfileobj.header.as_dict()['RG']} return rg_to_pu
[docs]def nparray_to_qualitystring(a, offset = 33): return (a + offset).astype(np.uint32).tostring().decode('utf32')