"""
Utilities for emulating GATK's BQSR tool.
BQSR model construction hard clips soft clips and
trims adaptors. ApplyBQSR does not. So we need
different functions for each.
"""
import pysam
import numpy as np
import pandas as pd
import scipy.stats
from .. import compare_reads as utils
from .. import recaltable
###############################
# Covariate Functions
###############################
[docs]def bamread_bqsr_cycle(read):
fullcycle = np.zeros(read.query_length, dtype = np.int) #full length
cycle = utils.generic_cycle_covariate(read.query_alignment_length, read.is_read2) #excludes soft-clipped bases!
#soft-clipped bases will be skipped in other code
#so it's no problem that some cycles will stay at 0
if read.is_reverse:
cycle = np.flip(cycle)
fullcycle[read.query_alignment_start:read.query_alignment_end] = cycle
return fullcycle
[docs]def bamread_bqsr_dinuc(read, use_oq = True, minscore = 6):
unclipped_start = read.query_alignment_start
unclipped_end = read.query_alignment_end
seq = read.query_sequence[unclipped_start:unclipped_end]
quals = (utils.bamread_get_oq(read) if use_oq else np.array(read.query_qualities, dtype = np.int))
quals = quals[unclipped_start:unclipped_end]
if read.is_reverse:
seq = ''.join([utils.Dinucleotide.complement.get(x,'N') for x in reversed(seq)])
quals = np.flip(quals)
fulldinuc = np.zeros(read.query_length, dtype = np.int)
dinuccov = utils.generic_dinuc_covariate(
np.array(list(seq), dtype = 'U1'),
quals, minscore).copy()
if read.is_reverse:
# flip back to fwd coordinates
dinuccov = np.flip(dinuccov)
fulldinuc[unclipped_start:unclipped_end] = dinuccov
return fulldinuc
[docs]def bam_to_bqsr_covariates(bamfileobj, fastafilename, var_pos, minscore = 6, maxscore = 42):
"""
Given a BAM file object, FASTA reference file name and var_pos dict,
get the standard covariate arrays.
"""
rg_to_pu = utils.get_rg_to_pu(bamfileobj)
nrgs = len(rg_to_pu.keys())
rg_to_int = dict(zip(rg_to_pu, range(len(rg_to_pu))))
fasta = pysam.FastaFile(fastafilename)
#the below can probably be spun out to a function, i think we only use fullskips
ref = {chrom : np.array(list(fasta.fetch(reference = chrom)), dtype = np.unicode) for chrom in fasta.references}
varsites = {chrom : np.array(var_pos[chrom], dtype = np.int) for chrom in var_pos.keys()}
fullskips = {chrom : np.zeros(len(ref[chrom]), dtype = np.bool) for chrom in ref.keys()}
for chrom in fullskips.keys():
variable_positions = varsites[chrom]
fullskips[chrom][variable_positions] = True
nreads = np.sum([s.total for s in bamfileobj.get_index_statistics()])
read = next(bamfileobj)
seqlen = len(read.query_qualities)
rgs = np.zeros(seqlen, dtype = np.int_)
meanq = np.zeros(nrgs, dtype = np.int_)
expected_errs = np.zeros(nrgs, dtype = np.longdouble)
rg_errs = np.zeros(nrgs, dtype = np.int_)
rg_total = np.zeros(nrgs, dtype = np.int_)
q_errs = np.zeros((nrgs, maxscore + 1), dtype = np.int_)
q_total = np.zeros((nrgs, maxscore + 1), dtype = np.int_)
pos_errs = np.zeros((nrgs, maxscore + 1, 2 * seqlen), dtype = np.int_)
pos_total = np.zeros((nrgs, maxscore + 1, 2 * seqlen), dtype = np.int_)
dinuc_errs = np.zeros((nrgs, maxscore + 1, 16), dtype = np.int_)
dinuc_total = np.zeros((nrgs, maxscore + 1, 16), dtype = np.int_)
try:
while True:
seq = read.query_sequence
rgs[:] = rg_to_int[read.get_tag('RG')]
errors, skips = utils.find_read_errors(read, ref, fullskips)
q = utils.bamread_get_oq(read)
pos = bamread_bqsr_cycle(read)
dinucleotide = bamread_bqsr_dinuc(read)
seq = np.array(list(seq), dtype = 'U1')
trimmed = trim_bamread(read)
skips[q < minscore] = True
skips[trimmed] = True
skips[seq == 'N'] = True
valid = np.logical_not(skips)
dinuc_valid = np.logical_and(dinucleotide != -1, valid)
e_and_valid = np.logical_and(errors, valid)
e_and_dvalid = np.logical_and(errors, dinuc_valid)
rge = rgs[e_and_valid]
rgv = rgs[valid]
qe = q[e_and_valid]
qv = q[valid]
np.add.at(expected_errs, rgv, utils.q_to_p(qv))
np.add.at(rg_errs, rge, 1)
np.add.at(rg_total, rgv, 1)
np.add.at(q_errs, (rge, qe), 1)
np.add.at(q_total, (rgv, qv), 1)
np.add.at(pos_errs, (rge, qe, pos[e_and_valid]), 1)
np.add.at(pos_total, (rgv, qv, pos[valid]), 1)
np.add.at(dinuc_errs, (rgs[e_and_dvalid], q[e_and_dvalid], dinucleotide[e_and_dvalid]), 1)
np.add.at(dinuc_total, (rgs[dinuc_valid], q[dinuc_valid], dinucleotide[dinuc_valid]), 1)
read = next(bamfileobj)
except StopIteration:
pass
meanq = utils.p_to_q(expected_errs / rg_total)
return meanq, rg_errs, rg_total, q_errs, q_total, pos_errs, pos_total, dinuc_errs, dinuc_total
#############################################
# Trimming Functions
#############################################
[docs]def bamread_adaptor_boundary(read):
#https://github.com/broadinstitute/gatk/blob/43b2b3bd4e723552414b32b8b2a7341b81f1f688/src/main/java/org/broadinstitute/hellbender/utils/read/ReadUtils.java#L534
#0-based in ref coordinates
if ( read.tlen == 0 or
not read.is_paired or
read.is_unmapped or
read.mate_is_unmapped or
read.is_reverse == read.mate_is_reverse):
return None
if read.is_reverse:
#next_reference_start is 1-based
#reference_start is 0-based
#reference_end is 0-based but points to 1 past the last base
# (so essentially it's 1-based)
if (read.reference_end - 1) > (read.next_reference_start):
#"well-formed" insert len
return read.next_reference_start - 1
else:
return None
else:
if read.reference_start <= (read.next_reference_start + read.tlen):
#"well-formed" insert len
return read.reference_start + abs(read.tlen)
else:
return None
[docs]def trim_bamread(read):
#https://github.com/broadinstitute/gatk/blob/b11abd12b7305767ed505a8ff644a63659abf2cd/src/main/java/org/broadinstitute/hellbender/utils/clipping/ReadClipper.java#L388
#return an array of seqlen which includes bases to skip
#next_reference_start is 0-based
#reference_start is 0-based
#reference_end is 0-based but points to 1 past the last base
# (so essentially it's 1-based)
adaptor_boundary = bamread_adaptor_boundary(read)
skips = np.zeros(len(read.query_qualities), dtype = np.bool)
if adaptor_boundary is None:
return skips
else:
if read.is_reverse:
if adaptor_boundary >= read.reference_start:
#clip from start (left)
#we need to get the boundary in read coordinates rather than ref
boundary_reached = False
for readidx, refidx in reversed(read.get_aligned_pairs()):
if refidx is not None and refidx <= adaptor_boundary:
boundary_reached = True
if boundary_reached and readidx is not None:
adaptoridx = readidx + 1 #slice syntax
break
else:
#couldn't find boundary
#I think this can only happen if the boundary lies in a
#deletion that covers the rest of the read.
adaptoridx = 0
skips[:adaptoridx] = True #skip first x bases
return skips
else:
if adaptor_boundary <= (read.reference_end - 1):
#clip from end (right)
#reference_end is 1 past the end
#reference_end - 1 - adaptor_boundary + 1
boundary_reached = False
for readidx, refidx in read.get_aligned_pairs():
if refidx is not None and refidx >= adaptor_boundary:
boundary_reached = True
if boundary_reached and readidx is not None:
adaptoridx = readidx
break
else:
#couldn't find boundary
#I think this can only happen if the boundary lies in a
#deletion that covers the rest of the read.
adaptoridx = len(skips)
skips[adaptoridx:] = True #skip last x bases
return skips
##################################################
# RecalibrationReport Creation Functions
##################################################
[docs]def quantize(q_errs, q_total, maxscore = 93):
"""
This function doesn't match the GATK version, but
it's not used so it's not a priority.
"""
qe = np.sum(q_errs, axis = 0)
qt = np.sum(q_total, axis = 0)
unobserved = (qt == 0)
quantizer = np.arange(maxscore + 1)
quantizer[0:qt.shape[0]][unobserved] = maxscore
quantizer[qt.shape[0]:] = maxscore
return quantizer
[docs]def vectors_to_report(meanq, global_errs, global_total, q_errs, q_total,
pos_errs, pos_total, dinuc_errs, dinuc_total, rg_order, maxscore = 42):
"""
Turn the set of recalibration vectors into a
:class:`kbbq.recaltable.RecalibrationReport` object.
For the recalibration vectors, each dimension corresponds to a covariate.
The first index is always the read group, and the second (if it exists)
represents the raw quality score, the final index is either the cycle or
dinucleotide covariate.
:param meanq: Mean q for each read group
:type meanq: :class:`numpy.ndarray` [:]
:param global_errs: Number of errors for each read group
:type global_errs: :class:`numpy.ndarray` [:]
:param global_total: Number of observations for each read group
:type global_total: :class:`numpy.ndarray` [:]
:param q_errs: Number of errors for each read group and q score subset.
:type q_errs: :class:`numpy.ndarray` [:,:]
:param q_total: Number of observations for each read group and q score subset.
:type q_total: :class:`numpy.ndarray` [:,:]
:param pos_errs: Number of errors for each read group, q, and cycle subset.
:type pos_errs: :class:`numpy.ndarray` [:,:,:]
:param pos_total: Number of observations for each read group, q, and cycle subset.
:type pos_total: :class:`numpy.ndarray` [:,:,:]
:param dinuc_errs: Number of errors for each read group, q, and dinucleotide subset.
:type dinuc_errs: :class:`numpy.ndarray` [:,:,:]
:param dinuc_total: Number of observations for each read group, q, and dinucleotide
subset.
:type dinuc_total: :class:`numpy.ndarray` [:,:,:]
:param rg_order: The order of read groups
:type rg_order: list(str)
:param int maxscore: The maximum possible quality score
:return: the recalibration table
:rtype: :class:`kbbq.recaltable.RecalibrationReport`
"""
#these will be mostly default values, except quantization
#which I don't attempt to implement.
#I'm afraid bad things will happen if I don't include at least null values
#for all the args so I'll just include them all.
#This may need to be cleaned up later.
args = {
'binary_tag_name' : 'null',
'covariate' : 'ReadGroupCovariate,QualityScoreCovariate,ContextCovariate,CycleCovariate',
'default_platform' : 'null',
'deletions_default_quality' : '45',
'force_platform' : 'null',
'indels_context_size' : '3',
'insertions_default_quality' : '45',
'low_quality_tail' : '2',
'maximum_cycle_value' : '500',
'mismatches_context_size' : '2',
'mismatches_default_quality' : '-1',
'no_standard_covs' : 'false',
'quantizing_levels' : '16',
'recalibration_report' : 'null',
'run_without_dbsnp' : 'false',
'solid_nocall_strategy' : 'THROW_EXCEPTION',
'solid_recal_mode' : 'SET_Q_ZERO'
}
argdata = {'Argument' : list(args.keys()),
'Value' : list(args.values())
}
argtable = pd.DataFrame(data = argdata)
rg_est_q = -10.0 * np.log10(np.sum(utils.q_to_p(np.arange(q_total.shape[1])) * q_total, axis = 1) / global_total).round(decimals = 5).astype(np.float)
rg_est_q[np.isnan(rg_est_q)] = 0
rgdata = {'ReadGroup' : rg_order,
'EventType' : 'M',
'EmpiricalQuality' : (utils.gatk_delta_q(rg_est_q, global_errs.copy(), global_total.copy()) + rg_est_q).astype(np.float),
'EstimatedQReported' : rg_est_q,
'Observations' : global_total,
'Errors' : global_errs.astype(np.float)
}
rgtable = pd.DataFrame(data = rgdata)
rgtable = rgtable[rgtable.Observations != 0]
qualscore = np.broadcast_to(np.arange(q_total.shape[1]), (q_total.shape)).copy()
qualdata = {'ReadGroup' : np.repeat(rg_order, q_total.shape[1]),
'QualityScore' : qualscore.flatten(),
'EventType' : np.broadcast_to('M', (q_total.shape)).flatten(),
'EmpiricalQuality' : (utils.gatk_delta_q(qualscore.flatten(), q_errs.flatten(), q_total.flatten()) + qualscore.flatten()).astype(np.float),
'Observations' : q_total.flatten(),
'Errors' : q_errs.flatten().astype(np.float)
}
qualtable = pd.DataFrame(data = qualdata)
qualtable = qualtable[qualtable.Observations != 0]
#no quantization, but still have to make the quantization table
#TODO: actual quant algo
quantscores = np.arange(94)
qcount = np.zeros(quantscores.shape)
qcount[qualscore[0,]] = np.sum(q_total, axis = 0)
quantized = quantize(q_errs, q_total) #TODO: actually quantize
quantdata = {'QualityScore' : quantscores,
'Count' : qcount,
'QuantizedScore' : quantized
}
quanttable = pd.DataFrame(data = quantdata)
dinuc_q = np.repeat(np.broadcast_to(np.arange(dinuc_total.shape[1]), (dinuc_total.shape[0:2])), dinuc_total.shape[2])
dinuc_to_int = utils.Dinucleotide.dinuc_to_int
covtable_colorder = ['ReadGroup','QualityScore','CovariateName','CovariateValue']
dinucdata = {'ReadGroup' : np.repeat(rg_order, np.prod(dinuc_total.shape[1:])),
'QualityScore' : dinuc_q.flatten(),
'CovariateValue' : np.broadcast_to(np.array(utils.Dinucleotide.dinucs), dinuc_total.shape).flatten(),
'CovariateName' : np.broadcast_to('Context', dinuc_total.shape).flatten(),
'EventType' : np.broadcast_to('M',dinuc_total.shape).flatten(),
'EmpiricalQuality' : (utils.gatk_delta_q(dinuc_q.flatten(), dinuc_errs.flatten(), dinuc_total.flatten()) + dinuc_q.flatten()).astype(np.float),
'Observations' : dinuc_total.flatten(),
'Errors' : dinuc_errs.flatten().astype(np.float)
}
dinuctable = pd.DataFrame(data = dinucdata)
dinuctable = dinuctable[dinuctable.Observations != 0]
cycle_q = np.repeat(np.broadcast_to(np.arange(pos_total.shape[1]), (pos_total.shape[0:2])), pos_total.shape[2])
ncycles = pos_total.shape[2] / 2
cycle_values = np.concatenate([np.arange(ncycles) + 1, np.flip(-(np.arange(ncycles)+1),axis=0)]).astype(np.int)
cycledata = {'ReadGroup' : np.repeat(rg_order, np.prod(pos_total.shape[1:])).flatten(),
'QualityScore' : cycle_q.flatten(),
'CovariateValue' : np.broadcast_to(cycle_values, pos_total.shape).astype(np.unicode).flatten(),
'CovariateName' : np.broadcast_to('Cycle',pos_total.shape).flatten(),
'EventType' : np.broadcast_to('M',pos_total.shape).flatten(),
'EmpiricalQuality' : (utils.gatk_delta_q(cycle_q.flatten(), pos_errs.flatten(), pos_total.flatten()) + cycle_q.flatten()).astype(np.float),
'Observations' : pos_total.flatten(),
'Errors' : pos_errs.flatten().astype(np.float)
}
cycletable = pd.DataFrame(data = cycledata)
covariatetable = dinuctable.append(cycletable)
covariatetable = covariatetable.set_index(covtable_colorder)
covariatetable = covariatetable[covariatetable.Observations != 0]
covariatetable = covariatetable.swaplevel('CovariateValue','CovariateName')
covariatetable = covariatetable.sort_index(level = 0, sort_remaining = True)
covariatetable = covariatetable.reset_index()
#we do this to fix ordering because concatenating the tables ruins it
titles = ['Arguments','Quantized','RecalTable0','RecalTable1','RecalTable2']
descriptions = ['Recalibration argument collection values used in this run',
'Quality quantization map', '' , '' , '']
gatktables = [recaltable.GATKTable(title, desc, table) for title, desc, table in \
zip(titles, descriptions, [argtable, quanttable, rgtable, qualtable, covariatetable])]
return recaltable.RecalibrationReport(gatktables)
[docs]def bam_to_report(bamfileobj, fastafilename, var_pos):
rgs = list(utils.get_rg_to_pu(bamfileobj).values())
*vectors, = bam_to_bqsr_covariates(bamfileobj, fastafilename, var_pos)
return vectors_to_report(*vectors, rgs)