Source code for kbbq.gatk.bqsr

"""
Utilities for emulating GATK's BQSR tool.

BQSR model construction hard clips soft clips and
trims adaptors. ApplyBQSR does not. So we need
different functions for each.
"""

import pysam
import numpy as np
import pandas as pd
import scipy.stats
from .. import compare_reads as utils
from .. import recaltable

###############################

# Covariate Functions

###############################


[docs]def bamread_bqsr_cycle(read):
    fullcycle = np.zeros(read.query_length, dtype = np.int) #full length
    cycle = utils.generic_cycle_covariate(read.query_alignment_length, read.is_read2) #excludes soft-clipped bases!
    #soft-clipped bases will be skipped in other code
    #so it's no problem that some cycles will stay at 0
    if read.is_reverse:
        cycle = np.flip(cycle)
    fullcycle[read.query_alignment_start:read.query_alignment_end] = cycle
    return fullcycle

[docs]def bamread_bqsr_dinuc(read, use_oq = True, minscore = 6):
    unclipped_start = read.query_alignment_start
    unclipped_end = read.query_alignment_end
    seq = read.query_sequence[unclipped_start:unclipped_end]
    quals = (utils.bamread_get_oq(read) if use_oq else np.array(read.query_qualities, dtype = np.int))
    quals = quals[unclipped_start:unclipped_end]
    if read.is_reverse:
        seq = ''.join([utils.Dinucleotide.complement.get(x,'N') for x in reversed(seq)])
        quals = np.flip(quals)
    fulldinuc = np.zeros(read.query_length, dtype = np.int)
    dinuccov = utils.generic_dinuc_covariate(
        np.array(list(seq), dtype = 'U1'),
        quals, minscore).copy()
    if read.is_reverse:
        # flip back to fwd coordinates
        dinuccov = np.flip(dinuccov)
    fulldinuc[unclipped_start:unclipped_end] = dinuccov
    return fulldinuc

[docs]def bam_to_bqsr_covariates(bamfileobj, fastafilename, var_pos, minscore = 6, maxscore = 42):
    """
    Given a BAM file object, FASTA reference file name and var_pos dict,
    get the standard covariate arrays.
    """
    rg_to_pu = utils.get_rg_to_pu(bamfileobj)
    nrgs = len(rg_to_pu.keys())
    rg_to_int = dict(zip(rg_to_pu, range(len(rg_to_pu))))
    fasta = pysam.FastaFile(fastafilename)
    #the below can probably be spun out to a function, i think we only use fullskips
    ref = {chrom : np.array(list(fasta.fetch(reference = chrom)), dtype = np.unicode) for chrom in fasta.references}
    varsites = {chrom : np.array(var_pos[chrom], dtype = np.int) for chrom in var_pos.keys()}
    fullskips = {chrom : np.zeros(len(ref[chrom]), dtype = np.bool) for chrom in ref.keys()}
    for chrom in fullskips.keys():
        variable_positions = varsites[chrom]
        fullskips[chrom][variable_positions] = True

    nreads = np.sum([s.total for s in bamfileobj.get_index_statistics()])
    read = next(bamfileobj)
    seqlen = len(read.query_qualities)

    rgs = np.zeros(seqlen, dtype = np.int_)
    meanq = np.zeros(nrgs, dtype = np.int_)
    expected_errs = np.zeros(nrgs, dtype = np.longdouble)
    rg_errs = np.zeros(nrgs, dtype = np.int_)
    rg_total = np.zeros(nrgs, dtype = np.int_)
    q_errs = np.zeros((nrgs, maxscore + 1), dtype = np.int_)
    q_total = np.zeros((nrgs, maxscore + 1), dtype = np.int_)
    pos_errs = np.zeros((nrgs, maxscore + 1, 2 * seqlen), dtype = np.int_)
    pos_total = np.zeros((nrgs, maxscore + 1, 2 * seqlen), dtype = np.int_)
    dinuc_errs = np.zeros((nrgs, maxscore + 1, 16), dtype = np.int_)
    dinuc_total = np.zeros((nrgs, maxscore + 1, 16), dtype = np.int_)
    
    try:
        while True:
            seq = read.query_sequence
            rgs[:] = rg_to_int[read.get_tag('RG')]
            errors, skips = utils.find_read_errors(read, ref, fullskips)
            q = utils.bamread_get_oq(read)
            pos = bamread_bqsr_cycle(read)
            dinucleotide = bamread_bqsr_dinuc(read)
            seq = np.array(list(seq), dtype = 'U1')
            trimmed = trim_bamread(read)
            
            skips[q < minscore] = True
            skips[trimmed] = True
            skips[seq == 'N'] = True

            valid = np.logical_not(skips)
            dinuc_valid = np.logical_and(dinucleotide != -1, valid)
            e_and_valid = np.logical_and(errors, valid)
            e_and_dvalid = np.logical_and(errors, dinuc_valid)

            rge = rgs[e_and_valid]
            rgv = rgs[valid]
            qe = q[e_and_valid]
            qv = q[valid]

            np.add.at(expected_errs, rgv, utils.q_to_p(qv))
            np.add.at(rg_errs, rge, 1)
            np.add.at(rg_total, rgv, 1)
            np.add.at(q_errs, (rge, qe), 1)
            np.add.at(q_total, (rgv, qv), 1)
            np.add.at(pos_errs, (rge, qe, pos[e_and_valid]), 1)
            np.add.at(pos_total, (rgv, qv, pos[valid]), 1)
            np.add.at(dinuc_errs, (rgs[e_and_dvalid], q[e_and_dvalid], dinucleotide[e_and_dvalid]), 1)
            np.add.at(dinuc_total, (rgs[dinuc_valid], q[dinuc_valid], dinucleotide[dinuc_valid]), 1)
            read = next(bamfileobj)
    except StopIteration:
        pass
    meanq = utils.p_to_q(expected_errs / rg_total)
    return meanq, rg_errs, rg_total, q_errs, q_total, pos_errs, pos_total, dinuc_errs, dinuc_total

#############################################

# Trimming Functions

#############################################

[docs]def bamread_adaptor_boundary(read):
    #https://github.com/broadinstitute/gatk/blob/43b2b3bd4e723552414b32b8b2a7341b81f1f688/src/main/java/org/broadinstitute/hellbender/utils/read/ReadUtils.java#L534
    #0-based in ref coordinates
    if ( read.tlen == 0 or
        not read.is_paired or
        read.is_unmapped or
        read.mate_is_unmapped or
        read.is_reverse == read.mate_is_reverse):
            return None
    if read.is_reverse:
        #next_reference_start is 1-based
        #reference_start is 0-based
        #reference_end is 0-based but points to 1 past the last base
        #   (so essentially it's 1-based)
        if (read.reference_end - 1) > (read.next_reference_start):
            #"well-formed" insert len
            return read.next_reference_start - 1
        else:
            return None
    else:
        if read.reference_start <= (read.next_reference_start + read.tlen):
            #"well-formed" insert len
            return read.reference_start + abs(read.tlen)
        else:
            return None


[docs]def trim_bamread(read):
    #https://github.com/broadinstitute/gatk/blob/b11abd12b7305767ed505a8ff644a63659abf2cd/src/main/java/org/broadinstitute/hellbender/utils/clipping/ReadClipper.java#L388
    #return an array of seqlen which includes bases to skip
    #next_reference_start is 0-based
    #reference_start is 0-based
    #reference_end is 0-based but points to 1 past the last base
    #   (so essentially it's 1-based)
    adaptor_boundary = bamread_adaptor_boundary(read)
    skips = np.zeros(len(read.query_qualities), dtype = np.bool)
    if adaptor_boundary is None:
        return skips
    else:
        if read.is_reverse:
            if adaptor_boundary >= read.reference_start:
                #clip from start (left)
                #we need to get the boundary in read coordinates rather than ref
                boundary_reached = False
                for readidx, refidx in reversed(read.get_aligned_pairs()):
                    if refidx is not None and refidx <= adaptor_boundary:
                        boundary_reached = True
                    if boundary_reached and readidx is not None:
                        adaptoridx = readidx + 1 #slice syntax
                        break
                else:
                    #couldn't find boundary
                    #I think this can only happen if the boundary lies in a
                    #deletion that covers the rest of the read.
                    adaptoridx = 0
                skips[:adaptoridx] = True #skip first x bases
            return skips
        else:
            if adaptor_boundary <= (read.reference_end - 1):
                #clip from end (right)
                #reference_end is 1 past the end
                #reference_end - 1 - adaptor_boundary + 1
                boundary_reached = False
                for readidx, refidx in read.get_aligned_pairs():
                    if refidx is not None and refidx >= adaptor_boundary:
                        boundary_reached = True
                    if boundary_reached and readidx is not None:
                        adaptoridx = readidx
                        break
                else:
                    #couldn't find boundary
                    #I think this can only happen if the boundary lies in a
                    #deletion that covers the rest of the read.
                    adaptoridx = len(skips)
                skips[adaptoridx:] = True #skip last x bases
            return skips

##################################################

# RecalibrationReport Creation Functions

##################################################

[docs]def quantize(q_errs, q_total, maxscore = 93):
    """
    This function doesn't match the GATK version, but
    it's not used so it's not a priority.
    """
    qe = np.sum(q_errs, axis = 0)
    qt = np.sum(q_total, axis = 0)
    unobserved = (qt == 0)
    quantizer = np.arange(maxscore + 1)
    quantizer[0:qt.shape[0]][unobserved] = maxscore
    quantizer[qt.shape[0]:] = maxscore
    return quantizer

[docs]def vectors_to_report(meanq, global_errs, global_total, q_errs, q_total,
    pos_errs, pos_total, dinuc_errs, dinuc_total, rg_order, maxscore = 42):
    """
    Turn the set of recalibration vectors into a
    :class:`kbbq.recaltable.RecalibrationReport` object.

    For the recalibration vectors, each dimension corresponds to a covariate.
    The first index is always the read group, and the second (if it exists)
    represents the raw quality score, the final index is either the cycle or
    dinucleotide covariate.

    :param meanq: Mean q for each read group
    :type meanq: :class:`numpy.ndarray` [:]
    :param global_errs: Number of errors for each read group
    :type global_errs: :class:`numpy.ndarray` [:]
    :param global_total: Number of observations for each read group
    :type global_total: :class:`numpy.ndarray` [:]
    :param q_errs: Number of errors for each read group and q score subset.
    :type q_errs: :class:`numpy.ndarray` [:,:]
    :param q_total: Number of observations for each read group and q score subset.
    :type q_total: :class:`numpy.ndarray` [:,:]
    :param pos_errs: Number of errors for each read group, q, and cycle subset.
    :type pos_errs: :class:`numpy.ndarray` [:,:,:]
    :param pos_total: Number of observations for each read group, q, and cycle subset.
    :type pos_total: :class:`numpy.ndarray` [:,:,:]
    :param dinuc_errs: Number of errors for each read group, q, and dinucleotide subset.
    :type dinuc_errs: :class:`numpy.ndarray` [:,:,:]
    :param dinuc_total: Number of observations for each read group, q, and dinucleotide
        subset.
    :type dinuc_total: :class:`numpy.ndarray` [:,:,:]
    :param rg_order: The order of read groups
    :type rg_order: list(str)
    :param int maxscore: The maximum possible quality score
    :return: the recalibration table
    :rtype: :class:`kbbq.recaltable.RecalibrationReport`
    """

    #these will be mostly default values, except quantization
    #which I don't attempt to implement.
    #I'm afraid bad things will happen if I don't include at least null values
    #for all the args so I'll just include them all.
    #This may need to be cleaned up later.

    args = {
        'binary_tag_name' : 'null',
        'covariate' : 'ReadGroupCovariate,QualityScoreCovariate,ContextCovariate,CycleCovariate',
        'default_platform' : 'null',
        'deletions_default_quality' : '45',
        'force_platform' : 'null',
        'indels_context_size' : '3',
        'insertions_default_quality' : '45',
        'low_quality_tail' : '2',
        'maximum_cycle_value' : '500',
        'mismatches_context_size' : '2',
        'mismatches_default_quality' : '-1',
        'no_standard_covs' : 'false',
        'quantizing_levels' : '16',
        'recalibration_report' : 'null',
        'run_without_dbsnp' : 'false',
        'solid_nocall_strategy' : 'THROW_EXCEPTION',
        'solid_recal_mode' : 'SET_Q_ZERO'
        }
    argdata = {'Argument' : list(args.keys()),
    'Value' : list(args.values())
    }
    argtable = pd.DataFrame(data = argdata)

    rg_est_q = -10.0 * np.log10(np.sum(utils.q_to_p(np.arange(q_total.shape[1])) * q_total, axis = 1) / global_total).round(decimals = 5).astype(np.float)
    rg_est_q[np.isnan(rg_est_q)] = 0
    rgdata = {'ReadGroup' : rg_order,
        'EventType' : 'M',
        'EmpiricalQuality' : (utils.gatk_delta_q(rg_est_q, global_errs.copy(), global_total.copy()) + rg_est_q).astype(np.float),
        'EstimatedQReported' : rg_est_q,
        'Observations' : global_total,
        'Errors' : global_errs.astype(np.float)
        }
    rgtable = pd.DataFrame(data = rgdata)
    rgtable = rgtable[rgtable.Observations != 0]

    qualscore = np.broadcast_to(np.arange(q_total.shape[1]), (q_total.shape)).copy()
    qualdata = {'ReadGroup' : np.repeat(rg_order, q_total.shape[1]),
        'QualityScore' : qualscore.flatten(),
        'EventType' : np.broadcast_to('M', (q_total.shape)).flatten(),
        'EmpiricalQuality' : (utils.gatk_delta_q(qualscore.flatten(), q_errs.flatten(), q_total.flatten()) + qualscore.flatten()).astype(np.float),
        'Observations' : q_total.flatten(),
        'Errors' : q_errs.flatten().astype(np.float)
        }
    qualtable = pd.DataFrame(data = qualdata)
    qualtable = qualtable[qualtable.Observations != 0]

    #no quantization, but still have to make the quantization table
    #TODO: actual quant algo
    quantscores = np.arange(94)
    qcount = np.zeros(quantscores.shape)
    qcount[qualscore[0,]] = np.sum(q_total, axis = 0)
    quantized = quantize(q_errs, q_total) #TODO: actually quantize
    quantdata = {'QualityScore' : quantscores,
        'Count' : qcount,
        'QuantizedScore' : quantized
        }
    quanttable = pd.DataFrame(data = quantdata)

    dinuc_q = np.repeat(np.broadcast_to(np.arange(dinuc_total.shape[1]), (dinuc_total.shape[0:2])), dinuc_total.shape[2])
    dinuc_to_int = utils.Dinucleotide.dinuc_to_int
    covtable_colorder = ['ReadGroup','QualityScore','CovariateName','CovariateValue']
    dinucdata = {'ReadGroup' : np.repeat(rg_order, np.prod(dinuc_total.shape[1:])),
        'QualityScore' : dinuc_q.flatten(),
        'CovariateValue' : np.broadcast_to(np.array(utils.Dinucleotide.dinucs), dinuc_total.shape).flatten(),
        'CovariateName' : np.broadcast_to('Context', dinuc_total.shape).flatten(),
        'EventType' : np.broadcast_to('M',dinuc_total.shape).flatten(),
        'EmpiricalQuality' : (utils.gatk_delta_q(dinuc_q.flatten(), dinuc_errs.flatten(), dinuc_total.flatten()) + dinuc_q.flatten()).astype(np.float),
        'Observations' : dinuc_total.flatten(),
        'Errors' : dinuc_errs.flatten().astype(np.float)
        }
    dinuctable = pd.DataFrame(data = dinucdata)
    dinuctable = dinuctable[dinuctable.Observations != 0]

    cycle_q = np.repeat(np.broadcast_to(np.arange(pos_total.shape[1]), (pos_total.shape[0:2])), pos_total.shape[2])
    ncycles = pos_total.shape[2] / 2
    cycle_values = np.concatenate([np.arange(ncycles) + 1, np.flip(-(np.arange(ncycles)+1),axis=0)]).astype(np.int)
    cycledata = {'ReadGroup' : np.repeat(rg_order, np.prod(pos_total.shape[1:])).flatten(),
        'QualityScore' : cycle_q.flatten(),
        'CovariateValue' : np.broadcast_to(cycle_values, pos_total.shape).astype(np.unicode).flatten(),
        'CovariateName' : np.broadcast_to('Cycle',pos_total.shape).flatten(),
        'EventType' : np.broadcast_to('M',pos_total.shape).flatten(),
        'EmpiricalQuality' : (utils.gatk_delta_q(cycle_q.flatten(), pos_errs.flatten(), pos_total.flatten()) + cycle_q.flatten()).astype(np.float),
        'Observations' : pos_total.flatten(),
        'Errors' : pos_errs.flatten().astype(np.float)
        }
    cycletable = pd.DataFrame(data = cycledata)
    covariatetable = dinuctable.append(cycletable)
    covariatetable = covariatetable.set_index(covtable_colorder)
    covariatetable = covariatetable[covariatetable.Observations != 0]
    covariatetable = covariatetable.swaplevel('CovariateValue','CovariateName')
    covariatetable = covariatetable.sort_index(level = 0, sort_remaining = True)
    covariatetable = covariatetable.reset_index()
    #we do this to fix ordering because concatenating the tables ruins it

    titles = ['Arguments','Quantized','RecalTable0','RecalTable1','RecalTable2']
    descriptions = ['Recalibration argument collection values used in this run',
        'Quality quantization map', '' , '' , '']
    gatktables = [recaltable.GATKTable(title, desc, table) for title, desc, table in \
        zip(titles, descriptions, [argtable, quanttable, rgtable, qualtable, covariatetable])]

    return recaltable.RecalibrationReport(gatktables)

[docs]def bam_to_report(bamfileobj, fastafilename, var_pos):
    rgs = list(utils.get_rg_to_pu(bamfileobj).values())
    *vectors, = bam_to_bqsr_covariates(bamfileobj, fastafilename, var_pos)
    return vectors_to_report(*vectors, rgs)