#!usr/bin/env python3
"""
Provides utilities for interfacing with GATK Recalibration tables.
"""
import numpy as np
import pandas as pd
[docs]class GATKReport:
"""
A base class representing a GATK Report file that contains many tables.
Attributes
* :attr:`tables` - a list of GATKTable objects
* :attr:`version` - the GATKReport format version. The default is
``'1.1'``.
Methods
* :meth:`write` - write the report to a file
* :meth:`fromfile` - read the report from a file
* :meth:`get_headerstring` - get the header line.
Does not include a newline.
"""
[docs] def __init__(self, tables, version = '1.1'):
"""
Initialize a GATKReport.
This module was written by reverse engineering a version 1.1 report.
If you have a table with a newer version, please file a bug report,
especially if this module fails to parse it.
:param list(GATKTable) tables: a list of tables.
:param str version: The report version as a string.
:returns: a representation of a report
:rtype: :class:`.GATKReport`
"""
self.tables = tables
"""A list of :class:`.GATKTable` objects."""
self.version = version
"""The report version identifier (such as ``'1.1'``)"""
[docs] @classmethod
def fromfile(cls, filename):
"""
Initialize the GATKReport from a file.
:param str filename: The file to be read.
:returns: a representation of a report
:rtype: :class:`.GATKReport`
"""
with open(filename) as fh:
fullheader = fh.readline()
_, version, ntables = fullheader.strip().split(':')
version = version.split(sep = 'v', maxsplit = 1)[-1]
table_strings = fh.read().split('\n\n')
tables = [GATKTable.fromstring(s) for s in table_strings if s != '']
if (len(tables) != int(ntables)):
raise ValueError(f"Malformed or truncated file {filename}.\
The header ({fullheader}) implies there should be\
{ntables} tables in this report, but we only found\
{len(tables)}. If you are sure the file is intact,\
modify the header line and try again.")
return cls(tables, version)
[docs] def write(self, filename):
"""
Write the report to filename.
:param str filename: The file name to write.
"""
with open(filename, 'w') as fh:
fh.write(str(self))
[docs] def __str__(self):
"""
A string containing the report.
Each table is separated by two newlines.
"""
return self.get_headerstring() + '\n' + \
'\n\n'.join([str(t) for t in self.tables] + [''])
def __repr__(self):
return self.get_headerstring() + '\n' + \
'\n'.join([repr(t) for t in self.tables]) + '\n'
def __eq__(self, other):
"""
Test whether two reports are equal. Note that the order of tables
is significant, and this function will return false if the tables
are not in the same order.
"""
if type(other) is type(self):
if self.version != other.version:
return False
elif len(self.tables) != len(other.tables):
return False
else:
for s, o in zip(self.tables, other.tables):
if not s == o:
return False
else:
return True
else:
return NotImplemented
[docs]class GATKTable:
"""
A class representing a GATK report table.
The intended method of interacting with the table is using the data
attribute, which is a standard Pandas dataframe.
Attributes
* :attr:`title` - The title of the table
* :attr:`description` - A description of the table. May be ``''``.
* :attr:`data` - A Pandas dataframe containing the table data. Accessing this
attribute is the primary way to interact with the data.
* :attr:`typemap` - A dict mapping types to a character to be used
for constructing the format string.
* :attr:`precisionmap` - A dict mapping column headers to precision
to be used for constructing the format string.
Methods
* :meth:`fromstring` - Initialize a :class:`GATKTable` from a string.
* :meth:`parse_fmtstring` - Parse a fmtstring into a dictionary of types
* :meth:`get_fmtstring` - Get a formatstring by inspecting the dataframe.
* :meth:`get_unindexed` - Get a copy of the dataframe with indices reset
* :meth:`get_colfmts` - Get a list of format strings for the data in
the dataframe
* :meth:`get_titlestring` - Get the title string
* :meth:`get_datastring` - Format the dataframe into a string
* :meth:`get_nrows` - Get the number of rows in the dataframe.
* :meth:`get_ncols` - Get the number of columns in the dataframe.
* :meth:`write` - Write the table to a filehandle.
"""
[docs] def __init__(self, title, description, data):
"""
Initialize the table.
:param str title: the table title
:param str description: a short description of the table. May be ``''``
:param pandas.DataFrame data: the data contained by the table
"""
self.title = title
"""The title of the table."""
self.description = description
"""A short description of the table."""
self.data = data
"""
A Pandas dataframe containing the table data. Accessing this
attribute is the primary way to interact with the data.
"""
self.typemap = {np.dtype(np.int) : 'd', np.dtype(np.float) : 'f', str : 's', np.dtype(np.object) : 's'}
"""
A dictionary of `{type : str}` to determine the string for each type.
Note that in the current implementation this only affects
:meth:`get_fmtstring`.
"""
self.precisionmap = {'EmpiricalQuality' : '.4', 'EstimatedQReported' : '.4', 'Errors' : '.2'}
"""
A dictionary of `{colname : str}` to determine the precision string to prepend
to each formatstring. This only affects :meth:`get_fmtstring`.
"""
[docs] @classmethod
def fromstring(cls, tablestring):
"""
Initialize the table from a string
The first line of the table contains the a string describing the table
format and is like ``#:GATKTable:ncol:nrow:%f:%f:%f:;``. It is ``:``
delimited and ``;`` terminated. The last format string will be empty.
The second line of the table specifies the name of the table. It is
formatted like ``#:GATKTable:title:description``. :attr:`description`
can be an empty string. It is ``:`` delimited. The third line contains
the header. The header and row data are delimited by two or more spaces.
The number of spaces are determined by padding each value in the row
such that the the character width of the column is constant and can
represent all the data in the column. Strings (including column
headings) are left justified and numeric types are right justified.
"""
rows = tablestring.splitlines()
title, description = rows[1].split(':')[2:4]
header = rows[2].split()
typedict = cls.parse_fmtstring(header, rows[0])
strdata = [s.split() for s in rows[3:]]
if strdata != []:
d = dict(zip(header, zip(*strdata))) #dictionary {colname : coldata}
else:
d = dict(zip(header,[[]] * len(header)))
data = pd.DataFrame(d).astype(typedict)
return cls(title, description, data)
[docs] @staticmethod
def parse_fmtstring(header, fmtstring):
"""
Turn a fmtstring into a dictionary of `{col_title : type}`
:param list(str) header: The column titles as a list
:param str fmtstring: The format string to parse
:return: A dictionary mapping `{col_title : type}`
:rtype: dict(:class:`str`, :obj:`type`)
"""
splitfmt = fmtstring.split(':')
fmtstrings = splitfmt[4:-1]
typedict = {}
for i,h in enumerate(header):
f = fmtstrings[i]
if f.endswith('d'):
t = np.int64
elif f.endswith('f'):
t = np.float64
elif f.endswith('s'):
t = str #pandas will convert this to 'object'
else:
t = None
if t is not None:
typedict[h] = t
return typedict
[docs] def get_fmtstring(self):
"""
Get the format string by inspecting the dataframe.
Uses the dtypes, number of rows, and number of columns in the data
frame along with the strings returned by :func:`get_colfmts` to
create the format string
which is formatted like ``#:GATKTable:ncol:nrow:%f:%f:%f:;``.
:return: The format string
:rtype: str
"""
fmtlist = ['#', 'GATKTable', str(self.get_ncols()), str(self.get_nrows())]
fmtlist = fmtlist + self.get_colfmts()
# for f in self.get_colfmts():
# fmtlist.append(f)
fmtlist.append(';')
return ':'.join(fmtlist)
[docs] def get_unindexed(self):
"""
Return a copy of the dataframe without the indexes included.
If the table has an unnamed index, such as the default index,
return a copy of the data frame. Otherwise, reset the index.
"""
if not self.data.index.names == [None]:
#only reset if we have a meaningful column name for this index
return self.data.reset_index()
else:
return self.data.copy()
[docs] def get_colfmts(self):
"""
Get the formats for each column in the dataframe.
Uses :attr:`typemap` and :attr:`precisionmap` to
create the strings. This will return something like
``['%s', '%s', '%.2f', '%.4f']``
:return: format strings for each column
:rtype: list(str)
"""
fmtlist = []
unindexed = self.get_unindexed()
types = unindexed.dtypes
heads = unindexed.columns.to_list()
for t,h in zip(types, heads):
colfmtstr = '%' + self.precisionmap.get(h,'') + self.typemap[t]
fmtlist.append(colfmtstr)
return fmtlist
[docs] def get_titlestring(self):
"""
Get the titlestring.
Uses the :attr:`title` and :attr:`description` to create the title
line. It is formatted like ``#:GATKTable:title:subtitle.``
:return: the title string
:rtype: str
"""
titlelist = ['#', 'GATKTable', self.title, self.description]
return ':'.join(titlelist)
[docs] def get_datastring(self):
"""
Get the data string. Includes the header line and the data in
the table.
:return: the data table as a formatted string
:rtype: str
"""
fmtstrings = self.get_colfmts()
unindexed = self.get_unindexed()
header = unindexed.columns.to_list()
fmtfuncs = { h : lambda x: f % x for h,f in zip(header, fmtstrings)}
formatted = unindexed.copy()
for i, (h, items) in enumerate(unindexed.items()):
if fmtstrings[i] != '%s':
formatted[h] = items.apply(fmtfuncs[h])
datawidths = formatted.apply(lambda x: x.str.len().max()).to_numpy()
headwidths = np.array([len(x) for x in header])
if len(datawidths) != 0:
colwidths = np.maximum(datawidths, headwidths)
else:
colwidths = headwidths
datastr = ' '.join([h.ljust(colwidths[i]) for i,h in enumerate(header)])
for row in unindexed.itertuples(index = False):
line = [getattr(row,header[i]).ljust(colwidths[i]) if \
f == '%s' else \
(f % float(getattr(row,header[i]))).rjust(colwidths[i]) \
for i,f in enumerate(fmtstrings)]
datastr = datastr + '\n' + ' '.join(line)
return datastr
[docs] def get_nrows(self):
"""
Get the number of rows in the dataframe.
:return: The number of rows in the dataframe.
:rtype: int
"""
return self.get_unindexed().shape[0]
[docs] def get_ncols(self):
"""
Get the number of columns in the dataframe.
:return: The number of columns in the dataframe.
:rtype: int
"""
return self.get_unindexed().shape[1]
[docs] def write(self, filehandle):
"""
Write the table to a filehandle.
Adds a newline to the end that isn't present in :meth:`__str__`
:param filehandle: The file object to write the table to.
:type filehandle: File Object
:return: Whatever the file object's write method returns.
Usually the number of characters written.
:rtype: Usually int
"""
return filehandle.write(str(self) + '\n')
[docs] def __str__(self):
return self.get_fmtstring() + '\n' + self.get_titlestring() + '\n' + self.get_datastring()
def __repr__(self):
return self.get_fmtstring() + '\n' + self.get_titlestring() + '\n' + repr(self.data)
def __eq__(self, other):
"""
Note: the pandas df.data.equals function is VERY unforgiving.
"""
if type(other) is type(self):
return self.title == other.title and \
self.description == other.description and \
self.data.equals(other.data)
else:
return NotImplemented
[docs]class RecalibrationReport(GATKReport):
"""
A class representing a GATK Recalibration Report used for Base Quality
Score Recalibration.
It extends the :class:`GATKReport`, so the primary
method of interacting with the table is the :attr:`tables` attribute.
The primary difference is the :func:`__init__` function will set indices
and data types as they should be for a recalibration report.
* Table :attr:`tables[0]` contains the arguments used to call the program
that generated the table.
* Table :attr:`tables[1]` contains the quanization map
* Table :attr:`tables[2]` contains read group error information.
* Table :attr:`tables[3]` contains error information subset by read group,
then reported quality score.
* Table :attr:`tables[4]` contains error information subset by read group,
then reported quality score, then covariate name and value. Both
Context and Cycle are valid for the covariate name.
See the table below for a schema.
===== ================= ===========================================
Table Number of Columns Variables
===== ================= ===========================================
0 2 Argument, Value
1 3 QualityScore, Count, QuantizedScore
2 6 ReadGroup, EventType, EmpiricalQuality,
EstimatedQReported, Observations, Errors
3 6 ReadGroup, QualityScore, EventType,
EmpiricalQuality, Observations, Errors
4 8 ReadGroup, QualityScore, CovariateValue,
CovariateName, EventType, EmpiricalQuality,
Observations, Errors
===== ================= ===========================================
For current GATK parameters, EventType will be ``M`` for mismatch though
``I`` for insertion and ``D`` for deletion are possible. Valid values for
CovariateName are ``Context`` and ``Cycle``.
"""
[docs] def __init__(self, tables, version = '1.1'):
"""
Initialize the recalibration report from a list of :class:`GATKTable` s.
This initializes the tables with some data wrangling to set indices
and data types properly. If you don't want this, use a
:class:`.GATKReport` instead.
"""
super().__init__(tables, version)
if len(self.tables) != 5:
raise ValueError(f"""A RecalibrationReport should have 5 tables.
This report contains {len(self.tables)}.""")
#self.data[0] is argument / value
assert self.tables[0].title == 'Arguments'
#self.data[1] is quantization map
assert self.tables[1].title == 'Quantized'
#self.data[2] is RG
assert self.tables[2].title == 'RecalTable0'
#self.data[3] is RG / reportedqual
assert self.tables[3].title == 'RecalTable1'
#self.data[4] is RG / reportedqual / covariate (Cycle OR Context)
assert self.tables[4].title == 'RecalTable2'
self.tables[0].data = self.tables[0].data.set_index('Argument')
self.tables[1].data = self.tables[1].data.astype({'QualityScore' : np.int, 'Count' : np.longlong, 'QuantizedScore' : np.int })
self.tables[1].data = self.tables[1].data.set_index('QualityScore')
self.tables[2].data = self.tables[2].data.astype({'Observations' : np.int})
self.tables[2].data = self.tables[2].data.set_index('ReadGroup')
#typer.pop('EstimatedQReported')
#typer.update({'QualityScore' : np.int_})
typer = {'ReadGroup' : str, 'QualityScore' : np.int}
self.tables[3].data = self.tables[3].data.astype(typer)
self.tables[3].data = self.tables[3].data.set_index(['ReadGroup','QualityScore'])
typer.update({'CovariateName': str, 'CovariateValue' : str})
self.tables[4].data = self.tables[4].data.astype(typer)
self.tables[4].data = self.tables[4].data.set_index(['ReadGroup','QualityScore','CovariateName','CovariateValue'])
[docs] def __str__(self):
"""
GATKReports list CovariateValue before CovariateName, which
doesn't make much sense for organizing the index. So we switch
the columns before printing and switch them back after we get
the string.
"""
self.tables[4].data = self.tables[4].data.swaplevel('CovariateValue','CovariateName')
selfstr = super().__str__()
self.tables[4].data = self.tables[4].data.swaplevel('CovariateValue','CovariateName')
return selfstr