Spaces:
No application file
No application file
| #!/usr/bin/env python | |
| # Copyright 2004-2005 by Michael Hoffman. All rights reserved. | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Run and process output from the Wise2 package tool dnal. | |
| Bio.Wise contains modules for running and processing the output of | |
| some of the models in the Wise2 package by Ewan Birney available from: | |
| ftp://ftp.ebi.ac.uk/pub/software/unix/wise2/ | |
| http://www.ebi.ac.uk/Wise2/ | |
| Bio.Wise.psw is for protein Smith-Waterman alignments | |
| Bio.Wise.dnal is for Smith-Waterman DNA alignments | |
| """ | |
| import re | |
| # Importing with leading underscore as not intended to be exposed | |
| from subprocess import getoutput as _getoutput | |
| from Bio import Wise | |
# Default scoring scheme, expressed as scores (costs are negative numbers).
_SCORE_MATCH = 4
_SCORE_MISMATCH = -1
_SCORE_GAP_START = -5
_SCORE_GAP_EXTENSION = -1

# Fixed leading part of every dnal invocation.
_CMDLINE_DNAL = ["dnal", "-alb", "-nopretty"]


def _build_dnal_cmdline(match, mismatch, gap, extension):
    """Return the dnal argument list for the given scoring scheme.

    The result is a new list: the fixed ``_CMDLINE_DNAL`` prefix followed by
    the ``-match``, ``-mis``, ``-gap`` and ``-ext`` options, each value
    rendered with ``str``. The module-level prefix list is not modified.
    """
    # The gap values are negated to turn a score into a penalty; match and
    # mismatch are passed through as given.
    scoring_options = [
        "-match",
        str(match),
        "-mis",
        str(mismatch),
        "-gap",
        str(-gap),
        "-ext",
        str(-extension),
    ]
    return _CMDLINE_DNAL + scoring_options
# Legacy shell template. _fgrep_count no longer uses it; the name is kept so
# that anything still referring to it keeps working.
_CMDLINE_FGREP_COUNT = "fgrep -c '%s' %s"


def _fgrep_count(pattern, file):
    """Count the lines of a file that contain a fixed string.

    This gives the same number as ``fgrep -c pattern file`` but is computed
    in-process. The previous implementation interpolated both arguments into
    a shell command, which broke on file names or patterns containing spaces
    or quotes, and passed whatever the shell printed (including error
    messages) to ``int``.

    Arguments:
     - pattern - literal text to look for (a plain substring, not a regex).
     - file - path of the file to scan.

    Returns the number of lines containing ``pattern``. A line is counted
    once even if the pattern occurs in it several times.

    Raises OSError if the file cannot be opened.
    """
    # Compare bytes so the count does not depend on the locale's text
    # encoding, just as grep works on the raw file content.
    needle = pattern.encode()
    with open(file, "rb") as handle:
        return sum(1 for line in handle if needle in line)
# Captures the text between "[" and the first ":" for the first two bracketed
# fields of a line that starts with "[".
_re_alb_line2coords = re.compile(r"^\[([^:]+):[^\[]+\[([^:]+):")


def _alb_line2coords(line):
    """Return the two leading coordinates of an ALB line as a tuple of ints.

    Each captured number is returned incremented by one.

    Raises AttributeError if the line does not match the expected layout and
    ValueError if a captured field is not an integer.
    """
    first, second = _re_alb_line2coords.match(line).groups()
    # NOTE(review): the original inline comment called this shift
    # "one-based -> zero-based", yet the code adds one; confirm the intended
    # direction against the ALB format before relying on that description.
    return (int(first) + 1, int(second) + 1)
def _get_coords(filename):
    """Return the start and end coordinates of both sequences in an ALB file.

    Only lines beginning with "[" are considered. The first such line gives
    the start coordinates and the last such line gives the end coordinates.

    Returns ``[(start0, end0), (start1, end1)]``. If the file has fewer than
    two "[" lines (the sequence is too short), ``[(0, 0), (0, 0)]`` is
    returned.
    """
    start_line = None
    end_line = None
    # The context manager closes the handle even if reading fails; the file
    # was previously left open.
    with open(filename) as alb:
        for line in alb:
            if line.startswith("["):
                if start_line is None:
                    start_line = line  # rstrip not needed
                else:
                    end_line = line

    if end_line is None:  # sequence is too short
        return [(0, 0), (0, 0)]

    # Transpose (start0, start1), (end0, end1) into per-sequence pairs.
    return list(zip(*map(_alb_line2coords, [start_line, end_line])))
class Statistics:
    """Calculate statistics from an ALB report.

    Attributes set by the constructor:
     - matches, mismatches, gaps, extensions - numbers of report lines that
       contain the corresponding label/score text.
     - score - total alignment score rebuilt from those counts.
     - coords - ``[(start0, end0), (start1, end1)]``, or
       ``[(0, 0), (0, 0)]`` when nothing was counted.
    """

    # Column names for the tab separated fields produced by __str__.
    header = "identity_fraction\tmatches\tmismatches\tgaps\textensions"

    def __init__(self, filename, match, mismatch, gap, extension):
        """Initialize the class.

        Arguments:
         - filename - path of the ALB report to analyse.
         - match, mismatch, gap, extension - the scores used for the
           alignment; they are both searched for in the report and used to
           compute ``score``.
        """
        # Each count is a count of lines containing the given text. The text
        # is searched as a plain substring, so a score of 4 would also count
        # lines whose score merely starts with "4".
        self.matches = _fgrep_count(f'"SEQUENCE" {match}', filename)
        self.mismatches = _fgrep_count(f'"SEQUENCE" {mismatch}', filename)
        self.gaps = _fgrep_count(f'"INSERT" {gap}', filename)

        if gap == extension:
            # Both searches would use the same text, so everything has
            # already been counted as a gap.
            self.extensions = 0
        else:
            self.extensions = _fgrep_count(f'"INSERT" {extension}', filename)

        self.score = (
            match * self.matches
            + mismatch * self.mismatches
            + gap * self.gaps
            + extension * self.extensions
        )

        if self.matches or self.mismatches or self.gaps or self.extensions:
            self.coords = _get_coords(filename)
        else:
            # Nothing was counted: skip reading coordinates from the report.
            self.coords = [(0, 0), (0, 0)]

    def identity_fraction(self):
        """Calculate the fraction of matches.

        Returns ``matches / (matches + mismatches)``. When there are neither
        matches nor mismatches (an empty alignment, which the constructor
        explicitly allows) 0.0 is returned instead of raising
        ZeroDivisionError, so that ``str()`` works on such objects too.
        """
        aligned = self.matches + self.mismatches
        if not aligned:
            return 0.0
        return self.matches / aligned

    def __str__(self):
        """Statistics as a tab separated string.

        The fields appear in the order given by ``header``.
        """
        return "\t".join(
            str(x)
            for x in (
                self.identity_fraction(),
                self.matches,
                self.mismatches,
                self.gaps,
                self.extensions,
            )
        )
def align(
    pair,
    match=_SCORE_MATCH,
    mismatch=_SCORE_MISMATCH,
    gap=_SCORE_GAP_START,
    extension=_SCORE_GAP_EXTENSION,
    **keywds,
):
    """Align a pair of DNA files using dnal and calculate the statistics of the alignment.

    Arguments:
     - pair - the two input files, handed to ``Wise.align`` unchanged.
     - match, mismatch, gap, extension - scores used both to build the dnal
       command line and to interpret its report.
     - keywds - extra keyword arguments forwarded to ``Wise.align``.

    Returns a Statistics object, or None when ``Wise.align`` returned
    something without a ``name`` attribute and a ``dry_run`` keyword was
    supplied.

    Raises AttributeError if ``Wise.align`` returned something without a
    ``name`` attribute and no ``dry_run`` keyword was supplied.
    """
    cmdline = _build_dnal_cmdline(match, mismatch, gap, extension)
    temp_file = Wise.align(cmdline, pair, **keywds)

    # Guard only the attribute access. Wrapping the Statistics constructor as
    # well used to hide unrelated AttributeErrors raised while parsing the
    # report whenever dry_run was passed.
    try:
        alb_filename = temp_file.name
    except AttributeError:
        # NOTE(review): presumably a dry run makes Wise.align return an
        # object without ``name`` (e.g. None); confirm against Bio.Wise.
        if "dry_run" in keywds:
            return None
        # Re-raise the original AttributeError. The old nested try/except
        # re-raised KeyError('dry_run') here, masking the real problem.
        raise

    return Statistics(alb_filename, match, mismatch, gap, extension)
def main():
    """Command line implementation.

    Aligns the files named by the first two command-line arguments with the
    default scores and prints the counts, the identity fraction and the
    coordinates, one ``name: value`` entry per line.
    """
    import sys

    stats = align(sys.argv[1:3])

    count_names = ("matches", "mismatches", "gaps", "extensions")
    count_lines = [f"{name}: {getattr(stats, name)}" for name in count_names]
    print(*count_lines, sep="\n")

    print(f"identity_fraction: {stats.identity_fraction()}")
    print(f"coords: {stats.coords}")
def _test(*args, **keywds):
    """Run this module's doctests.

    All positional and keyword arguments are forwarded to
    ``doctest.testmod`` after the module object itself.
    """
    import doctest
    import sys

    this_module = sys.modules[__name__]
    doctest.testmod(this_module, *args, **keywds)
if __name__ == "__main__":
    # When run as a script: execute the doctests first (skipped when Python
    # runs with -O, which sets __debug__ to False), then the command line tool.
    if __debug__:
        _test()
    main()