Spaces:
No application file
No application file
| # Copyright 2019, National Marrow Donor Program (NMPD). All rights reserved. | |
| # Written by Peter Cock, The James Hutton Institute, under contract to NMDP. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Bio.AlignIO support for GCG MSF format. | |
| The file format was produced by the GCG PileUp and and LocalPileUp tools, | |
| and later tools such as T-COFFEE and MUSCLE support it as an optional | |
| output format. | |
| The original GCG tool would write gaps at ends of each sequence which could | |
| be missing data as tildes (``~``), whereas internal gaps were periods (``.``) | |
| instead. This parser replaces both with minus signs (``-``) for consistency | |
| with the rest of ``Bio.AlignIO``. | |
| You are expected to use this module via the Bio.AlignIO functions (or the | |
| Bio.SeqIO functions if you want to work directly with the gapped sequences). | |
| """ | |
| from Bio.Align import MultipleSeqAlignment | |
| from Bio.Seq import Seq | |
| from Bio.SeqRecord import SeqRecord | |
| from .Interfaces import AlignmentIterator | |
| class MsfIterator(AlignmentIterator): | |
| """GCG MSF alignment iterator.""" | |
| _header = None # for caching lines between __next__ calls | |
| def __next__(self): | |
| """Parse the next alignment from the handle.""" | |
| handle = self.handle | |
| if self._header is None: | |
| line = handle.readline() | |
| else: | |
| # Header we saved from when we were parsing | |
| # the previous alignment. | |
| line = self._header | |
| self._header = None | |
| if not line: | |
| raise StopIteration | |
| # Whitelisted headers we know about. | |
| known_headers = ["!!NA_MULTIPLE_ALIGNMENT", "!!AA_MULTIPLE_ALIGNMENT", "PileUp"] | |
| # Examples in "Molecular Biology Software Training Manual GCG version 10" | |
| # by BBSRC Bioscuences IT Services (BITS), Harpenden, UK, Copyright 1996-2001 | |
| # would often start as follows: | |
| # | |
| # !!AA_MUTIPLE_ALIGNMENT 1.0 | |
| # PileUp of: @/usr/users2/culhane/... | |
| # | |
| # etc with other seemingly free format text before getting to the | |
| # MSF/Type/Check line and the following Name: lines block and // line. | |
| # | |
| # MUSCLE just has a line "PileUp", while other sources just use the line | |
| # "!!AA_MULTIPLE_ALIGNMENT" (amino acid) or "!!NA_MULTIPLE_ALIGNMENT" | |
| # (nucleotide). | |
| if line.strip().split()[0] not in known_headers: | |
| raise ValueError( | |
| "%s is not a known GCG MSF header: %s" | |
| % (line.strip().split()[0], ", ".join(known_headers)) | |
| ) | |
| while line and " MSF: " not in line: | |
| line = handle.readline() | |
| if not line: | |
| raise ValueError("Reached end of file without MSF/Type/Check header line") | |
| # Quoting from "Molecular Biology Software Training Manual GCG version 10" | |
| # by BBSRC Bioscuences IT Services (BITS), Harpenden, UK. Copyright 1996-2001. | |
| # Page 31: | |
| # | |
| # "Header information is before a .. (double dot) in a GCG format file. | |
| # The file will also have a checksum specific for that file." | |
| # | |
| # This was followed by a single non-aligned sequence, but this convention | |
| # appears to also be used in the GCG MSF files. Quoting other examples in | |
| # this reference, page 31: | |
| # | |
| # localpileup_17.msf MSF: 195 Type: P January 6, 2000 15:41 Check: 4365 .. | |
| # | |
| # Except from page 148: | |
| # | |
| # localpileup_106.msf MSF: 457 Type: P November 28, 2000 16:09 Check: 2396 .. | |
| # | |
| # Quoting output from MUSCLE v3.8, have two leading spaces and a zero checksum: | |
| # | |
| # MSF: 689 Type: N Check: 0000 .. | |
| # | |
| # By observation, the MSF value is the column count, type is N (nucleotide) | |
| # or P (protein / amino acid). | |
| # | |
| # In a possible bug, EMBOSS v6.6.0.0 uses CompCheck: rather than Check: as shown, | |
| # | |
| # $ seqret -sequence Tests/Fasta/f002 -auto -stdout -osformat msf | |
| # !!NA_MULTIPLE_ALIGNMENT 1.0 | |
| # | |
| # stdout MSF: 633 Type: N 01/08/19 CompCheck: 8543 .. | |
| # | |
| # Name: G26680 Len: 633 Check: 4334 Weight: 1.00 | |
| # Name: G26685 Len: 633 Check: 3818 Weight: 1.00 | |
| # Name: G29385 Len: 633 Check: 391 Weight: 1.00 | |
| # | |
| # // | |
| # | |
| parts = line.strip("\n").split() | |
| offset = parts.index("MSF:") | |
| if ( | |
| parts[offset + 2] != "Type:" | |
| or parts[-3] not in ("Check:", "CompCheck:") | |
| or parts[-1] != ".." | |
| ): | |
| raise ValueError( | |
| "GCG MSF header line should be " | |
| "'<optional text> MSF: <int> Type: <letter> <optional date> Check: <int> ..', " | |
| " not: %r" % line | |
| ) | |
| try: | |
| aln_length = int(parts[offset + 1]) | |
| except ValueError: | |
| aln_length = -1 | |
| if aln_length < 0: | |
| raise ValueError( | |
| "GCG MSF header line should have MDF: <int> for column count, not %r" | |
| % parts[offset + 1] | |
| ) | |
| seq_type = parts[offset + 3] | |
| if seq_type not in ["P", "N"]: | |
| raise ValueError( | |
| "GCG MSF header line should have 'Type: P' (protein) " | |
| "or 'Type: N' (nucleotide), not 'Type: %s'" % seq_type | |
| ) | |
| # There should be a blank line after that header line, then the Name: lines | |
| # | |
| # In a possible bug, T-COFFEE v12.00 adds 'oo' after the names, as shown here, | |
| # | |
| # PileUp | |
| # | |
| # | |
| # | |
| # MSF: 628 Type: P Check: 147 .. | |
| # | |
| # Name: AK1H_ECOLI/1-378 oo Len: 628 Check: 3643 Weight: 1.000 | |
| # Name: AKH_HAEIN/1-382 oo Len: 628 Check: 6504 Weight: 1.000 | |
| # | |
| # // | |
| ids = [] | |
| lengths = [] | |
| checks = [] | |
| weights = [] | |
| line = handle.readline() | |
| while line and line.strip() != "//": | |
| line = handle.readline() | |
| if line.strip().startswith("Name: "): | |
| if " Len: " in line and " Check: " in line and " Weight: " in line: | |
| rest = line[line.index("Name: ") + 6 :].strip() | |
| name, rest = rest.split(" Len: ") | |
| length, rest = rest.split(" Check: ") | |
| check, weight = rest.split(" Weight: ") | |
| name = name.strip() | |
| if name.endswith(" oo"): | |
| # T-COFFEE oddity, ignore this | |
| name = name[:-3] | |
| if name in ids: | |
| raise ValueError(f"Duplicated ID of {name!r}") | |
| if " " in name: | |
| raise NotImplementedError(f"Space in ID {name!r}") | |
| ids.append(name) | |
| # Expect aln_length <= int(length.strip()), see below | |
| lengths.append(int(length.strip())) | |
| checks.append(int(check.strip())) | |
| weights.append(float(weight.strip())) | |
| else: | |
| raise ValueError(f"Malformed GCG MSF name line: {line!r}") | |
| if not line: | |
| raise ValueError("End of file while looking for end of header // line.") | |
| if aln_length != max(lengths): | |
| # In broken examples from IMGTHLA was possible to continue | |
| # https://github.com/ANHIG/IMGTHLA/issues/201 | |
| max_length = max(lengths) | |
| max_count = sum(1 for _ in lengths if _ == max_length) | |
| raise ValueError( | |
| "GCG MSF header said alignment length %i, but %s of %i sequences said Len: %s" | |
| % (aln_length, max_count, len(ids), max_length) | |
| ) | |
| line = handle.readline() | |
| if not line: | |
| raise ValueError("End of file after // line, expected sequences.") | |
| if line.strip(): | |
| raise ValueError("After // line, expected blank line before sequences.") | |
| # Now load the sequences | |
| seqs = [[] for _ in ids] # list of empty lists | |
| completed_length = 0 | |
| while completed_length < aln_length: | |
| # Note might have a coordinate header line (seems to be optional) | |
| for idx, name in enumerate(ids): | |
| line = handle.readline() | |
| if idx == 0 and not line.strip(): | |
| # T-COFFEE uses two blank lines between blocks, rather than one | |
| while line and not line.strip(): | |
| line = handle.readline() | |
| if not line: | |
| raise ValueError("End of file where expecting sequence data.") | |
| # print("Looking for seq for %s in line: %r" % (name, line)) | |
| words = line.strip().split() | |
| # Should we use column numbers, rather than assuming no spaces in names? | |
| if idx == 0 and words and words[0] != name: | |
| # print("Actually have a coord line") | |
| # Hopefully this is a coordinate header before the first seq | |
| try: | |
| i = int(words[0]) | |
| except ValueError: | |
| i = -1 | |
| if i != completed_length + 1: | |
| raise ValueError( | |
| "Expected GCG MSF coordinate line starting %i, got: %r" | |
| % (completed_length + 1, line) | |
| ) | |
| if len(words) > 1: | |
| # Final block usually not full 50 chars, so expect start only. | |
| if len(words) != 2: | |
| i = -1 | |
| else: | |
| try: | |
| i = int(words[1]) | |
| except ValueError: | |
| i = -1 | |
| if i != ( | |
| completed_length + 50 | |
| if completed_length + 50 < aln_length | |
| else aln_length | |
| ): | |
| raise ValueError( | |
| "Expected GCG MSF coordinate line %i to %i, got: %r" | |
| % ( | |
| completed_length + 1, | |
| completed_length + 50 | |
| if completed_length + 50 < aln_length | |
| else aln_length, | |
| line, | |
| ) | |
| ) | |
| line = handle.readline() | |
| words = line.strip().split() | |
| # print("Still looking for seq for %s in line: %r" % (name, line)) | |
| # Dealt with any coordinate header line, should now be sequence | |
| if not words: | |
| # Should be sequence here, but perhaps its a short one? | |
| if ( | |
| lengths[idx] < aln_length | |
| and len("".join(seqs[idx])) == lengths[idx] | |
| ): | |
| # Is this actually allowed in the format? Personally I would | |
| # expect a line with name and a block of trailing ~ here. | |
| pass | |
| else: | |
| raise ValueError(f"Expected sequence for {name}, got: {line!r}") | |
| elif words[0] == name: | |
| assert len(words) > 1, line | |
| # print(i, name, repr(words)) | |
| seqs[idx].extend(words[1:]) | |
| else: | |
| raise ValueError(f"Expected sequence for {name!r}, got: {line!r}") | |
| # TODO - check the sequence lengths thus far are consistent | |
| # with blocks of 50? | |
| completed_length += 50 | |
| line = handle.readline() | |
| if line.strip(): | |
| raise ValueError(f"Expected blank line, got: {line!r}") | |
| # Skip over any whitespace at the end... | |
| while True: | |
| line = handle.readline() | |
| if not line: | |
| # End of file, no more alignments | |
| break | |
| elif not line.strip(): | |
| # Blank line, ignore | |
| pass | |
| elif line.strip().split()[0] in known_headers: | |
| # Looks like the start of another alignment: | |
| self._header = line | |
| break | |
| else: | |
| raise ValueError(f"Unexpected line after GCG MSF alignment: {line!r}") | |
| # Combine list of strings into single string, remap gaps | |
| seqs = ["".join(s).replace("~", "-").replace(".", "-") for s in seqs] | |
| # Apply any trailing padding for short sequences | |
| padded = False | |
| for idx, (length, s) in enumerate(zip(lengths, seqs)): | |
| if len(s) < aln_length and len(s) == length: | |
| padded = True | |
| seqs[idx] = s + "-" * (aln_length - len(s)) | |
| if padded: | |
| import warnings | |
| from Bio import BiopythonParserWarning | |
| warnings.warn( | |
| "One of more alignment sequences were truncated and have been gap padded", | |
| BiopythonParserWarning, | |
| ) | |
| records = ( | |
| SeqRecord(Seq(s), id=i, name=i, description=i, annotations={"weight": w}) | |
| for (i, s, w) in zip(ids, seqs, weights) | |
| ) | |
| # This will check alignment lengths are self-consistent: | |
| align = MultipleSeqAlignment(records) | |
| # Check matches the header: | |
| if align.get_alignment_length() != aln_length: | |
| raise ValueError( | |
| "GCG MSF headers said alignment length %i, but have %i" | |
| % (aln_length, align.get_alignment_length()) | |
| ) | |
| return align | |