Spaces:
No application file
No application file
| # Copyright 2008-2016 by Peter Cock. All rights reserved. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Bio.AlignIO support for "fasta-m10" output from Bill Pearson's FASTA tools. | |
| You are expected to use this module via the Bio.AlignIO functions (or the | |
| Bio.SeqIO functions if you want to work directly with the gapped sequences). | |
| This module contains a parser for the pairwise alignments produced by Bill | |
| Pearson's FASTA tools, for use from the Bio.AlignIO interface where it is | |
| referred to as the "fasta-m10" file format (as we only support the machine | |
| readable output format selected with the -m 10 command line option). | |
| This module does NOT cover the generic "fasta" file format originally | |
| developed as an input format to the FASTA tools. The Bio.AlignIO and | |
| Bio.SeqIO both use the Bio.SeqIO.FastaIO module to deal with these files, | |
| which can also be used to store a multiple sequence alignments. | |
| """ | |
| from Bio.Align import MultipleSeqAlignment | |
| from Bio.Seq import Seq | |
| from Bio.SeqRecord import SeqRecord | |
| def _extract_alignment_region(alignment_seq_with_flanking, annotation): | |
| """Extract alignment region (PRIVATE). | |
| Helper function for the main parsing code. | |
| To get the actual pairwise alignment sequences, we must first | |
| translate the un-gapped sequence based coordinates into positions | |
| in the gapped sequence (which may have a flanking region shown | |
| using leading - characters). To date, I have never seen any | |
| trailing flanking region shown in the m10 file, but the | |
| following code should also cope with that. | |
| Note that this code seems to work fine even when the "sq_offset" | |
| entries are present as a result of using the -X command line option. | |
| """ | |
| align_stripped = alignment_seq_with_flanking.strip("-") | |
| display_start = int(annotation["al_display_start"]) | |
| if int(annotation["al_start"]) <= int(annotation["al_stop"]): | |
| start = int(annotation["al_start"]) - display_start | |
| end = int(annotation["al_stop"]) - display_start + 1 | |
| else: | |
| # FASTA has flipped this sequence... | |
| start = display_start - int(annotation["al_start"]) | |
| end = display_start - int(annotation["al_stop"]) + 1 | |
| end += align_stripped.count("-") | |
| if start < 0 or start >= end or end > len(align_stripped): | |
| raise ValueError( | |
| "Problem with sequence start/stop,\n%s[%i:%i]\n%s" | |
| % (alignment_seq_with_flanking, start, end, annotation) | |
| ) | |
| return align_stripped[start:end] | |
| def FastaM10Iterator(handle, seq_count=None): | |
| """Alignment iterator for the FASTA tool's pairwise alignment output. | |
| This is for reading the pairwise alignments output by Bill Pearson's | |
| FASTA program when called with the -m 10 command line option for machine | |
| readable output. For more details about the FASTA tools, see the website | |
| http://fasta.bioch.virginia.edu/ and the paper: | |
| W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 | |
| This class is intended to be used via the Bio.AlignIO.parse() function | |
| by specifying the format as "fasta-m10" as shown in the following code:: | |
| from Bio import AlignIO | |
| handle = ... | |
| for a in AlignIO.parse(handle, "fasta-m10"): | |
| assert len(a) == 2, "Should be pairwise!" | |
| print("Alignment length %i" % a.get_alignment_length()) | |
| for record in a: | |
| print("%s %s %s" % (record.seq, record.name, record.id)) | |
| Note that this is not a full blown parser for all the information | |
| in the FASTA output - for example, most of the header and all of the | |
| footer is ignored. Also, the alignments are not batched according to | |
| the input queries. | |
| Also note that there can be up to about 30 letters of flanking region | |
| included in the raw FASTA output as contextual information. This is NOT | |
| part of the alignment itself, and is not included in the resulting | |
| MultipleSeqAlignment objects returned. | |
| """ | |
| state_PREAMBLE = -1 | |
| state_NONE = 0 | |
| state_QUERY_HEADER = 1 | |
| state_ALIGN_HEADER = 2 | |
| state_ALIGN_QUERY = 3 | |
| state_ALIGN_MATCH = 4 | |
| state_ALIGN_CONS = 5 | |
| def build_hsp(): | |
| if not query_tags and not match_tags: | |
| raise ValueError(f"No data for query {query_id!r}, match {match_id!r}") | |
| assert query_tags, query_tags | |
| assert match_tags, match_tags | |
| evalue = align_tags.get("fa_expect") | |
| tool = global_tags.get("tool", "").upper() | |
| q = _extract_alignment_region(query_seq, query_tags) | |
| if tool in ["TFASTX"] and len(match_seq) == len(q): | |
| m = match_seq | |
| # Quick hack until I can work out how -, * and / characters | |
| # and the apparent mix of aa and bp coordinates works. | |
| else: | |
| m = _extract_alignment_region(match_seq, match_tags) | |
| if len(q) != len(m): | |
| raise ValueError( | |
| f"""\ | |
| Darn... amino acids vs nucleotide coordinates? | |
| tool: {tool} | |
| query_seq: {query_seq} | |
| query_tags: {query_tags} | |
| {q} length: {len(q)} | |
| match_seq: {match_seq} | |
| match_tags: {match_tags} | |
| {m} length: {len(m)} | |
| handle.name: {handle.name} | |
| """ | |
| ) | |
| annotations = {} | |
| records = [] | |
| # Want to record both the query header tags, and the alignment tags. | |
| annotations.update(header_tags) | |
| annotations.update(align_tags) | |
| # Query | |
| # ===== | |
| record = SeqRecord( | |
| Seq(q), | |
| id=query_id, | |
| name="query", | |
| description=query_descr, | |
| annotations={"original_length": int(query_tags["sq_len"])}, | |
| ) | |
| # TODO - handle start/end coordinates properly. Short term hack for now: | |
| record._al_start = int(query_tags["al_start"]) | |
| record._al_stop = int(query_tags["al_stop"]) | |
| # TODO - Can FASTA output RNA? | |
| if "sq_type" in query_tags: | |
| if query_tags["sq_type"] == "D": | |
| record.annotations["molecule_type"] = "DNA" | |
| elif query_tags["sq_type"] == "p": | |
| record.annotations["molecule_type"] = "protein" | |
| records.append(record) | |
| # Match | |
| # ===== | |
| record = SeqRecord( | |
| Seq(m), | |
| id=match_id, | |
| name="match", | |
| description=match_descr, | |
| annotations={"original_length": int(match_tags["sq_len"])}, | |
| ) | |
| # TODO - handle start/end coordinates properly. Short term hack for now: | |
| record._al_start = int(match_tags["al_start"]) | |
| record._al_stop = int(match_tags["al_stop"]) | |
| if "sq_type" in match_tags: | |
| if match_tags["sq_type"] == "D": | |
| record.annotations["molecule_type"] = "DNA" | |
| elif match_tags["sq_type"] == "p": | |
| record.annotations["molecule_type"] = "protein" | |
| records.append(record) | |
| return MultipleSeqAlignment(records, annotations=annotations) | |
| state = state_PREAMBLE | |
| query_id = None | |
| match_id = None | |
| query_descr = "" | |
| match_descr = "" | |
| global_tags = {} | |
| header_tags = {} | |
| align_tags = {} | |
| query_tags = {} | |
| match_tags = {} | |
| query_seq = "" | |
| match_seq = "" | |
| cons_seq = "" | |
| for line in handle: | |
| if ">>>" in line and not line.startswith(">>>"): | |
| if query_id and match_id: | |
| # This happens on old FASTA output which lacked an end of | |
| # query >>><<< marker line. | |
| yield build_hsp() | |
| state = state_NONE | |
| query_descr = line[line.find(">>>") + 3 :].strip() | |
| query_id = query_descr.split(None, 1)[0] | |
| match_id = None | |
| header_tags = {} | |
| align_tags = {} | |
| query_tags = {} | |
| match_tags = {} | |
| query_seq = "" | |
| match_seq = "" | |
| cons_seq = "" | |
| elif line.startswith("!! No "): | |
| # e.g. | |
| # !! No library sequences with E() < 0.5 | |
| # or on more recent versions, | |
| # No sequences with E() < 0.05 | |
| assert state == state_NONE | |
| assert not header_tags | |
| assert not align_tags | |
| assert not match_tags | |
| assert not query_tags | |
| assert match_id is None | |
| assert not query_seq | |
| assert not match_seq | |
| assert not cons_seq | |
| query_id = None | |
| elif line.strip() in [">>><<<", ">>>///"]: | |
| # End of query, possible end of all queries | |
| if query_id and match_id: | |
| yield build_hsp() | |
| state = state_NONE | |
| query_id = None | |
| match_id = None | |
| header_tags = {} | |
| align_tags = {} | |
| query_tags = {} | |
| match_tags = {} | |
| query_seq = "" | |
| match_seq = "" | |
| cons_seq = "" | |
| elif line.startswith(">>>"): | |
| # Should be start of a match! | |
| assert query_id is not None | |
| assert line[3:].split(", ", 1)[0] == query_id, line | |
| assert match_id is None | |
| assert not header_tags | |
| assert not align_tags | |
| assert not query_tags | |
| assert not match_tags | |
| assert not match_seq | |
| assert not query_seq | |
| assert not cons_seq | |
| state = state_QUERY_HEADER | |
| elif line.startswith(">>"): | |
| # Should now be at start of a match alignment! | |
| if query_id and match_id: | |
| yield build_hsp() | |
| align_tags = {} | |
| query_tags = {} | |
| match_tags = {} | |
| query_seq = "" | |
| match_seq = "" | |
| cons_seq = "" | |
| match_descr = line[2:].strip() | |
| match_id = match_descr.split(None, 1)[0] | |
| state = state_ALIGN_HEADER | |
| elif line.startswith(">--"): | |
| # End of one HSP | |
| assert query_id and match_id, line | |
| yield build_hsp() | |
| # Clean up read for next HSP | |
| # but reuse header_tags | |
| align_tags = {} | |
| query_tags = {} | |
| match_tags = {} | |
| query_seq = "" | |
| match_seq = "" | |
| cons_seq = "" | |
| state = state_ALIGN_HEADER | |
| elif line.startswith(">"): | |
| if state == state_ALIGN_HEADER: | |
| # Should be start of query alignment seq... | |
| assert query_id is not None, line | |
| assert match_id is not None, line | |
| assert query_id.startswith(line[1:].split(None, 1)[0]), line | |
| state = state_ALIGN_QUERY | |
| elif state == state_ALIGN_QUERY: | |
| # Should be start of match alignment seq | |
| assert query_id is not None, line | |
| assert match_id is not None, line | |
| assert match_id.startswith(line[1:].split(None, 1)[0]), line | |
| state = state_ALIGN_MATCH | |
| elif state == state_NONE: | |
| # Can get > as the last line of a histogram | |
| pass | |
| else: | |
| raise RuntimeError("state %i got %r" % (state, line)) | |
| elif line.startswith("; al_cons"): | |
| assert state == state_ALIGN_MATCH, line | |
| state = state_ALIGN_CONS | |
| # Next line(s) should be consensus seq... | |
| elif line.startswith("; "): | |
| if ": " in line: | |
| key, value = (s.strip() for s in line[2:].split(": ", 1)) | |
| else: | |
| import warnings | |
| from Bio import BiopythonParserWarning | |
| # Seen in lalign36, specifically version 36.3.4 Apr, 2011 | |
| # Fixed in version 36.3.5b Oct, 2011(preload8) | |
| warnings.warn( | |
| f"Missing colon in line: {line!r}", BiopythonParserWarning | |
| ) | |
| try: | |
| key, value = (s.strip() for s in line[2:].split(" ", 1)) | |
| except ValueError: | |
| raise ValueError(f"Bad line: {line!r}") from None | |
| if state == state_QUERY_HEADER: | |
| header_tags[key] = value | |
| elif state == state_ALIGN_HEADER: | |
| align_tags[key] = value | |
| elif state == state_ALIGN_QUERY: | |
| query_tags[key] = value | |
| elif state == state_ALIGN_MATCH: | |
| match_tags[key] = value | |
| else: | |
| raise RuntimeError(f"Unexpected state {state!r}, {line!r}") | |
| elif state == state_ALIGN_QUERY: | |
| query_seq += line.strip() | |
| elif state == state_ALIGN_MATCH: | |
| match_seq += line.strip() | |
| elif state == state_ALIGN_CONS: | |
| cons_seq += line.strip("\n") | |
| elif state == state_PREAMBLE: | |
| if line.startswith("#"): | |
| global_tags["command"] = line[1:].strip() | |
| elif line.startswith(" version "): | |
| global_tags["version"] = line[9:].strip() | |
| elif " compares a " in line: | |
| global_tags["tool"] = line[: line.find(" compares a ")].strip() | |
| elif " searches a " in line: | |
| global_tags["tool"] = line[: line.find(" searches a ")].strip() | |
| else: | |
| pass | |