Spaces:
No application file
No application file
| # Copyright 2006-2016 by Peter Cock. All rights reserved. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Bio.AlignIO support for "clustal" output from CLUSTAL W and other tools. | |
| You are expected to use this module via the Bio.AlignIO functions (or the | |
| Bio.SeqIO functions if you want to work directly with the gapped sequences). | |
| """ | |
| from Bio.Align import MultipleSeqAlignment | |
| from Bio.AlignIO.Interfaces import AlignmentIterator | |
| from Bio.AlignIO.Interfaces import SequentialAlignmentWriter | |
| from Bio.Seq import Seq | |
| from Bio.SeqRecord import SeqRecord | |
class ClustalWriter(SequentialAlignmentWriter):
    """Clustalw alignment writer."""

    def write_alignment(self, alignment):
        """Write one alignment to the open handle in CLUSTAL format.

        The alignment is emitted in blocks of at most 50 columns. Each row
        starts with the record identifier (first 30 characters, spaces
        replaced by underscores) padded to 36 characters. If consensus
        ("star") information is available it is written under each block.

        Raises ValueError if the alignment has no records or has zero length.
        """
        if len(alignment) == 0:
            raise ValueError("Must have at least one sequence")
        if alignment.get_alignment_length() == 0:
            # Calling the method also confirms this is an alignment object
            raise ValueError("Non-empty sequences are required")

        # Alignments from the old Bio.Clustalw parser carry a ._version
        try:
            version = str(alignment._version)
        except AttributeError:
            version = ""
        version = version or "1.81"

        if version.startswith("2."):
            # e.g. 2.0.x
            header = f"CLUSTAL {version} multiple sequence alignment\n\n\n"
        else:
            # e.g. 1.81 or 1.83
            header = f"CLUSTAL X ({version}) multiple sequence alignment\n\n\n"

        total_columns = len(alignment[0])
        if total_columns <= 0:
            raise ValueError("Non-empty sequences are required")

        column_annotations = alignment.column_annotations
        if "clustal_consensus" in column_annotations:
            star_info = column_annotations["clustal_consensus"]
        else:
            # Bio.Clustalw originally stored the consensus as ._star_info
            star_info = getattr(alignment, "_star_info", None)

        # Collect the pieces and join once at the end
        pieces = [header]
        # One pass per block of up to 50 columns; the last may be shorter
        for block_start in range(0, total_columns, 50):
            block_end = min(block_start + 50, total_columns)
            # Aim for a tidy 80 column layout, at the cost of possibly
            # truncating long identifiers.
            for record in alignment:
                # Underscores replace spaces so the identifier stays a
                # single whitespace-delimited token in the file.
                label = record.id[0:30].replace(" ", "_").ljust(36)
                pieces.append(label + str(record.seq[block_start:block_end]) + "\n")
            # Consensus row for this block, when we have one
            if star_info:
                pieces.append((" " * 36) + star_info[block_start:block_end] + "\n")
            pieces.append("\n")

        # Trailing blank line in case the output gets concatenated
        pieces.append("\n")
        self.handle.write("".join(pieces))
class ClustalIterator(AlignmentIterator):
    """Clustalw alignment iterator."""

    _header = None  # for caching lines between __next__ calls

    def __next__(self):
        """Parse the next alignment from the handle.

        Returns a MultipleSeqAlignment. If the header line contains a
        version number it is stored as ``alignment._version``. If consensus
        lines were found they are stored in
        ``alignment.column_annotations["clustal_consensus"]`` and, for
        backward compatibility, as ``alignment._star_info``.

        Raises StopIteration when the handle is exhausted, or when a header
        line is followed by no sequence data. Raises ValueError when the
        header is blank or unknown, when a sequence line cannot be parsed,
        when the file ends in the middle of a block, or when the number of
        records differs from ``self.records_per_alignment``.
        """
        handle = self.handle

        if self._header is None:
            line = handle.readline()
        else:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            self._header = None

        if not line:
            raise StopIteration

        # Whitelisted headers we know about
        known_headers = [
            "CLUSTAL",
            "PROBCONS",
            "MUSCLE",
            "MSAPROBS",
            "Kalign",
            "Biopython",
        ]
        header_words = line.split()
        if not header_words:
            # A whitespace-only line has no first word to compare; report
            # it as a format error rather than failing with IndexError.
            raise ValueError(
                "Blank line found where a CLUSTAL header was expected: %s"
                % ", ".join(known_headers)
            )
        if header_words[0] not in known_headers:
            raise ValueError(
                "%s is not a known CLUSTAL header: %s"
                % (header_words[0], ", ".join(known_headers))
            )

        # find the clustal version in the header line
        version = None
        for word in header_words:
            if word[0] == "(" and word[-1] == ")":
                word = word[1:-1]
            # word may now be empty (the token was "()"), so guard the index
            if word and word[0] in "0123456789":
                version = word
                break

        # There should be two blank lines after the header line
        line = handle.readline()
        # readline() returns "" at end of file, which also strips to "";
        # test for it explicitly so a header-only file cannot loop forever.
        while line and line.strip() == "":
            line = handle.readline()
        if not line:
            # Header but no sequences: same outcome as the empty-alignment
            # check further down.
            raise StopIteration

        # If the alignment contains entries with the same sequence
        # identifier (not a good idea - but seems possible), then this
        # dictionary based parser will merge their sequences.  Fix this?
        ids = []
        seqs = []
        consensus = ""
        seq_cols = None  # Used to extract the consensus

        # Use the first block to get the sequence identifiers
        while True:
            if line[0] != " " and line.strip() != "":
                # Sequences identifier...
                fields = line.rstrip().split()

                # We expect there to be two fields, there can be an optional
                # "sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError(f"Could not parse line:\n{line}")

                ids.append(fields[0])
                seqs.append(fields[1])

                # Record the sequence position to get the consensus
                if seq_cols is None:
                    start = len(fields[0]) + line[len(fields[0]) :].find(fields[1])
                    end = start + len(fields[1])
                    seq_cols = slice(start, end)
                    del start, end
                assert fields[1] == line[seq_cols]

                if len(fields) == 3:
                    # This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            f"Could not parse line, bad sequence number:\n{line}"
                        ) from None
                    if len(fields[1].replace("-", "")) != letters:
                        raise ValueError(
                            f"Could not parse line, invalid sequence number:\n{line}"
                        )
            elif line[0] == " ":
                # Sequence consensus line...
                assert len(ids) == len(seqs)
                assert len(ids) > 0
                assert seq_cols is not None
                consensus = line[seq_cols]
                assert not line[: seq_cols.start].strip()
                assert not line[seq_cols.stop :].strip()
                # Check for blank line (or end of file)
                line = handle.readline()
                assert line.strip() == ""
                break
            else:
                # No consensus
                break
            line = handle.readline()
            if not line:
                break  # end of file

        assert line.strip() == ""
        assert seq_cols is not None

        # Confirm all same length
        for s in seqs:
            assert len(s) == len(seqs[0])
        if consensus:
            assert len(consensus) == len(seqs[0])

        # Loop over any remaining blocks...
        while True:
            # There should be a blank line between each block.
            # Also want to ignore any consensus line from the
            # previous block.
            while (not line) or line.strip() == "":
                line = handle.readline()
                if not line:
                    break  # end of file
            if not line:
                break  # end of file

            if line.split(None, 1)[0] in known_headers:
                # Found concatenated alignment.
                self._header = line
                break

            for i, expected_id in enumerate(ids):
                if not line:
                    # File ended before every identifier had its row in
                    # this block; line[0] below would raise IndexError.
                    raise ValueError(
                        "Unexpected end of file, expected a line for '%s'"
                        % expected_id
                    )
                if line[0] == " ":
                    raise ValueError(f"Unexpected line:\n{line!r}")
                fields = line.rstrip().split()

                # We expect there to be two fields, there can be an optional
                # "sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError(f"Could not parse line:\n{line!r}")

                if fields[0] != expected_id:
                    raise ValueError(
                        "Identifiers out of order? Got '%s' but expected '%s'"
                        % (fields[0], expected_id)
                    )

                if fields[1] != line[seq_cols]:
                    # The sequence text must start in the same column as
                    # before; only its width may differ (e.g. a shorter
                    # final block).
                    start = len(fields[0]) + line[len(fields[0]) :].find(fields[1])
                    if start != seq_cols.start:
                        raise ValueError("Old location %s -> %i:XX" % (seq_cols, start))
                    end = start + len(fields[1])
                    seq_cols = slice(start, end)
                    del start, end

                # Append the sequence
                seqs[i] += fields[1]
                assert len(seqs[i]) == len(seqs[0])

                if len(fields) == 3:
                    # This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            f"Could not parse line, bad sequence number:\n{line}"
                        ) from None
                    # Here the count is checked against the whole sequence
                    # accumulated so far, not just this block's segment.
                    if len(seqs[i].replace("-", "")) != letters:
                        raise ValueError(
                            f"Could not parse line, invalid sequence number:\n{line}"
                        )

                # Read in the next line
                line = handle.readline()
            # There should now be a consensus line
            if consensus:
                assert line[0] == " "
                assert seq_cols is not None
                consensus += line[seq_cols]
                assert len(consensus) == len(seqs[0])
                assert not line[: seq_cols.start].strip()
                assert not line[seq_cols.stop :].strip()
                # Read in the next line
                line = handle.readline()

        assert len(ids) == len(seqs)
        if len(seqs) == 0 or len(seqs[0]) == 0:
            raise StopIteration

        if (
            self.records_per_alignment is not None
            and self.records_per_alignment != len(ids)
        ):
            raise ValueError(
                "Found %i records in this alignment, told to expect %i"
                % (len(ids), self.records_per_alignment)
            )

        records = (SeqRecord(Seq(s), id=i, description=i) for (i, s) in zip(ids, seqs))
        alignment = MultipleSeqAlignment(records)
        # TODO - Handle alignment annotation better, for now
        # mimic the old parser in Bio.Clustalw
        if version:
            alignment._version = version
        if consensus:
            alignment_length = len(seqs[0])
            if len(consensus) != alignment_length:
                raise ValueError(
                    "Alignment length is %i, consensus length is %i, '%s'"
                    % (alignment_length, len(consensus), consensus)
                )
            alignment.column_annotations["clustal_consensus"] = consensus
            # For backward compatibility prior to .column_annotations:
            alignment._star_info = consensus
        return alignment