Spaces:
No application file
No application file
| # Copyright 2006-2016 by Peter Cock. All rights reserved. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Bio.Align support for the alignment format for input files for PHYLIP tools. | |
| You are expected to use this module via the Bio.Align functions. | |
| """ | |
| from Bio.Align import Alignment | |
| from Bio.Align import interfaces | |
| from Bio.Seq import Seq | |
| from Bio.SeqRecord import SeqRecord | |
| _PHYLIP_ID_WIDTH = 10 | |
| class AlignmentWriter(interfaces.AlignmentWriter): | |
| """Clustalw alignment writer.""" | |
| fmt = "PHYLIP" | |
| def format_alignment(self, alignment): | |
| """Return a string with a single alignment in the Phylip format.""" | |
| names = [] | |
| for record in alignment.sequences: | |
| try: | |
| name = record.id | |
| except AttributeError: | |
| name = "" | |
| else: | |
| name = name.strip() | |
| for char in "[](),": | |
| name = name.replace(char, "") | |
| for char in ":;": | |
| name = name.replace(char, "|") | |
| name = name[:_PHYLIP_ID_WIDTH] | |
| names.append(name) | |
| lines = [] | |
| nseqs, length = alignment.shape | |
| if nseqs == 0: | |
| raise ValueError("Must have at least one sequence") | |
| if length == 0: | |
| raise ValueError("Non-empty sequences are required") | |
| line = "%d %d\n" % (nseqs, length) | |
| lines.append(line) | |
| # From experimentation, the use of tabs is not understood by the | |
| # EMBOSS suite. The nature of the expected white space is not | |
| # defined in the PHYLIP documentation, simply "These are in free | |
| # format, separated by blanks". We'll use spaces to keep EMBOSS | |
| # happy. | |
| for name, sequence in zip(names, alignment): | |
| # Write the entire sequence to one line | |
| line = name[:_PHYLIP_ID_WIDTH].ljust(_PHYLIP_ID_WIDTH) + sequence + "\n" | |
| lines.append(line) | |
| return "".join(lines) | |
| class AlignmentIterator(interfaces.AlignmentIterator): | |
| """Reads a Phylip alignment file and returns an Alignment iterator. | |
| Record names are limited to at most 10 characters. | |
| The parser determines from the file contents if the file format is | |
| sequential or interleaved, and parses the file accordingly. | |
| For more information on the file format, please see: | |
| http://evolution.genetics.washington.edu/phylip/doc/sequence.html | |
| http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles | |
| """ | |
| fmt = "PHYLIP" | |
| def _read_header(self, stream): | |
| try: | |
| line = next(stream) | |
| except StopIteration: | |
| raise ValueError("Empty file.") from None | |
| words = line.split() | |
| if len(words) == 2: | |
| try: | |
| self._number_of_seqs = int(words[0]) | |
| self._length_of_seqs = int(words[1]) | |
| return | |
| except ValueError: | |
| pass | |
| raise ValueError( | |
| "Expected two integers in the first line, received '%s'" % line | |
| ) | |
| def _parse_interleaved_first_block(self, lines, seqs, names): | |
| for line in lines: | |
| line = line.rstrip() | |
| name = line[:_PHYLIP_ID_WIDTH].strip() | |
| seq = line[_PHYLIP_ID_WIDTH:].strip().replace(" ", "") | |
| names.append(name) | |
| seqs.append([seq]) | |
| def _parse_interleaved_other_blocks(self, stream, seqs): | |
| i = 0 | |
| for line in stream: | |
| line = line.rstrip() | |
| if not line: | |
| assert i == self._number_of_seqs | |
| i = 0 | |
| else: | |
| seq = line.replace(" ", "") | |
| seqs[i].append(seq) | |
| i += 1 | |
| if i != 0 and i != self._number_of_seqs: | |
| raise ValueError("Unexpected file format") | |
| def _parse_sequential(self, lines, seqs, names, length): | |
| for line in lines: | |
| if length == 0: | |
| line = line.rstrip() | |
| name = line[:_PHYLIP_ID_WIDTH].strip() | |
| seq = line[_PHYLIP_ID_WIDTH:].strip() | |
| names.append(name) | |
| seqs.append([]) | |
| else: | |
| seq = line.strip() | |
| seq = seq.replace(" ", "") | |
| seqs[-1].append(seq) | |
| length += len(seq) | |
| if length == self._length_of_seqs: | |
| length = 0 | |
| return length | |
| def _read_file(self, stream): | |
| names = [] | |
| seqs = [] | |
| lines = [next(stream) for i in range(self._number_of_seqs)] | |
| try: | |
| line = next(stream) | |
| except StopIteration: | |
| pass | |
| else: | |
| if line.rstrip(): | |
| # sequential file format | |
| lines.append(line) | |
| length = self._parse_sequential(lines, seqs, names, 0) | |
| self._parse_sequential(stream, seqs, names, length) | |
| return names, seqs | |
| # interleaved file format | |
| self._parse_interleaved_first_block(lines, seqs, names) | |
| self._parse_interleaved_other_blocks(stream, seqs) | |
| return names, seqs | |
| def _read_next_alignment(self, stream): | |
| names, seqs = self._read_file(stream) | |
| seqs = ["".join(seq) for seq in seqs] | |
| if len(seqs) != self._number_of_seqs: | |
| raise ValueError( | |
| "Found %i records in this alignment, told to expect %i" | |
| % (len(seqs), self._number_of_seqs) | |
| ) | |
| for seq in seqs: | |
| if len(seq) != self._length_of_seqs: | |
| raise ValueError( | |
| "Expected all sequences to have length %d; found %d" | |
| % (self._length_of_seqs, len(seq)) | |
| ) | |
| if "." in seq: | |
| raise ValueError("PHYLIP format no longer allows dots in sequence") | |
| coordinates = Alignment.infer_coordinates(seqs) | |
| seqs = [seq.replace("-", "") for seq in seqs] | |
| records = [ | |
| SeqRecord(Seq(seq), id=name, description="") | |
| for (name, seq) in zip(names, seqs) | |
| ] | |
| alignment = Alignment(records, coordinates) | |
| del self._number_of_seqs | |
| del self._length_of_seqs | |
| self._close() | |
| return alignment | |