Spaces:
No application file
No application file
| # Copyright 2000, 2004 by Brad Chapman. | |
| # Revisions copyright 2010-2013, 2015-2018 by Peter Cock. | |
| # All rights reserved. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Code for dealing with sequence alignments. | |
| One of the most important things in this module is the MultipleSeqAlignment | |
| class, used in the Bio.AlignIO module. | |
| """ | |
| import sys | |
| import collections | |
| import copy | |
| import importlib | |
| import warnings | |
| import numbers | |
| from itertools import zip_longest | |
| try: | |
| import numpy | |
| except ImportError: | |
| from Bio import MissingPythonDependencyError | |
| raise MissingPythonDependencyError( | |
| "Please install numpy if you want to use Bio.Align. " | |
| "See http://www.numpy.org/" | |
| ) from None | |
| from Bio import BiopythonDeprecationWarning | |
| from Bio.Align import _aligners | |
| from Bio.Align import substitution_matrices | |
| from Bio.Seq import Seq, MutableSeq, reverse_complement, UndefinedSequenceError | |
| from Bio.SeqRecord import SeqRecord, _RestrictedDict | |
# Note: the "from Bio.Align import _aligners" import above may raise an
# ImportError if the compiled aligners.c extension (_aligners.pyd or
# _aligners.so) is missing, or if the user is importing from within the
# Biopython source tree, see PR #2007:
# https://github.com/biopython/biopython/pull/2007
# Immutable record type with three named fields, in this order:
# gaps, identities and mismatches.
AlignmentCounts = collections.namedtuple(
    "AlignmentCounts",
    ("gaps", "identities", "mismatches"),
)
class MultipleSeqAlignment:
    """Represents a classical multiple sequence alignment (MSA).

    By this we mean a collection of sequences (usually shown as rows) which
    are all the same length (usually with gap characters for insertions or
    padding). The data can then be regarded as a matrix of letters, with well
    defined columns.

    You would typically create an MSA by loading an alignment file with the
    AlignIO module:

    >>> from Bio import AlignIO
    >>> align = AlignIO.read("Clustalw/opuntia.aln", "clustal")
    >>> print(align)
    Alignment with 7 rows and 156 columns
    TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191
    TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191
    TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191
    TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191
    TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191
    TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191
    TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191

    In some respects you can treat these objects as lists of SeqRecord objects,
    each representing a row of the alignment. Iterating over an alignment gives
    the SeqRecord object for each row:

    >>> len(align)
    7
    >>> for record in align:
    ...     print("%s %i" % (record.id, len(record)))
    ...
    gi|6273285|gb|AF191659.1|AF191 156
    gi|6273284|gb|AF191658.1|AF191 156
    gi|6273287|gb|AF191661.1|AF191 156
    gi|6273286|gb|AF191660.1|AF191 156
    gi|6273290|gb|AF191664.1|AF191 156
    gi|6273289|gb|AF191663.1|AF191 156
    gi|6273291|gb|AF191665.1|AF191 156

    You can also access individual rows as SeqRecord objects via their index:

    >>> print(align[0].id)
    gi|6273285|gb|AF191659.1|AF191
    >>> print(align[-1].id)
    gi|6273291|gb|AF191665.1|AF191

    And extract columns as strings:

    >>> print(align[:, 1])
    AAAAAAA

    Or, take just the first ten columns as a sub-alignment:

    >>> print(align[:, :10])
    Alignment with 7 rows and 10 columns
    TATACATTAA gi|6273285|gb|AF191659.1|AF191
    TATACATTAA gi|6273284|gb|AF191658.1|AF191
    TATACATTAA gi|6273287|gb|AF191661.1|AF191
    TATACATAAA gi|6273286|gb|AF191660.1|AF191
    TATACATTAA gi|6273290|gb|AF191664.1|AF191
    TATACATTAA gi|6273289|gb|AF191663.1|AF191
    TATACATTAA gi|6273291|gb|AF191665.1|AF191

    Combining this alignment slicing with alignment addition allows you to
    remove a section of the alignment. For example, taking just the first
    and last ten columns:

    >>> print(align[:, :10] + align[:, -10:])
    Alignment with 7 rows and 20 columns
    TATACATTAAGTGTACCAGA gi|6273285|gb|AF191659.1|AF191
    TATACATTAAGTGTACCAGA gi|6273284|gb|AF191658.1|AF191
    TATACATTAAGTGTACCAGA gi|6273287|gb|AF191661.1|AF191
    TATACATAAAGTGTACCAGA gi|6273286|gb|AF191660.1|AF191
    TATACATTAAGTGTACCAGA gi|6273290|gb|AF191664.1|AF191
    TATACATTAAGTATACCAGA gi|6273289|gb|AF191663.1|AF191
    TATACATTAAGTGTACCAGA gi|6273291|gb|AF191665.1|AF191

    Note - This object does NOT attempt to model the kind of alignments used
    in next generation sequencing with multiple sequencing reads which are
    much shorter than the alignment, and where there is usually a consensus or
    reference sequence with special status.
    """

    def __init__(
        self, records, alphabet=None, annotations=None, column_annotations=None
    ):
        """Initialize a new MultipleSeqAlignment object.

        Arguments:
         - records - A list (or iterator) of SeqRecord objects, whose
           sequences are all the same length.  This may be an empty list.
         - alphabet - For backward compatibility only; its value should always
           be None.
         - annotations - Information about the whole alignment (dictionary).
         - column_annotations - Per column annotation (restricted dictionary).
           This holds Python sequences (lists, strings, tuples)
           whose length matches the number of columns. A typical
           use would be a secondary structure consensus string.

        Raises ValueError if alphabet is not None, and TypeError if
        annotations is given but is not a dict.

        You would normally load a MSA from a file using Bio.AlignIO, but you
        can do this from a list of SeqRecord objects too:

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqRecord import SeqRecord
        >>> from Bio.Align import MultipleSeqAlignment
        >>> a = SeqRecord(Seq("AAAACGT"), id="Alpha")
        >>> b = SeqRecord(Seq("AAA-CGT"), id="Beta")
        >>> c = SeqRecord(Seq("AAAAGGT"), id="Gamma")
        >>> align = MultipleSeqAlignment([a, b, c],
        ...                              annotations={"tool": "demo"},
        ...                              column_annotations={"stats": "CCCXCCC"})
        >>> print(align)
        Alignment with 3 rows and 7 columns
        AAAACGT Alpha
        AAA-CGT Beta
        AAAAGGT Gamma
        >>> align.annotations
        {'tool': 'demo'}
        >>> align.column_annotations
        {'stats': 'CCCXCCC'}
        """
        if alphabet is not None:
            raise ValueError("The alphabet argument is no longer supported")

        self._records = []
        if records:
            self.extend(records)

        # Annotations about the whole alignment
        if annotations is None:
            annotations = {}
        elif not isinstance(annotations, dict):
            raise TypeError("annotations argument should be a dict")
        self.annotations = annotations

        # Annotations about each column of the alignment
        if column_annotations is None:
            column_annotations = {}
        # Handle this via the property set function which will validate it
        self.column_annotations = column_annotations

    def _set_per_column_annotations(self, value):
        """Validate and store the per-column annotations (PRIVATE).

        Raises TypeError if value is not a dict, and ValueError if a
        non-empty value is given while the alignment has no rows (the
        number of columns is then undefined).
        """
        if not isinstance(value, dict):
            raise TypeError(
                "The per-column-annotations should be a (restricted) dictionary."
            )
        # Turn this into a restricted-dictionary (and check the entries)
        if len(self):
            # Use the standard method to get the length
            expected_length = self.get_alignment_length()
            self._per_col_annotations = _RestrictedDict(length=expected_length)
            self._per_col_annotations.update(value)
        else:
            # Bit of a problem case... number of columns is undefined
            self._per_col_annotations = None
            if value:
                raise ValueError(
                    "Can't set per-column-annotations without an alignment"
                )

    def _get_per_column_annotations(self):
        """Return the per-column annotations, creating them lazily (PRIVATE)."""
        if self._per_col_annotations is None:
            # This happens if empty at initialisation
            if len(self):
                # Use the standard method to get the length
                expected_length = self.get_alignment_length()
            else:
                # Should this raise an exception? Compare SeqRecord behaviour...
                expected_length = 0
            self._per_col_annotations = _RestrictedDict(length=expected_length)
        return self._per_col_annotations

    column_annotations = property(
        fget=_get_per_column_annotations,
        fset=_set_per_column_annotations,
        doc="""Dictionary of per-column annotation for the alignment.""",
    )

    def _str_line(self, record, length=50):
        """Return a truncated string representation of a SeqRecord (PRIVATE).

        This is a PRIVATE function used by the __str__ method.

        Sequences of at most ``length`` letters are shown in full; longer
        ones are shown as a leading part, "...", and the last three letters.
        """
        seq = record.seq
        if len(seq) <= length:
            return f"{seq} {record.id}"
        # A CodonSeq keeps length - 3 leading letters, anything else
        # keeps length - 6 (so that the line is ``length`` letters wide).
        if seq.__class__.__name__ == "CodonSeq":
            head = length - 3
        else:
            head = length - 6
        return "%s...%s %s" % (seq[:head], seq[-3:], record.id)

    def __str__(self):
        """Return a multi-line string summary of the alignment.

        This output is intended to be readable, but large alignments are
        shown truncated. A maximum of 20 rows (sequences) and 50 columns
        are shown, with the record identifiers. This should fit nicely on a
        single screen. e.g.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqRecord import SeqRecord
        >>> from Bio.Align import MultipleSeqAlignment
        >>> a = SeqRecord(Seq("ACTGCTAGCTAG"), id="Alpha")
        >>> b = SeqRecord(Seq("ACT-CTAGCTAG"), id="Beta")
        >>> c = SeqRecord(Seq("ACTGCTAGATAG"), id="Gamma")
        >>> align = MultipleSeqAlignment([a, b, c])
        >>> print(align)
        Alignment with 3 rows and 12 columns
        ACTGCTAGCTAG Alpha
        ACT-CTAGCTAG Beta
        ACTGCTAGATAG Gamma

        See also the alignment's format method.
        """
        rows = len(self._records)
        lines = [
            "Alignment with %i rows and %i columns"
            % (rows, self.get_alignment_length())
        ]
        if rows <= 20:
            lines.extend(self._str_line(rec) for rec in self._records)
        else:
            # Show the first 18 rows, an ellipsis line, then the last row
            lines.extend(self._str_line(rec) for rec in self._records[:18])
            lines.append("...")
            lines.append(self._str_line(self._records[-1]))
        return "\n".join(lines)

    def __repr__(self):
        """Return a representation of the object for debugging.

        The representation cannot be used with eval() to recreate the object,
        which is usually possible with simple python objects. For example:

        <<class 'Bio.Align.MultipleSeqAlignment'> instance (2 records of
        length 14) at a3c184c>

        The hex string is the memory address of the object, see help(id).
        This provides a simple way to visually distinguish alignments of
        the same size.
        """
        # A doctest for __repr__ would be nice, but __class__ comes out differently
        # if run via the __main__ trick.
        return "<%s instance (%i records of length %i) at %x>" % (
            self.__class__,
            len(self._records),
            self.get_alignment_length(),
            id(self),
        )
        # This version is useful for doing eval(repr(alignment)),
        # but it can be VERY long:
        # return "%s(%r)" \
        #        % (self.__class__, self._records)

    def __format__(self, format_spec):
        """Return the alignment as a string in the specified file format.

        The format should be a lower case string supported as an output
        format by Bio.AlignIO (such as "fasta", "clustal", "phylip",
        "stockholm", etc), which is used to turn the alignment into a
        string.

        e.g.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqRecord import SeqRecord
        >>> from Bio.Align import MultipleSeqAlignment
        >>> a = SeqRecord(Seq("ACTGCTAGCTAG"), id="Alpha", description="")
        >>> b = SeqRecord(Seq("ACT-CTAGCTAG"), id="Beta", description="")
        >>> c = SeqRecord(Seq("ACTGCTAGATAG"), id="Gamma", description="")
        >>> align = MultipleSeqAlignment([a, b, c])
        >>> print(format(align, "fasta"))
        >Alpha
        ACTGCTAGCTAG
        >Beta
        ACT-CTAGCTAG
        >Gamma
        ACTGCTAGATAG
        <BLANKLINE>
        >>> print(format(align, "phylip"))
         3 12
        Alpha      ACTGCTAGCT AG
        Beta       ACT-CTAGCT AG
        Gamma      ACTGCTAGAT AG
        <BLANKLINE>
        """
        if format_spec:
            # Imported locally, only needed when a file format is requested
            from io import StringIO
            from Bio import AlignIO

            handle = StringIO()
            AlignIO.write([self], handle, format_spec)
            return handle.getvalue()
        # Follow python convention and default to using __str__
        return str(self)

    def __iter__(self):
        """Iterate over alignment rows as SeqRecord objects.

        e.g.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqRecord import SeqRecord
        >>> from Bio.Align import MultipleSeqAlignment
        >>> a = SeqRecord(Seq("ACTGCTAGCTAG"), id="Alpha")
        >>> b = SeqRecord(Seq("ACT-CTAGCTAG"), id="Beta")
        >>> c = SeqRecord(Seq("ACTGCTAGATAG"), id="Gamma")
        >>> align = MultipleSeqAlignment([a, b, c])
        >>> for record in align:
        ...     print(record.id)
        ...     print(record.seq)
        ...
        Alpha
        ACTGCTAGCTAG
        Beta
        ACT-CTAGCTAG
        Gamma
        ACTGCTAGATAG
        """
        return iter(self._records)

    def __len__(self):
        """Return the number of sequences in the alignment.

        Use len(alignment) to get the number of sequences (i.e. the number of
        rows), and alignment.get_alignment_length() to get the length of the
        longest sequence (i.e. the number of columns).

        This is easy to remember if you think of the alignment as being like a
        list of SeqRecord objects.
        """
        return len(self._records)

    def get_alignment_length(self):
        """Return the maximum length of the alignment.

        All objects in the alignment should (hopefully) have the same
        length. This function will go through and find this length
        by finding the maximum length of sequences in the alignment.
        An alignment without any rows has length zero.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqRecord import SeqRecord
        >>> from Bio.Align import MultipleSeqAlignment
        >>> a = SeqRecord(Seq("ACTGCTAGCTAG"), id="Alpha")
        >>> b = SeqRecord(Seq("ACT-CTAGCTAG"), id="Beta")
        >>> c = SeqRecord(Seq("ACTGCTAGATAG"), id="Gamma")
        >>> align = MultipleSeqAlignment([a, b, c])
        >>> align.get_alignment_length()
        12

        If you want to know the number of sequences in the alignment,
        use len(align) instead:

        >>> len(align)
        3
        """
        return max((len(record.seq) for record in self._records), default=0)

    def extend(self, records):
        """Add more SeqRecord objects to the alignment as rows.

        They must all have the same length as the original alignment. For
        example,

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqRecord import SeqRecord
        >>> from Bio.Align import MultipleSeqAlignment
        >>> a = SeqRecord(Seq("AAAACGT"), id="Alpha")
        >>> b = SeqRecord(Seq("AAA-CGT"), id="Beta")
        >>> c = SeqRecord(Seq("AAAAGGT"), id="Gamma")
        >>> d = SeqRecord(Seq("AAAACGT"), id="Delta")
        >>> e = SeqRecord(Seq("AAA-GGT"), id="Epsilon")

        First we create a small alignment (three rows):

        >>> align = MultipleSeqAlignment([a, b, c])
        >>> print(align)
        Alignment with 3 rows and 7 columns
        AAAACGT Alpha
        AAA-CGT Beta
        AAAAGGT Gamma

        Now we can extend this alignment with another two rows:

        >>> align.extend([d, e])
        >>> print(align)
        Alignment with 5 rows and 7 columns
        AAAACGT Alpha
        AAA-CGT Beta
        AAAAGGT Gamma
        AAAACGT Delta
        AAA-GGT Epsilon

        Because the alignment object allows iteration over the rows as
        SeqRecords, you can use the extend method with a second alignment
        (provided its sequences have the same length as the original alignment).
        """
        if len(self):
            # Use the standard method to get the length
            expected_length = self.get_alignment_length()
        else:
            # Take the first record's length
            records = iter(records)  # records arg could be list or iterator
            try:
                rec = next(records)
            except StopIteration:
                # Special case, no records
                return
            expected_length = len(rec)
            self._append(rec, expected_length)
            # Can now setup the per-column-annotations as well, set to None
            # while missing the length:
            self.column_annotations = {}
            # Now continue to the rest of the records as usual

        for rec in records:
            self._append(rec, expected_length)

    def append(self, record):
        """Add one more SeqRecord object to the alignment as a new row.

        This must have the same length as the original alignment (unless this is
        the first record).

        >>> from Bio import AlignIO
        >>> align = AlignIO.read("Clustalw/opuntia.aln", "clustal")
        >>> print(align)
        Alignment with 7 rows and 156 columns
        TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191
        TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191
        TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191
        TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191
        TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191
        TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191
        TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191
        >>> len(align)
        7

        We'll now construct a dummy record to append as an example:

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqRecord import SeqRecord
        >>> dummy = SeqRecord(Seq("N"*156), id="dummy")

        Now append this to the alignment,

        >>> align.append(dummy)
        >>> print(align)
        Alignment with 8 rows and 156 columns
        TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191
        TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191
        TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191
        TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191
        TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191
        TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191
        TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191
        NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN dummy
        >>> len(align)
        8
        """
        if self._records:
            self._append(record, self.get_alignment_length())
        else:
            # First row: there is no existing length to compare against
            self._append(record)

    def _append(self, record, expected_length=None):
        """Validate and append a record (PRIVATE).

        Raises TypeError if record is not a SeqRecord, and ValueError if
        expected_length is given and differs from the record's length.
        """
        if not isinstance(record, SeqRecord):
            raise TypeError("New sequence is not a SeqRecord object")

        # Currently the get_alignment_length() call is expensive, so we need
        # to avoid calling it repeatedly for __init__ and extend, hence this
        # private _append method
        if expected_length is not None and len(record) != expected_length:
            # TODO - Use the following more helpful error, but update unit tests
            # raise ValueError("New sequence is not of length %i"
            #                  % self.get_alignment_length())
            raise ValueError("Sequences must all be the same length")

        self._records.append(record)

    def __add__(self, other):
        """Combine two alignments with the same number of rows by adding them.

        If you have two multiple sequence alignments (MSAs), there are two ways to think
        about adding them - by row or by column. Using the extend method adds by row.
        Using the addition operator adds by column. For example,

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqRecord import SeqRecord
        >>> from Bio.Align import MultipleSeqAlignment
        >>> a1 = SeqRecord(Seq("AAAAC"), id="Alpha")
        >>> b1 = SeqRecord(Seq("AAA-C"), id="Beta")
        >>> c1 = SeqRecord(Seq("AAAAG"), id="Gamma")
        >>> a2 = SeqRecord(Seq("GT"), id="Alpha")
        >>> b2 = SeqRecord(Seq("GT"), id="Beta")
        >>> c2 = SeqRecord(Seq("GT"), id="Gamma")
        >>> left = MultipleSeqAlignment([a1, b1, c1],
        ...                             annotations={"tool": "demo", "name": "start"},
        ...                             column_annotations={"stats": "CCCXC"})
        >>> right = MultipleSeqAlignment([a2, b2, c2],
        ...                             annotations={"tool": "demo", "name": "end"},
        ...                             column_annotations={"stats": "CC"})

        Now, let's look at these two alignments:

        >>> print(left)
        Alignment with 3 rows and 5 columns
        AAAAC Alpha
        AAA-C Beta
        AAAAG Gamma
        >>> print(right)
        Alignment with 3 rows and 2 columns
        GT Alpha
        GT Beta
        GT Gamma

        And add them:

        >>> combined = left + right
        >>> print(combined)
        Alignment with 3 rows and 7 columns
        AAAACGT Alpha
        AAA-CGT Beta
        AAAAGGT Gamma

        For this to work, both alignments must have the same number of records (here
        they both have 3 rows):

        >>> len(left)
        3
        >>> len(right)
        3
        >>> len(combined)
        3

        The individual rows are SeqRecord objects, and these can be added together. Refer
        to the SeqRecord documentation for details of how the annotation is handled. This
        example is a special case in that both original alignments shared the same names,
        meaning when the rows are added they also get the same name.

        Any common annotations are preserved, but differing annotation is lost. This is
        the same behaviour used in the SeqRecord annotations and is designed to prevent
        accidental propagation of inappropriate values:

        >>> combined.annotations
        {'tool': 'demo'}

        Similarly any common per-column-annotations are combined:

        >>> combined.column_annotations
        {'stats': 'CCCXCCC'}
        """
        if not isinstance(other, MultipleSeqAlignment):
            raise NotImplementedError
        if len(self) != len(other):
            raise ValueError(
                "When adding two alignments they must have the same length"
                " (i.e. same number or rows)"
            )
        # Row-wise concatenation, evaluated lazily by the constructor below
        merged = (left + right for left, right in zip(self, other))
        # Take any common annotation:
        annotations = {
            k: v
            for k, v in self.annotations.items()
            if k in other.annotations and other.annotations[k] == v
        }
        # Per-column annotation present on both sides is concatenated:
        column_annotations = {
            k: v + other.column_annotations[k]
            for k, v in self.column_annotations.items()
            if k in other.column_annotations
        }
        return MultipleSeqAlignment(
            merged, annotations=annotations, column_annotations=column_annotations
        )

    def __getitem__(self, index):
        """Access part of the alignment.

        Depending on the indices, you can get a SeqRecord object
        (representing a single row), a string (for a single column or
        a single character) or another alignment (representing some part
        or all of the alignment).

        align[r,c] gives a single character as a string
        align[r] gives a row as a SeqRecord
        align[r,:] gives a row as a SeqRecord
        align[:,c] gives a column as a string

        align[:] and align[:,:] give a copy of the alignment

        Anything else gives a sub alignment, e.g.
        align[0:2] or align[0:2,:] uses only row 0 and 1
        align[:,1:3] uses only columns 1 and 2
        align[0:2,1:3] uses only rows 0 & 1 and only cols 1 & 2

        Integer indices may be any numbers.Integral value (for example a
        NumPy integer). An index which is neither an integer, a slice nor
        a (row, column) pair raises TypeError.

        We'll use the following example alignment here for illustration:

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqRecord import SeqRecord
        >>> from Bio.Align import MultipleSeqAlignment
        >>> a = SeqRecord(Seq("AAAACGT"), id="Alpha")
        >>> b = SeqRecord(Seq("AAA-CGT"), id="Beta")
        >>> c = SeqRecord(Seq("AAAAGGT"), id="Gamma")
        >>> d = SeqRecord(Seq("AAAACGT"), id="Delta")
        >>> e = SeqRecord(Seq("AAA-GGT"), id="Epsilon")
        >>> align = MultipleSeqAlignment([a, b, c, d, e])

        You can access a row of the alignment as a SeqRecord using an integer
        index (think of the alignment as a list of SeqRecord objects here):

        >>> first_record = align[0]
        >>> print("%s %s" % (first_record.id, first_record.seq))
        Alpha AAAACGT
        >>> last_record = align[-1]
        >>> print("%s %s" % (last_record.id, last_record.seq))
        Epsilon AAA-GGT

        You can also use python's slice notation to create a sub-alignment
        containing only some of the SeqRecord objects:

        >>> sub_alignment = align[2:5]
        >>> print(sub_alignment)
        Alignment with 3 rows and 7 columns
        AAAAGGT Gamma
        AAAACGT Delta
        AAA-GGT Epsilon

        This includes support for a step, i.e. align[start:end:step], which
        can be used to select every second sequence:

        >>> sub_alignment = align[::2]
        >>> print(sub_alignment)
        Alignment with 3 rows and 7 columns
        AAAACGT Alpha
        AAAAGGT Gamma
        AAA-GGT Epsilon

        Or to get a copy of the alignment with the rows in reverse order:

        >>> rev_alignment = align[::-1]
        >>> print(rev_alignment)
        Alignment with 5 rows and 7 columns
        AAA-GGT Epsilon
        AAAACGT Delta
        AAAAGGT Gamma
        AAA-CGT Beta
        AAAACGT Alpha

        You can also use two indices to specify both rows and columns. Using simple
        integers gives you the entry as a single character string. e.g.

        >>> align[3, 4]
        'C'

        This is equivalent to:

        >>> align[3][4]
        'C'

        or:

        >>> align[3].seq[4]
        'C'

        To get a single column (as a string) use this syntax:

        >>> align[:, 4]
        'CCGCG'

        Or, to get part of a column,

        >>> align[1:3, 4]
        'CG'

        However, in general you get a sub-alignment,

        >>> print(align[1:5, 3:6])
        Alignment with 4 rows and 3 columns
        -CG Beta
        AGG Gamma
        ACG Delta
        -GG Epsilon

        This should all seem familiar to anyone who has used the NumPy
        array or matrix objects.
        """
        if isinstance(index, numbers.Integral):
            # e.g. result = align[x]
            # Return a SeqRecord
            return self._records[index]
        if isinstance(index, slice):
            # e.g. sub_align = align[i:j:k]
            new = MultipleSeqAlignment(self._records[index])
            if self.column_annotations and len(new) == len(self):
                # All rows kept (although could have been reversed)
                # Preserve the column annotations too,
                for k, v in self.column_annotations.items():
                    new.column_annotations[k] = v
            return new

        # Handle double indexing; anything that is not a (row, column) pair
        # is rejected with a uniform TypeError.
        try:
            row_index, col_index = index
        except (TypeError, ValueError):
            raise TypeError("Invalid index type.") from None

        if isinstance(row_index, numbers.Integral):
            # e.g. row_or_part_row = align[6, 1:4], gives a SeqRecord
            return self._records[row_index][col_index]
        if isinstance(col_index, numbers.Integral):
            # e.g. col_or_part_col = align[1:5, 6], gives a string
            return "".join(rec[col_index] for rec in self._records[row_index])
        # e.g. sub_align = align[1:4, 5:7], gives another alignment
        new = MultipleSeqAlignment(
            rec[col_index] for rec in self._records[row_index]
        )
        if self.column_annotations and len(new) == len(self):
            # All rows kept (although could have been reversed)
            # Preserve the column annotations too,
            for k, v in self.column_annotations.items():
                new.column_annotations[k] = v[col_index]
        return new

    def sort(self, key=None, reverse=False):
        """Sort the rows (SeqRecord objects) of the alignment in place.

        This sorts the rows alphabetically using the SeqRecord object id by
        default. The sorting can be controlled by supplying a key function
        which must map each SeqRecord to a sort value.

        This is useful if you want to add two alignments which use the same
        record identifiers, but in a different order. For example,

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqRecord import SeqRecord
        >>> from Bio.Align import MultipleSeqAlignment
        >>> align1 = MultipleSeqAlignment([
        ...              SeqRecord(Seq("ACGT"), id="Human"),
        ...              SeqRecord(Seq("ACGG"), id="Mouse"),
        ...              SeqRecord(Seq("ACGC"), id="Chicken"),
        ...          ])
        >>> align2 = MultipleSeqAlignment([
        ...              SeqRecord(Seq("CGGT"), id="Mouse"),
        ...              SeqRecord(Seq("CGTT"), id="Human"),
        ...              SeqRecord(Seq("CGCT"), id="Chicken"),
        ...          ])

        If you simply try and add these without sorting, you get this:

        >>> print(align1 + align2)
        Alignment with 3 rows and 8 columns
        ACGTCGGT <unknown id>
        ACGGCGTT <unknown id>
        ACGCCGCT Chicken

        Consult the SeqRecord documentation which explains why you get a
        default value when annotation like the identifier doesn't match up.
        However, if we sort the alignments first, then add them we get the
        desired result:

        >>> align1.sort()
        >>> align2.sort()
        >>> print(align1 + align2)
        Alignment with 3 rows and 8 columns
        ACGCCGCT Chicken
        ACGTCGTT Human
        ACGGCGGT Mouse

        As an example using a different sort order, you could sort on the
        GC content of each sequence.

        >>> from Bio.SeqUtils import gc_fraction
        >>> print(align1)
        Alignment with 3 rows and 4 columns
        ACGC Chicken
        ACGT Human
        ACGG Mouse
        >>> align1.sort(key = lambda record: gc_fraction(record.seq))
        >>> print(align1)
        Alignment with 3 rows and 4 columns
        ACGT Human
        ACGC Chicken
        ACGG Mouse

        There is also a reverse argument, so if you wanted to sort by ID
        but backwards:

        >>> align1.sort(reverse=True)
        >>> print(align1)
        Alignment with 3 rows and 4 columns
        ACGG Mouse
        ACGT Human
        ACGC Chicken
        """
        if key is None:
            # Default ordering is by record identifier
            self._records.sort(key=lambda r: r.id, reverse=reverse)
        else:
            self._records.sort(key=key, reverse=reverse)

    @property
    def substitutions(self):
        """Return an Array with the number of substitutions of letters in the alignment.

        As an example, consider a multiple sequence alignment of three DNA sequences:

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqRecord import SeqRecord
        >>> from Bio.Align import MultipleSeqAlignment
        >>> seq1 = SeqRecord(Seq("ACGT"), id="seq1")
        >>> seq2 = SeqRecord(Seq("A--A"), id="seq2")
        >>> seq3 = SeqRecord(Seq("ACGT"), id="seq3")
        >>> seq4 = SeqRecord(Seq("TTTC"), id="seq4")
        >>> alignment = MultipleSeqAlignment([seq1, seq2, seq3, seq4])
        >>> print(alignment)
        Alignment with 4 rows and 4 columns
        ACGT seq1
        A--A seq2
        ACGT seq3
        TTTC seq4

        >>> m = alignment.substitutions
        >>> print(m)
            A   C   G   T
        A 3.0 0.5 0.0 2.5
        C 0.5 1.0 0.0 2.0
        G 0.0 0.0 1.0 1.0
        T 2.5 2.0 1.0 1.0
        <BLANKLINE>

        Note that the matrix is symmetric, with counts divided equally on both
        sides of the diagonal. For example, the total number of substitutions
        between A and T in the alignment is 2.5 + 2.5 = 5.

        Any weights associated with the sequences are taken into account when
        calculating the substitution matrix.  For example, given the following
        multiple sequence alignment::

            GTATC  0.5
            AT--C  0.8
            CTGTC  1.0

        For the first column we have::

            ('A', 'G') : 0.5 * 0.8 = 0.4
            ('C', 'G') : 0.5 * 1.0 = 0.5
            ('A', 'C') : 0.8 * 1.0 = 0.8

        """
        # Collect every letter used in any row; starting from an empty set
        # means an alignment without rows does not fail at this step.
        letters = set().union(*(set(record.seq) for record in self))
        # The gap character is not a letter of the substitution alphabet
        letters.discard("-")
        letters = "".join(sorted(letters))
        m = substitution_matrices.Array(letters, dims=2)
        for rec_num1, alignment1 in enumerate(self):
            seq1 = alignment1.seq
            weight1 = alignment1.annotations.get("weight", 1.0)
            # Pair each row only with the rows before it, so every
            # unordered pair of rows is counted exactly once.
            for alignment2 in self._records[:rec_num1]:
                seq2 = alignment2.seq
                weight2 = alignment2.annotations.get("weight", 1.0)
                for residue1, residue2 in zip(seq1, seq2):
                    if residue1 == "-" or residue2 == "-":
                        continue
                    m[(residue1, residue2)] += weight1 * weight2

        # Symmetrise, splitting each count equally across the diagonal
        m += m.transpose()
        m /= 2.0

        return m
| class Alignment: | |
| """Represents a sequence alignment. | |
| An Alignment object has a `.sequences` attribute storing the sequences | |
| (Seq, MutableSeq, SeqRecord, or string objects) that were aligned, as well | |
| as a `.coordinates` attribute storing the sequence coordinates defining the | |
| alignment as a numpy array. | |
| Other commonly used attributes (which may or may not be present) are: | |
| - annotations - A dictionary with annotations describing the | |
| alignment; | |
| - column_annotations - A dictionary with annotations describing each | |
| column in the alignment; | |
| - score - The alignment score. | |
| """ | |
@classmethod
def infer_coordinates(cls, lines, skipped_columns=None):
    """Infer the coordinates from a printed alignment.

    This method is primarily employed in Biopython's alignment parsers,
    though it may be useful for other purposes.

    For an alignment consisting of N sequences, printed as N lines with
    the same number of columns, where gaps are represented by dashes,
    this method will calculate the sequence coordinates that define the
    alignment. The coordinates are returned as a numpy array of integers,
    and can be used to create an Alignment object.

    The argument skipped_columns should be None (the default) or an empty
    list. If skipped_columns is a list, then the indices of any columns in
    the alignment with a gap in all lines are appended to skipped_columns.

    This is an example for the alignment of three sequences TAGGCATACGTG,
    AACGTACGT, and ACGCATACTTG, with gaps in the second and third sequence:

    >>> from Bio.Align import Alignment
    >>> lines = ["TAGGCATACGTG",
    ...          "AACG--TACGT-",
    ...          "-ACGCATACTTG",
    ...         ]
    >>> sequences = [line.replace("-", "") for line in lines]
    >>> sequences
    ['TAGGCATACGTG', 'AACGTACGT', 'ACGCATACTTG']
    >>> coordinates = Alignment.infer_coordinates(lines)
    >>> coordinates
    array([[ 0,  1,  4,  6, 11, 12],
           [ 0,  1,  4,  4,  9,  9],
           [ 0,  0,  3,  5, 10, 11]])
    >>> alignment = Alignment(sequences, coordinates)
    """
    n = len(lines)
    m = len(lines[0])
    for line in lines:
        assert m == len(line)
    path = []
    if m > 0:
        indices = [0] * n
        # None marks "no column seen yet"; it never equals a list of
        # booleans, so the first non-skipped column always opens a block.
        current_state = [None] * n
        # Width of the current block. The initial value is never added
        # to an index, because all initial states are falsy.
        step = 0
        for i in range(m):
            next_state = [line[i] != "-" for line in lines]
            if not any(next_state):
                # skip columns in which all rows have a gap
                if skipped_columns is not None:
                    skipped_columns.append(i)
            elif next_state == current_state:
                step += 1
            else:
                # The gap pattern changed: close the current block by
                # advancing every row that had a letter in it.
                indices = [
                    index + step if state else index
                    for index, state in zip(indices, current_state)
                ]
                path.append(indices)
                step = 1
                current_state = next_state
        # Close the final block.
        indices = [
            index + step if state else index
            for index, state in zip(indices, current_state)
        ]
        path.append(indices)
    coordinates = numpy.array(path).transpose()
    return coordinates
| def __init__(self, sequences, coordinates=None): | |
| """Initialize a new Alignment object. | |
| Arguments: | |
| - sequences - A list of the sequences (Seq, MutableSeq, SeqRecord, | |
| or string objects) that were aligned. | |
| - coordinates - The sequence coordinates that define the alignment. | |
| If None (the default value), assume that the sequences | |
| align to each other without any gaps. | |
| """ | |
| self.sequences = sequences | |
| if coordinates is None: | |
| try: | |
| lengths = {len(sequence) for sequence in sequences} | |
| except TypeError: | |
| # this may happen if sequences contain a SeqRecord where | |
| # the seq attribute is None, as neither the sequence nor | |
| # its length are known. | |
| pass | |
| else: | |
| if len(lengths) == 0: | |
| coordinates = numpy.empty((0, 0), dtype=int) | |
| elif len(lengths) == 1: | |
| length = lengths.pop() | |
| coordinates = numpy.array([[0, length]] * len(sequences)) | |
| else: | |
| raise ValueError( | |
| "sequences must have the same length if coordinates is None" | |
| ) | |
| self.coordinates = coordinates | |
| def __array__(self, dtype=None): | |
| coordinates = self.coordinates.copy() | |
| sequences = list(self.sequences) | |
| steps = numpy.diff(self.coordinates, 1) | |
| aligned = sum(steps != 0, 0) > 1 | |
| # True for steps in which at least two sequences align, False if a gap | |
| for i, sequence in enumerate(sequences): | |
| row = steps[i, aligned] | |
| if (row >= 0).all(): | |
| pass | |
| elif (row <= 0).all(): | |
| sequences[i] = reverse_complement(sequence, inplace=False) | |
| coordinates[i, :] = len(sequence) - coordinates[i, :] | |
| steps[i, :] = -steps[i, :] | |
| else: | |
| raise ValueError(f"Inconsistent steps in row {i}") | |
| gaps = steps.max(0) | |
| if not ((steps == gaps) | (steps <= 0)).all(): | |
| raise ValueError("Unequal step sizes in alignment") | |
| n = len(steps) | |
| m = sum(gaps) | |
| data = numpy.empty((n, m), "S1") | |
| for i in range(n): | |
| sequence = sequences[i] | |
| k = coordinates[i, 0] | |
| m = 0 | |
| for step, gap in zip(steps[i], gaps): | |
| if step > 0: | |
| j = k + step | |
| n = m + step | |
| try: | |
| subsequence = bytes(sequence[k:j]) | |
| except TypeError: # str | |
| subsequence = bytes(sequence[k:j], "UTF8") | |
| data[i, :].data.cast("B")[m:n] = subsequence | |
| k = j | |
| m = n | |
| elif step < 0: | |
| k += step | |
| else: # step == 0 | |
| n = m + gap | |
| data[i, m:n] = b"-" | |
| m = n | |
| if dtype is not None: | |
| data = numpy.array(data, dtype) | |
| return data | |
@property
def target(self):
    """Return self.sequences[0] for a pairwise alignment.

    Raises a ValueError if the alignment does not consist of exactly two
    sequences.
    """
    n = len(self.sequences)
    if n != 2:
        raise ValueError(
            "self.target is defined for pairwise alignments only (found alignment of %d sequences)"
            % n
        )
    return self.sequences[0]

@target.setter
def target(self, value):
    """For a pairwise alignment, set self.sequences[0].

    Raises a ValueError if the alignment does not consist of exactly two
    sequences.
    """
    n = len(self.sequences)
    if n != 2:
        raise ValueError(
            "self.target is defined for pairwise alignments only (found alignment of %d sequences)"
            % n
        )
    self.sequences[0] = value
@property
def query(self):
    """Return self.sequences[1] for a pairwise alignment.

    Raises a ValueError if the alignment does not consist of exactly two
    sequences.
    """
    n = len(self.sequences)
    if n != 2:
        raise ValueError(
            "self.query is defined for pairwise alignments only (found alignment of %d sequences)"
            % n
        )
    return self.sequences[1]

@query.setter
def query(self, value):
    """For a pairwise alignment, set self.sequences[1].

    Raises a ValueError if the alignment does not consist of exactly two
    sequences.
    """
    n = len(self.sequences)
    if n != 2:
        raise ValueError(
            "self.query is defined for pairwise alignments only (found alignment of %d sequences)"
            % n
        )
    self.sequences[1] = value
| def __eq__(self, other): | |
| """Check if two Alignment objects specify the same alignment.""" | |
| for left, right in zip_longest(self.sequences, other.sequences): | |
| try: | |
| left = left.seq | |
| except AttributeError: | |
| pass | |
| try: | |
| right = right.seq | |
| except AttributeError: | |
| pass | |
| if left != right: | |
| return False | |
| return numpy.array_equal(self.coordinates, other.coordinates) | |
| def __ne__(self, other): | |
| """Check if two Alignment objects have different alignments.""" | |
| for left, right in zip_longest(self.sequences, other.sequences): | |
| try: | |
| left = left.seq | |
| except AttributeError: | |
| pass | |
| try: | |
| right = right.seq | |
| except AttributeError: | |
| pass | |
| if left != right: | |
| return True | |
| return not numpy.array_equal(self.coordinates, other.coordinates) | |
| def __lt__(self, other): | |
| """Check if self should come before other.""" | |
| for left, right in zip_longest(self.sequences, other.sequences): | |
| try: | |
| left = left.seq | |
| except AttributeError: | |
| pass | |
| try: | |
| right = right.seq | |
| except AttributeError: | |
| pass | |
| if left < right: | |
| return True | |
| if left > right: | |
| return False | |
| for left, right in zip( | |
| self.coordinates.transpose(), other.coordinates.transpose() | |
| ): | |
| left, right = tuple(left), tuple(right) | |
| if left < right: | |
| return True | |
| if left > right: | |
| return False | |
| return False | |
| def __le__(self, other): | |
| """Check if self should come before or is equal to other.""" | |
| for left, right in zip_longest(self.sequences, other.sequences): | |
| try: | |
| left = left.seq | |
| except AttributeError: | |
| pass | |
| try: | |
| right = right.seq | |
| except AttributeError: | |
| pass | |
| if left < right: | |
| return True | |
| if left > right: | |
| return False | |
| for left, right in zip( | |
| self.coordinates.transpose(), other.coordinates.transpose() | |
| ): | |
| left, right = tuple(left), tuple(right) | |
| if left < right: | |
| return True | |
| if left > right: | |
| return False | |
| return True | |
| def __gt__(self, other): | |
| """Check if self should come after other.""" | |
| for left, right in zip_longest(self.sequences, other.sequences): | |
| try: | |
| left = left.seq | |
| except AttributeError: | |
| pass | |
| try: | |
| right = right.seq | |
| except AttributeError: | |
| pass | |
| if left < right: | |
| return False | |
| if left > right: | |
| return True | |
| for left, right in zip( | |
| self.coordinates.transpose(), other.coordinates.transpose() | |
| ): | |
| left, right = tuple(left), tuple(right) | |
| if left > right: | |
| return True | |
| if left < right: | |
| return False | |
| return False | |
| def __ge__(self, other): | |
| """Check if self should come after or is equal to other.""" | |
| for left, right in zip_longest(self.sequences, other.sequences): | |
| try: | |
| left = left.seq | |
| except AttributeError: | |
| pass | |
| try: | |
| right = right.seq | |
| except AttributeError: | |
| pass | |
| if left < right: | |
| return False | |
| if left > right: | |
| return True | |
| for left, right in zip( | |
| self.coordinates.transpose(), other.coordinates.transpose() | |
| ): | |
| left, right = tuple(left), tuple(right) | |
| if left > right: | |
| return True | |
| if left < right: | |
| return False | |
| return True | |
@property
def path(self):
    """Return the path through the trace matrix (deprecated).

    The path is the transposed coordinates array, returned as a tuple of
    tuples. A BiopythonDeprecationWarning is issued on every access;
    use the coordinates attribute instead.
    """
    # stacklevel=2 attributes the warning to the code using the attribute.
    warnings.warn(
        "The path attribute is deprecated; please use the coordinates "
        "attribute instead. The coordinates attribute is a numpy array "
        "containing the same values as the path attributes, after "
        "transposition.",
        BiopythonDeprecationWarning,
        stacklevel=2,
    )
    return tuple(tuple(row) for row in self.coordinates.transpose())

@path.setter
def path(self, value):
    """Set the coordinates from a path through the trace matrix (deprecated).

    The value is converted to a numpy array and transposed before it is
    stored as the coordinates attribute. A BiopythonDeprecationWarning is
    issued on every assignment; set the coordinates attribute instead.
    """
    # stacklevel=2 attributes the warning to the code using the attribute.
    warnings.warn(
        "The path attribute is deprecated; please use the coordinates "
        "attribute instead. The coordinates attribute is a numpy array "
        "containing the same values as the path attributes, after "
        "transposition.",
        BiopythonDeprecationWarning,
        stacklevel=2,
    )
    self.coordinates = numpy.array(value).transpose()
| def _get_row(self, index): | |
| """Return self[index], where index is an integer (PRIVATE). | |
| This method is called by __getitem__ for invocations of the form | |
| self[row] | |
| where row is an integer. | |
| Return value is a string if the aligned sequences are string, Seq, | |
| or SeqRecord objects, otherwise the return value is a list. | |
| """ | |
| steps = numpy.diff(self.coordinates, 1) | |
| n = len(steps) | |
| if index < 0: | |
| index += n | |
| if index < 0: | |
| raise IndexError("row index out of range") | |
| elif index >= n: | |
| raise IndexError("row index out of range") | |
| aligned = sum(steps != 0, 0) > 1 | |
| # True for steps in which at least two sequences align, False if a gap | |
| coordinates = self.coordinates[index, :] | |
| sequence = self.sequences[index] | |
| for i in range(n): | |
| row = steps[i, aligned] | |
| if (row >= 0).all(): | |
| pass | |
| elif (row <= 0).all(): | |
| steps[i, :] = -steps[i, :] | |
| if i == index: | |
| sequence = reverse_complement(sequence, inplace=False) | |
| coordinates = len(sequence) - coordinates | |
| else: | |
| raise ValueError(f"Inconsistent steps in row {index}") | |
| gaps = steps.max(0) | |
| if not ((steps == gaps) | (steps <= 0)).all(): | |
| raise ValueError("Unequal step sizes in alignment") | |
| try: | |
| sequence = sequence.seq # SeqRecord confusion | |
| except AttributeError: | |
| pass | |
| steps = steps[index] | |
| k = coordinates[0] | |
| if isinstance(sequence, (str, Seq)): | |
| line = "" | |
| for step, gap in zip(steps, gaps): | |
| if step > 0: | |
| j = k + step | |
| line += str(sequence[k:j]) | |
| k = j | |
| elif step < 0: | |
| k += step | |
| else: # step == 0 | |
| line += "-" * gap | |
| else: | |
| line = [] | |
| for step, gap in zip(steps, gaps): | |
| if step > 0: | |
| j = k + step | |
| line.extend(sequence[k:j]) | |
| k = j | |
| else: | |
| line.extend([None] * gap) | |
| return line | |
def _get_rows(self, key):
    """Return self[key], where key is a slice object (PRIVATE).

    This method is called by __getitem__ for invocations of the form

    self[rows]

    where rows is a slice object. Return value is an Alignment object
    holding the selected sequences and a copy of their coordinates.
    """
    selected_sequences = self.sequences[key]
    selected_coordinates = self.coordinates[key].copy()
    subalignment = Alignment(selected_sequences, selected_coordinates)
    # The score is carried over only if the selection kept every row.
    if numpy.array_equal(self.coordinates, selected_coordinates):
        try:
            subalignment.score = self.score
        except AttributeError:
            # no score was stored on this alignment
            pass
    # The columns are unchanged, so the column annotations still apply;
    # the dictionary is shared with this alignment rather than copied.
    try:
        subalignment.column_annotations = self.column_annotations
    except AttributeError:
        pass
    return subalignment
| def _get_row_col(self, j, col, steps, gaps, sequence): | |
| """Return the sequence contents at alignment column j (PRIVATE). | |
| This method is called by __getitem__ for invocations of the form | |
| self[row, col] | |
| where both row and col are integers. | |
| Return value is a string of length 1. | |
| """ | |
| indices = gaps.cumsum() | |
| index = indices.searchsorted(col, side="right") | |
| if steps[index]: | |
| offset = col - indices[index] | |
| j += sum(steps[: index + 1]) + offset | |
| return sequence[j] | |
| else: | |
| return "-" | |
| def _get_row_cols_slice( | |
| self, coordinate, start_index, stop_index, steps, gaps, sequence | |
| ): | |
| """Return the alignment contents of one row and consecutive columns (PRIVATE). | |
| This method is called by __getitem__ for invocations of the form | |
| self[row, cols] | |
| where row is an integer and cols is a slice object with step 1. | |
| Return value is a string if the aligned sequences are string, Seq, | |
| or SeqRecord objects, otherwise the return value is a list. | |
| """ | |
| indices = gaps.cumsum() | |
| i = indices.searchsorted(start_index, side="right") | |
| j = i + indices[i:].searchsorted(stop_index, side="right") | |
| try: | |
| sequence = sequence.seq # stupid SeqRecord | |
| except AttributeError: | |
| pass | |
| if isinstance(sequence, (str, Seq)): | |
| if i == j: | |
| length = stop_index - start_index | |
| if steps[i] == 0: | |
| line = "-" * length | |
| else: | |
| start = coordinate[i] + start_index - indices[i - 1] | |
| stop = start + length | |
| line = str(sequence[start:stop]) | |
| else: | |
| length = indices[i] - start_index | |
| if steps[i] == 0: | |
| line = "-" * length | |
| else: | |
| stop = coordinate[i + 1] | |
| start = stop - length | |
| line = str(sequence[start:stop]) | |
| i += 1 | |
| while i < j: | |
| step = gaps[i] | |
| if steps[i] == 0: | |
| line += "-" * step | |
| else: | |
| start = coordinate[i] | |
| stop = coordinate[i + 1] | |
| line += str(sequence[start:stop]) | |
| i += 1 | |
| length = stop_index - indices[i - 1] | |
| if length > 0: | |
| if steps[i] == 0: | |
| line += "-" * length | |
| else: | |
| start = coordinate[i] | |
| stop = start + length | |
| line += str(sequence[start:stop]) | |
| else: | |
| if i == j: | |
| length = stop_index - start_index | |
| if steps[i] == 0: | |
| line = [None] * length | |
| else: | |
| start = coordinate[i] + start_index - indices[i - 1] | |
| stop = start + length | |
| line = sequence[start:stop] | |
| else: | |
| length = indices[i] - start_index | |
| if steps[i] == 0: | |
| line = [None] * length | |
| else: | |
| stop = coordinate[i + 1] | |
| start = stop - length | |
| line = sequence[start:stop] | |
| i += 1 | |
| while i < j: | |
| step = gaps[i] | |
| if steps[i] == 0: | |
| line.extend([None] * step) | |
| else: | |
| start = coordinate[i] | |
| stop = coordinate[i + 1] | |
| line.extend(sequence[start:stop]) | |
| i += 1 | |
| length = stop_index - indices[i - 1] | |
| if length > 0: | |
| if steps[j] == 0: | |
| line.extend([None] * length) | |
| else: | |
| start = coordinate[i] | |
| stop = start + length | |
| line.extend(sequence[start:stop]) | |
| return line | |
def _get_row_cols_iterable(self, coordinate, cols, gaps, sequence):
    """Return the alignment contents of one row and multiple columns (PRIVATE).

    This method is called by __getitem__ for invocations of the form

    self[row, cols]

    where row is an integer and cols is an iterable of integers.
    The complete aligned row is written out first, after which the
    requested columns are picked from it.

    Return value is a string if the aligned sequences are string, Seq,
    or SeqRecord objects, otherwise the return value is a list in which
    gaps are represented by None.

    Raises an IndexError if a column is out of range, and a TypeError if
    cols cannot be used as an iterable of integers.
    """
    # For a SeqRecord, work on the sequence it wraps.
    sequence = getattr(sequence, "seq", sequence)
    as_text = isinstance(sequence, (str, Seq))
    pieces = []
    start = coordinate[0]
    for end, gap in zip(coordinate[1:], gaps):
        if start < end:
            segment = sequence[start:end]
            if as_text:
                pieces.append(str(segment))
            else:
                pieces.extend(segment)
        elif as_text:
            pieces.append("-" * gap)
        else:
            pieces.extend([None] * gap)
        start = end
    full_row = "".join(pieces) if as_text else pieces
    try:
        selected = [full_row[col] for col in cols]
    except IndexError:
        raise
    except Exception:
        raise TypeError(
            "second index must be an integer, slice, or iterable of integers"
        ) from None
    if as_text:
        return "".join(selected)
    return selected
| def _get_rows_col(self, coordinates, col, steps, gaps, sequences): | |
| """Return the alignment contents of multiple rows and one column (PRIVATE). | |
| This method is called by __getitem__ for invocations of the form | |
| self[rows, col] | |
| where rows is a slice object, and col is an integer. | |
| Return value is a string. | |
| """ | |
| indices = gaps.cumsum() | |
| j = indices.searchsorted(col, side="right") | |
| offset = indices[j] - col | |
| line = "" | |
| for sequence, coordinate, step in zip(sequences, coordinates, steps): | |
| if step[j] == 0: | |
| line += "-" | |
| else: | |
| index = coordinate[j] + step[j] - offset | |
| line += sequence[index] | |
| return line | |
def _get_rows_cols_slice(
    self, coordinates, row, start_index, stop_index, steps, gaps
):
    """Return a subalignment of multiple rows and consecutive columns (PRIVATE).

    This method is called by __getitem__ for invocations of the form

    self[rows, cols]

    where rows is an arbitrary slice object, and cols is a slice object
    with step 1, allowing the alignment sequences to be reused in the
    subalignment.

    Arguments:
     - coordinates - the sequence coordinates of the selected rows; this
                     array is modified in place;
     - row         - the slice object selecting the rows;
     - start_index - the first alignment column to include;
     - stop_index  - the alignment column at which to stop;
     - steps       - the steps of the selected rows;
     - gaps        - the width of each block of alignment columns.

    Return value is an Alignment object.
    """
    # Rows for which the caller passed coordinates that differ from the
    # stored ones; __getitem__ only changes the coordinates of rows that
    # it mapped to the reverse strand.
    rcs = numpy.any(coordinates != self.coordinates[row], axis=1)
    indices = gaps.cumsum()
    # i is the block in which start_index is located; column j of the
    # coordinates closes the last block needed to reach stop_index.
    i = indices.searchsorted(start_index, side="right")
    j = i + indices[i:].searchsorted(stop_index, side="left") + 1
    # Move the first coordinate forward to start_index, and the last
    # coordinate back to stop_index, but only for rows that have a
    # positive step in that block.
    offset = steps[:, i] - indices[i] + start_index
    coordinates[:, i] += offset * (steps[:, i] > 0)
    offset = indices[j - 1] - stop_index
    coordinates[:, j] -= offset * (steps[:, j - 1] > 0)
    coordinates = coordinates[:, i : j + 1]
    # The sequences as stored on this alignment are reused.
    sequences = self.sequences[row]
    for coordinate, rc, sequence in zip(coordinates, rcs, sequences):
        if rc:
            # mapped to reverse strand
            coordinate[:] = len(sequence) - coordinate[:]
    alignment = Alignment(sequences, coordinates)
    # The score is carried over only if the coordinates are unchanged.
    if numpy.array_equal(self.coordinates, coordinates):
        try:
            alignment.score = self.score
        except AttributeError:
            pass
    try:
        column_annotations = self.column_annotations
    except AttributeError:
        pass
    else:
        # Restrict each column annotation to the selected columns.
        alignment.column_annotations = {}
        for key, value in column_annotations.items():
            value = value[start_index:stop_index]
            try:
                value = value.copy()
            except AttributeError:
                # types without a copy method, such as str and tuple
                pass
            alignment.column_annotations[key] = value
    return alignment
def _get_rows_cols_iterable(self, coordinates, col, steps, gaps, sequences):
    """Return a subalignment of multiple rows and columns (PRIVATE).

    This method is called by __getitem__ for invocations of the form

    self[rows, cols]

    where rows is a slice object and cols is an iterable of integers.
    This method will create new sequences for use by the subalignment
    object; the entries of the sequences list are replaced by them.
    Return value is an Alignment object.

    Raises an IndexError if a column is out of range, and a TypeError if
    the columns cannot be used as integer indices.
    """
    indices = tuple(col)
    lines = []
    for i, sequence in enumerate(sequences):
        try:
            s = sequence.seq  # SeqRecord: use the wrapped sequence
        except AttributeError:
            s = sequence
            is_record = False
        else:
            is_record = True
        # Write out the complete aligned row, including its gaps.
        pieces = []
        k = coordinates[i, 0]
        for step, gap in zip(steps[i], gaps):
            if step:
                j = k + step
                pieces.append(str(s[k:j]))
                k = j
            else:
                pieces.append("-" * gap)
        full_row = "".join(pieces)
        # Pick the requested columns from the row.
        try:
            line = "".join(full_row[index] for index in indices)
        except IndexError:
            raise
        except Exception:
            raise TypeError(
                "second index must be an integer, slice, or iterable of integers"
            ) from None
        lines.append(line)
        # The new sequence consists of the selected letters without gaps.
        s = s.__class__(line.replace("-", ""))
        if is_record:
            sequence = copy.deepcopy(sequence)
            sequence.seq = s
        else:
            sequence = s
        sequences[i] = sequence
    coordinates = self.infer_coordinates(lines)
    alignment = Alignment(sequences, coordinates)
    try:
        column_annotations = self.column_annotations
    except AttributeError:
        return alignment
    # Pick the same columns from each of the column annotations.
    alignment.column_annotations = {}
    for key, value in column_annotations.items():
        selected = (value[index] for index in indices)
        if isinstance(value, str):
            alignment.column_annotations[key] = "".join(selected)
        else:
            alignment.column_annotations[key] = value.__class__(selected)
    return alignment
def __getitem__(self, key):
    """Return self[key].

    Indices of the form

    self[:, :]

    return a copy of the Alignment object;

    self[:, i:]
    self[:, :j]
    self[:, i:j]
    self[:, iterable] (where iterable returns integers)

    return a new Alignment object spanning the selected columns;

    self[k, i]
    self[k, i:]
    self[k, :j]
    self[k, i:j]
    self[k, iterable] (where iterable returns integers)
    self[k] (equivalent to self[k, :])

    return a string with the aligned sequence (including gaps) for the
    selected columns, where k = 0 represents the target and k = 1
    represents the query sequence; and

    self[:, i]

    returns a string with the selected column in the alignment.

    >>> from Bio.Align import PairwiseAligner
    >>> aligner = PairwiseAligner()
    >>> alignments = aligner.align("ACCGGTTT", "ACGGGTT")
    >>> alignment = alignments[0]
    >>> print(alignment)
    target            0 ACCGG-TTT 8
                      0 ||-||-||- 9
    query             0 AC-GGGTT- 7
    <BLANKLINE>
    >>> alignment[0, :]
    'ACCGG-TTT'
    >>> alignment[1, :]
    'AC-GGGTT-'
    >>> alignment[0]
    'ACCGG-TTT'
    >>> alignment[1]
    'AC-GGGTT-'
    >>> alignment[0, 1:-2]
    'CCGG-T'
    >>> alignment[1, 1:-2]
    'C-GGGT'
    >>> alignment[0, (1, 5, 2)]
    'C-C'
    >>> alignment[1, ::2]
    'A-GT-'
    >>> alignment[1, range(0, 9, 2)]
    'A-GT-'
    >>> alignment[:, 0]
    'AA'
    >>> alignment[:, 5]
    '-G'
    >>> alignment[:, 1:]  # doctest:+ELLIPSIS
    <Alignment object (2 rows x 8 columns) at 0x...>
    >>> print(alignment[:, 1:])
    target            1 CCGG-TTT 8
                      0 |-||-||- 8
    query             1 C-GGGTT- 7
    <BLANKLINE>
    >>> print(alignment[:, 2:])
    target            2 CGG-TTT 8
                      0 -||-||- 7
    query             2 -GGGTT- 7
    <BLANKLINE>
    >>> print(alignment[:, 3:])
    target            3 GG-TTT 8
                      0 ||-||- 6
    query             2 GGGTT- 7
    <BLANKLINE>
    >>> print(alignment[:, 3:-1])
    target            3 GG-TT 7
                      0 ||-|| 5
    query             2 GGGTT 7
    <BLANKLINE>
    >>> print(alignment[:, ::2])
    target            0 ACGTT 5
                      0 |-||- 5
    query             0 A-GT- 3
    <BLANKLINE>
    >>> print(alignment[:, range(1, 9, 2)])
    target            0 CG-T 3
                      0 ||-| 4
    query             0 CGGT 4
    <BLANKLINE>
    >>> print(alignment[:, (2, 7, 3)])
    target            0 CTG 3
                      0 -|| 3
    query             0 -TG 2
    <BLANKLINE>

    Raises a TypeError if key is not an integer, slice, or tuple, or if
    the first index is not an integer or slice; a ValueError if key is a
    tuple of a length other than 2, or if the coordinates of the
    alignment are inconsistent; and an IndexError if an integer column
    index is out of bounds.
    """
    # A single integer or slice selects complete rows.
    if isinstance(key, numbers.Integral):
        return self._get_row(key)
    if isinstance(key, slice):
        return self._get_rows(key)
    # Work on copies, so that the changes made below for rows on the
    # reverse strand do not affect this alignment.
    sequences = list(self.sequences)
    coordinates = self.coordinates.copy()
    steps = numpy.diff(coordinates, 1)
    aligned = sum(steps != 0, 0) > 1
    # True for steps in which at least two sequences align, False if a gap
    for i, sequence in enumerate(sequences):
        row = steps[i, aligned]
        if (row >= 0).all():
            pass
        elif (row <= 0).all():
            # Row runs backwards: use the reverse complement instead, with
            # the steps and coordinates flipped accordingly.
            steps[i, :] = -steps[i, :]
            coordinates[i, :] = len(sequence) - coordinates[i, :]
            sequences[i] = reverse_complement(sequence, inplace=False)
            try:
                # keep the id of the original, if it has one
                sequences[i].id = sequence.id
            except AttributeError:
                pass
        else:
            raise ValueError(f"Inconsistent steps in row {i}")
    # Width of each block of alignment columns.
    gaps = steps.max(0)
    if not ((steps == gaps) | (steps <= 0)).all():
        raise ValueError("Unequal step sizes in alignment")
    # Total number of columns in the alignment.
    m = sum(gaps)
    if isinstance(key, tuple):
        try:
            row, col = key
        except ValueError:
            raise ValueError("only tuples of length 2 can be alignment indices")
    else:
        raise TypeError("alignment indices must be integers, slices, or tuples")
    if isinstance(col, numbers.Integral):
        # Negative column indices count from the end of the alignment.
        if col < 0:
            col += m
        if col < 0 or col >= m:
            raise IndexError(
                "column index %d is out of bounds (%d columns)" % (col, m)
            )
    steps = steps[row]
    if isinstance(row, numbers.Integral):
        # One row: the result is a string or a list.
        sequence = sequences[row]
        if isinstance(col, numbers.Integral):
            return self._get_row_col(
                coordinates[row, 0], col, steps, gaps, sequence
            )
        coordinate = coordinates[row, :]
        if isinstance(col, slice):
            start_index, stop_index, step = col.indices(m)
            if start_index < stop_index and step == 1:
                return self._get_row_cols_slice(
                    coordinate, start_index, stop_index, steps, gaps, sequence
                )
            # make an iterable if step != 1
            col = range(start_index, stop_index, step)
        return self._get_row_cols_iterable(coordinate, col, gaps, sequence)
    if isinstance(row, slice):
        # Multiple rows: the result is a string for a single column, and
        # an Alignment object otherwise.
        sequences = sequences[row]
        coordinates = coordinates[row]
        if isinstance(col, numbers.Integral):
            return self._get_rows_col(coordinates, col, steps, gaps, sequences)
        if isinstance(col, slice):
            start_index, stop_index, step = col.indices(m)
            if start_index < stop_index and step == 1:
                return self._get_rows_cols_slice(
                    coordinates,
                    row,
                    start_index,
                    stop_index,
                    steps,
                    gaps,
                )
            # make an iterable if step != 1
            col = range(start_index, stop_index, step)
        # try if we can use col as an iterable
        return self._get_rows_cols_iterable(
            coordinates, col, steps, gaps, sequences
        )
    raise TypeError("first index must be an integer or slice")
| def _convert_sequence_string(self, sequence): | |
| """Convert given sequence to string using the appropriate method (PRIVATE).""" | |
| if isinstance(sequence, (bytes, bytearray)): | |
| return sequence.decode() | |
| if isinstance(sequence, str): | |
| return sequence | |
| if isinstance(sequence, Seq): | |
| return str(sequence) | |
| try: # check if target is a SeqRecord | |
| sequence = sequence.seq | |
| except AttributeError: | |
| pass | |
| else: | |
| return str(sequence) | |
| try: | |
| view = memoryview(sequence) | |
| except TypeError: | |
| pass | |
| else: | |
| if view.format == "c": | |
| return str(sequence) | |
| return None | |
| def __format__(self, format_spec): | |
| """Return the alignment as a string in the specified file format. | |
| Wrapper for self.format(). | |
| """ | |
| return self.format(format_spec) | |
def format(self, fmt="", *args, **kwargs):
    """Return the alignment as a string in the specified file format.

    Arguments:
     - fmt       - File format. Acceptable values are an empty string to
                   create a human-readable representation of the alignment,
                   or any of the alignment file formats supported by
                   `Bio.Align` (some have not yet been implemented).

    All other arguments are passed to the format-specific writer functions:
     - mask      - PSL format only. Specify if repeat regions in the target
                   sequence are masked and should be reported in the
                   `repMatches` field of the PSL file instead of in the
                   `matches` field. Acceptable values are
                   None   : no masking (default);
                   "lower": masking by lower-case characters;
                   "upper": masking by upper-case characters.
     - wildcard  - PSL format only. Report alignments to the wildcard
                   character in the target or query sequence in the
                   `nCount` field of the PSL file instead of in the
                   `matches`, `misMatches`, or `repMatches` fields.
                   Default value is 'N'.
     - md        - SAM format only. If True, calculate the MD tag from
                   the alignment and include it in the output. If False
                   (default), do not include the MD tag in the output.

    Raises a ValueError if the module for the file format does not
    provide an AlignmentWriter class.
    """
    if fmt == "":
        return self._format_pretty()
    module = _load(fmt)
    # Only the attribute lookup is guarded, so that an AttributeError
    # raised while the writer is being created is not mistaken for a
    # file format without a writer.
    try:
        writer_class = module.AlignmentWriter
    except AttributeError:
        if module.AlignmentIterator.mode == "b":
            raise ValueError(f"{fmt} is a binary file format") from None
        raise ValueError(
            f"Formatting alignments has not yet been implemented for the {fmt} format"
        ) from None
    writer = writer_class(None, *args, **kwargs)
    return writer.format_alignment(self)
def _format_pretty(self):
    """Return default string representation (PRIVATE).

    Helper for self.format().

    Every sequence is written in lines of at most 80 columns: a 10-column
    name field, a 10-column start coordinate field, and then the aligned
    letters, with dashes for gaps. The end coordinates are appended to the
    last line of each sequence, or written on a line of their own if they
    do not fit.

    For two sequences, a pattern line ('|' for identical letters, '.' for
    different letters, '-' for a gap) is written between the two rows; its
    coordinate fields hold the alignment column index. If one row advances
    three times faster than another (the "protein to dna" case handled
    below), the slower row is spread out over three columns per letter and
    no pattern line is written.

    Returns self._format_unicode() if a str sequence contains non-ASCII
    characters, and self._format_generalized() if a sequence is neither a
    str nor a Seq or MutableSeq object.

    Raises a ValueError if a row moves both forward and backward in its
    aligned steps, or if the step sizes of the rows cannot be reconciled.
    """
    n = len(self.sequences)
    write_pattern = n == 2
    steps = numpy.diff(self.coordinates, 1)
    aligned = sum(steps != 0, 0) > 1
    # True for steps in which at least two sequences align, False if a gap
    name_width = 10
    blank_name = " " * name_width  # empty name field of the pattern lines
    codon_padding = " " * 2  # blanks following a letter spread over 3 columns
    names = []
    seqs = []
    indices = numpy.zeros(self.coordinates.shape, int)
    for i, (seq, positions, row) in enumerate(
        zip(self.sequences, self.coordinates, indices)
    ):
        try:
            name = seq.id
            if name is None:
                raise AttributeError
        except AttributeError:
            if n == 2:
                if i == 0:
                    name = "target"
                else:
                    name = "query"
            else:
                name = ""
        else:
            name = name[: name_width - 1]
        name = name.ljust(name_width)
        names.append(name)
        try:
            seq = seq.seq  # SeqRecord confusion
        except AttributeError:
            pass
        start = min(positions)
        end = max(positions)
        seq = seq[start:end]
        aligned_steps = steps[i, aligned]
        if len(aligned_steps) == 0:
            aligned_steps = steps[i]
        if (aligned_steps >= 0).all():
            # row holds the indices into the sliced sequence
            row[:] = positions - start
        elif (aligned_steps <= 0).all():
            # row runs backward: print the reverse complement, and count
            # the indices from the far end of the slice
            steps[i, :] = -steps[i, :]
            seq = reverse_complement(seq, inplace=False)
            row[:] = end - positions
        else:
            raise ValueError(f"Inconsistent steps in row {i}")
        if isinstance(seq, str):
            if not seq.isascii():
                return self._format_unicode()
        elif isinstance(seq, (Seq, MutableSeq)):
            try:
                seq = bytes(seq)
            except UndefinedSequenceError:
                # show letters that are not defined as question marks
                filled = bytearray(b"?" * (end - start))
                for lower, upper in seq.defined_ranges:
                    filled[lower:upper] = bytes(seq[lower:upper])
                seq = filled
            seq = seq.decode()
        else:
            return self._format_generalized()
        seqs.append(seq)
    minstep = steps.min(0)
    maxstep = steps.max(0)
    steps = numpy.where(-minstep > maxstep, minstep, maxstep)
    for i, row in enumerate(indices):
        row_steps = numpy.diff(row)
        row_aligned = (row_steps > 0) & aligned
        row_steps = row_steps[row_aligned]
        aligned_steps = steps[row_aligned]
        if (row_steps == aligned_steps).all():
            pass
        elif (3 * row_steps == aligned_steps).all():
            # this row advances one letter for every three columns; the
            # indices are tripled, so each letter must fill three columns
            row[:] *= 3
            seqs[i] = codon_padding.join(seqs[i]) + codon_padding
            write_pattern = False
        else:
            raise ValueError("Inconsistent coordinates")
    prefix_width = 10
    position_width = 10
    line_width = 80
    lines = []
    steps = indices[:, 1:] - indices[:, :-1]
    minstep = steps.min(0)
    maxstep = steps.max(0)
    steps = numpy.where(-minstep > maxstep, minstep, maxstep)
    for name, seq, positions, row in zip(names, seqs, self.coordinates, indices):
        start = positions[0]
        column = line_width  # forces a new line before the first letters
        start_index = row[0]
        for step, end, end_index in zip(steps, positions[1:], row[1:]):
            if step < 0:
                if prefix_width + position_width < column:
                    position_text = str(start)
                    offset = position_width - len(position_text) - 1
                    if offset < 0:
                        lines[-1] += " .." + position_text[-offset + 3 :]
                    else:
                        lines[-1] += " " + position_text
                    column = line_width
                start = end
                start_index = end_index
                continue
            elif end_index == start_index:
                s = "-" * step
            else:
                s = seq[start_index:end_index]
            while column + len(s) >= line_width:
                rest = line_width - column
                if rest > 0:
                    lines[-1] += s[:rest]
                    s = s[rest:]
                    if start != end:
                        if (end_index - start_index) == abs(end - start):
                            step = rest
                        else:
                            # protein to dna alignment;
                            # integer division, but round up:
                            step = -(rest // -3)
                        if start < end:
                            start += step
                        else:
                            start -= step
                        start_index += rest
                line = name
                position_text = str(start)
                offset = position_width - len(position_text) - 1
                if offset < 0:
                    line += " .." + position_text[-offset + 3 :]
                else:
                    line += " " * offset + position_text
                line += " "
                lines.append(line)
                column = name_width + position_width
            lines[-1] += s
            if start_index != end_index:
                start_index = end_index
                start = end
            column += len(s)
    if write_pattern:
        dash = "-"
        position = 0
        m = len(lines) // 2
        lines1 = lines[:m]
        lines2 = lines[m:]
        pattern_lines = []
        for line1, line2 in zip(lines1, lines2):
            aligned_seq1 = line1[name_width + position_width :]
            aligned_seq2 = line2[name_width + position_width :]
            pattern = ""
            for c1, c2 in zip(aligned_seq1, aligned_seq2):
                if c1 == c2:
                    if c1 == " ":
                        break
                    c = "|"
                elif c1 == dash or c2 == dash:
                    c = "-"
                else:
                    c = "."
                pattern += c
            # same field widths as the sequence lines, with an empty name
            pattern_line = "%s%9d %s" % (blank_name, position, pattern)
            pattern_lines.append(pattern_line)
            position += len(pattern)
        final_position_width = len(str(max(max(self.coordinates[:, -1]), position)))
        if column + final_position_width <= line_width:
            if prefix_width + position_width < column:
                fmt = f" %{final_position_width}d"
                lines1[-1] += fmt % self.coordinates[0, -1]
                lines2[-1] += fmt % self.coordinates[1, -1]
                pattern_lines[-1] += fmt % position
        else:
            # no room left: write the end coordinates on lines of their own
            name1, name2 = names
            fmt = "%s%9d"
            lines1.append(fmt % (name1, self.coordinates[0, -1]))
            pattern_lines.append(fmt % (blank_name, position))
            lines2.append(fmt % (name2, self.coordinates[1, -1]))
        return "\n".join(
            f"{line1}\n{pattern_line}\n{line2}\n"
            for (line1, line2, pattern_line) in zip(lines1, lines2, pattern_lines)
        )
    else:
        # lines holds m consecutive lines for each of the n sequences
        m = len(lines) // n
        final_position_width = len(str(max(self.coordinates[:, -1])))
        fits = column + final_position_width < line_width
        if fits and prefix_width + position_width < column:
            fmt = f" %{final_position_width}d"
            for i in range(n):
                lines[m - 1 + i * m] += fmt % self.coordinates[i, -1]
        blocks = ["\n".join(lines[j::m]) + "\n" for j in range(m)]
        if not fits:
            # no room left: write the end coordinates in a block of their own
            final_lines = [
                "%s%9d" % (names[i], self.coordinates[i, -1]) for i in range(n)
            ]
            blocks.append("\n".join(final_lines) + "\n")
        return "\n".join(blocks)
def _format_unicode(self):
    """Return a plain string representation without coordinates (PRIVATE).

    Helper for self._format_pretty(), which calls it if a str sequence
    contains non-ASCII characters.

    Each sequence is converted by self._convert_sequence_string(); if that
    returns None for any sequence, self._format_generalized() is returned
    instead. The result has one line per sequence, with dashes for gaps.
    For exactly two sequences, a pattern line ('|' for identical letters,
    '-' for a gap, '.' otherwise) is written between them.
    """
    seqs = []
    coordinates = self.coordinates.copy()
    for seq, row in zip(self.sequences, coordinates):
        seq = self._convert_sequence_string(seq)
        if seq is None:
            return self._format_generalized()
        if row[0] > row[-1]:  # mapped to reverse strand
            # mirror the coordinates so they index the reverse complement
            row[:] = len(seq) - row[:]
            seq = reverse_complement(seq, inplace=False)
        seqs.append(seq)
    # width of each alignment step, taken from the row that advances
    steps = numpy.diff(coordinates, 1).max(0)
    aligned_seqs = []
    for row, seq in zip(coordinates, seqs):
        pieces = []
        start = row[0]
        for step, end in zip(steps, row[1:]):
            if end == start:
                pieces.append("-" * step)
            else:
                pieces.append(seq[start:end])
            start = end
        aligned_seqs.append("".join(pieces))
    if len(aligned_seqs) != 2:
        # a pattern line is only defined for a pairwise alignment
        return "\n".join(aligned_seqs) + "\n"
    aligned_seq1, aligned_seq2 = aligned_seqs
    marks = []
    for c1, c2 in zip(aligned_seq1, aligned_seq2):
        if c1 == c2:
            marks.append("|")
        elif c1 == "-" or c2 == "-":
            marks.append("-")
        else:
            marks.append(".")
    pattern = "".join(marks)
    return f"{aligned_seq1}\n{pattern}\n{aligned_seq2}\n"
def _format_generalized(self):
    """Return generalized string representation (PRIVATE).

    Helper for self._format_pretty(), used for a pair of sequences whose
    items are arbitrary objects. Every item is shown as str(item), items
    are separated by one blank, and the shorter text of an aligned pair is
    padded with blanks. The middle line marks equal items with '|' and
    unequal items with '.'; gaps are shown as dashes.

    Raises a ValueError if the two rows advance by a different number of
    items in an aligned step.
    """
    target, query = self.sequences
    top = []
    middle = []
    bottom = []

    def add_column(upper, marks, lower):
        top.append(upper)
        middle.append(marks)
        bottom.append(lower)

    first1, first2 = self.coordinates[:, 0]
    if first1 > 0 or first2 > 0:
        # leading items, shown with blanks in the other two lines
        if first1 <= first2:
            for item in query[: first2 - first1]:
                text = str(item)
                blank = " " * len(text)
                add_column(blank, blank, text)
        else:
            for item in target[: first1 - first2]:
                text = str(item)
                blank = " " * len(text)
                add_column(text, blank, blank)
    position1, position2 = first1, first2
    for next1, next2 in self.coordinates[:, 1:].transpose():
        if next1 == position1:
            # gap in the first sequence
            for item in query[position2:next2]:
                text = str(item)
                dashes = "-" * len(text)
                add_column(dashes, dashes, text)
            position2 = next2
        elif next2 == position2:
            # gap in the second sequence
            for item in target[position1:next1]:
                text = str(item)
                dashes = "-" * len(text)
                add_column(text, dashes, dashes)
            position1 = next1
        else:
            chunk1 = target[position1:next1]
            chunk2 = query[position2:next2]
            if len(chunk1) != len(chunk2):
                raise ValueError("Unequal step sizes in alignment")
            for item1, item2 in zip(chunk1, chunk2):
                text1 = str(item1)
                text2 = str(item2)
                mark = "|" if item1 == item2 else "."
                width1 = len(text1)
                width2 = len(text2)
                # mark as many columns as the shorter text, pad the rest
                padding = " " * abs(width1 - width2)
                if width1 < width2:
                    text1 += padding
                elif width1 > width2:
                    text2 += padding
                add_column(text1, mark * min(width1, width2) + padding, text2)
            position1, position2 = next1, next2
    line1 = " ".join(top)
    line2 = " ".join(bottom)
    markers = " ".join(middle)
    return f"{line1}\n{markers}\n{line2}\n"
def __str__(self):
    """Return a human-readable string representation of the alignment.

    For sequence alignments, each line has at most 80 columns.
    The first 10 columns show the (possibly truncated) sequence name,
    which may be the id attribute of a SeqRecord, or otherwise 'target'
    or 'query' for pairwise alignments.
    The next 10 columns show the sequence coordinate, using zero-based
    counting as usual in Python.
    The remaining 60 columns show the sequence, using dashes to represent
    gaps.
    At the end of the alignment, the end coordinates are shown on the right
    of the sequence, again in zero-based coordinates.

    Pairwise alignments have an additional line between the two sequences
    showing whether the sequences match ('|') or mismatch ('.'), or if
    there is a gap ('-').
    The coordinates shown for this line are the column indices, which can
    be useful when extracting a subalignment.

    For example,

    >>> from Bio.Align import PairwiseAligner
    >>> aligner = PairwiseAligner()

    >>> seqA = "TTAACCCCATTTG"
    >>> seqB = "AAGCCCCTTT"
    >>> seqC = "AAAGGGGCTT"

    >>> alignments = aligner.align(seqA, seqB)
    >>> len(alignments)
    1
    >>> alignment = alignments[0]
    >>> print(alignment)
    target            0 TTAA-CCCCATTTG 13
                      0 --||-||||-|||- 14
    query             0 --AAGCCCC-TTT- 10
    <BLANKLINE>

    Note that seqC is the reverse complement of seqB. Aligning it to the
    reverse strand gives the same alignment, but the query coordinates are
    switched:

    >>> alignments = aligner.align(seqA, seqC, strand="-")
    >>> len(alignments)
    1
    >>> alignment = alignments[0]
    >>> print(alignment)
    target            0 TTAA-CCCCATTTG 13
                      0 --||-||||-|||- 14
    query            10 --AAGCCCC-TTT-  0
    <BLANKLINE>

    """
    # format() without arguments selects the human-readable representation
    text = self.format()
    return text
def __repr__(self):
    """Return a representation of the alignment, including its shape.

    The representation cannot be used with eval() to recreate the object,
    which is usually possible with simple python objects.  For example:

    <Alignment object (2 rows x 14 columns) at 0x10403d850>

    The hex string is the memory address of the object and can be used to
    distinguish different Alignment objects.  See help(id) for more
    information.

    >>> import numpy
    >>> from Bio.Align import Alignment
    >>> alignment = Alignment(("ACCGT", "ACGT"),
    ...                       coordinates = numpy.array([[0, 2, 3, 5],
    ...                                                  [0, 2, 2, 4],
    ...                                                 ]))
    >>> print(alignment)
    target            0 ACCGT 5
                      0 ||-|| 5
    query             0 AC-GT 4
    <BLANKLINE>
    >>> alignment  # doctest:+ELLIPSIS
    <Alignment object (2 rows x 5 columns) at 0x...>
    """
    class_name = self.__class__.__name__
    address = id(self)
    if self.coordinates is None:
        # without coordinates there is no shape to report
        return "<%s object at 0x%x>" % (class_name, address)
    rows, columns = self.shape
    return "<%s object (%i rows x %i columns) at 0x%x>" % (
        class_name,
        rows,
        columns,
        address,
    )
def __len__(self):
    """Return the number of sequences in the alignment."""
    sequences = self.sequences
    return len(sequences)
def shape(self):
    """Return the shape of the alignment as a tuple of two integer values.

    The first integer value is the number of sequences in the alignment as
    returned by len(alignment), which is always 2 for pairwise alignments.

    The second integer value is the number of columns in the alignment when
    it is printed, and is equal to the sum of the number of matches, number
    of mismatches, and the total length of gaps in the target and query.
    Sequence sections beyond the aligned segment are not included in the
    number of columns.

    Both values are plain Python integers.

    For example,

    >>> from Bio import Align
    >>> aligner = Align.PairwiseAligner()
    >>> aligner.mode = "global"
    >>> alignments = aligner.align("GACCTG", "CGATCG")
    >>> alignment = alignments[0]
    >>> print(alignment)
    target            0 -GACCT-G 6
                      0 -||--|-| 8
    query             0 CGA--TCG 6
    <BLANKLINE>
    >>> len(alignment)
    2
    >>> alignment.shape
    (2, 8)
    >>> aligner.mode = "local"
    >>> alignments = aligner.align("GACCTG", "CGATCG")
    >>> alignment = alignments[0]
    >>> print(alignment)
    target            0 GACCT-G 6
                      0 ||--|-| 7
    query             1 GA--TCG 6
    <BLANKLINE>
    >>> len(alignment)
    2
    >>> alignment.shape
    (2, 7)

    Raises a ValueError if a row moves both forward and backward in its
    aligned steps, or if rows advance by different non-zero amounts in the
    same step.
    """
    # NOTE(review): the examples above and the other methods read this as
    # an attribute (self.shape); a @property decorator is presumably
    # expected directly above this definition -- confirm.
    n = len(self.coordinates)
    if n == 0:  # no sequences
        return (0, 0)
    steps = numpy.diff(self.coordinates, 1)
    aligned = (steps != 0).sum(0) > 1
    # True for steps in which at least two sequences align, False if a gap
    for i in range(n):
        row = steps[i, aligned]
        if (row >= 0).all():
            pass
        elif (row <= 0).all():
            # row runs backward; count its steps as positive
            steps[i, :] = -steps[i, :]
        else:
            raise ValueError(f"Inconsistent steps in row {i}")
    gaps = steps.max(0)
    if not ((steps == gaps) | (steps <= 0)).all():
        raise ValueError("Unequal step sizes in alignment")
    # convert the NumPy scalar, so that the tuple prints as documented
    # regardless of how NumPy represents its scalars
    m = int(gaps.sum())
    return (n, m)
def aligned(self):
    """Return the indices of subsequences aligned to each other.

    This property returns the start and end indices of subsequences
    in the target and query sequence that were aligned to each other.
    If the alignment between target (t) and query (q) consists of N
    chunks, you get two tuples of length N:

        (((t_start1, t_end1), (t_start2, t_end2), ..., (t_startN, t_endN)),
         ((q_start1, q_end1), (q_start2, q_end2), ..., (q_startN, q_endN)))

    For example,

    >>> from Bio import Align
    >>> aligner = Align.PairwiseAligner()
    >>> alignments = aligner.align("GAACT", "GAT")
    >>> alignment = alignments[0]
    >>> print(alignment)
    target            0 GAACT 5
                      0 ||--| 5
    query             0 GA--T 3
    <BLANKLINE>
    >>> alignment.aligned
    array([[[0, 2],
            [4, 5]],
    <BLANKLINE>
           [[0, 2],
            [2, 3]]])
    >>> alignment = alignments[1]
    >>> print(alignment)
    target            0 GAACT 5
                      0 |-|-| 5
    query             0 G-A-T 3
    <BLANKLINE>
    >>> alignment.aligned
    array([[[0, 1],
            [2, 3],
            [4, 5]],
    <BLANKLINE>
           [[0, 1],
            [1, 2],
            [2, 3]]])

    Note that different alignments may have the same subsequences
    aligned to each other. In particular, this may occur if alignments
    differ from each other in terms of their gap placement only:

    >>> aligner.mismatch_score = -10
    >>> alignments = aligner.align("AAACAAA", "AAAGAAA")
    >>> len(alignments)
    2
    >>> print(alignments[0])
    target            0 AAAC-AAA 7
                      0 |||--||| 8
    query             0 AAA-GAAA 7
    <BLANKLINE>
    >>> alignments[0].aligned
    array([[[0, 3],
            [4, 7]],
    <BLANKLINE>
           [[0, 3],
            [4, 7]]])
    >>> print(alignments[1])
    target            0 AAA-CAAA 7
                      0 |||--||| 8
    query             0 AAAG-AAA 7
    <BLANKLINE>
    >>> alignments[1].aligned
    array([[[0, 3],
            [4, 7]],
    <BLANKLINE>
           [[0, 3],
            [4, 7]]])

    The property can be used to identify alignments that are identical
    to each other in terms of their aligned sequences.

    Raises NotImplementedError for more than two sequences, and ValueError
    if a row moves both forward and backward in its aligned steps.
    """
    if len(self.sequences) > 2:
        raise NotImplementedError(
            "aligned is currently implemented for pairwise alignments only"
        )
    coordinates = self.coordinates.copy()
    steps = numpy.diff(coordinates, 1)
    aligned = sum(steps != 0, 0) > 1
    # True for steps in which at least two sequences align, False if a gap
    mirrored = []  # (row index, sequence length) of rows on the reverse strand
    for i, sequence in enumerate(self.sequences):
        row = steps[i, aligned]
        if (row >= 0).all():
            continue
        if not (row <= 0).all():
            raise ValueError(f"Inconsistent steps in row {i}")
        # mirror the row, so that all rows run forward below
        length = len(sequence)
        coordinates[i, :] = length - coordinates[i, :]
        mirrored.append((i, length))
    columns = coordinates.transpose()
    # a step is an aligned chunk only if every row advances in it
    widths = abs(numpy.diff(columns, axis=0)).min(1)
    indices = numpy.flatnonzero(widths)
    starts = columns[indices, :]
    ends = columns[indices + 1, :]
    segments = numpy.stack([starts, ends], axis=0).transpose()
    for i, length in mirrored:
        # report reverse-strand rows in their original coordinates
        segments[i, :] = length - segments[i, :]
    return segments
def indices(self):
    """Return the sequence index of each letter in the alignment.

    This property returns a 2D numpy array with the sequence index of each
    letter in the alignment. Gaps are indicated by -1.  The array has the
    same number of rows and columns as the alignment, as given by
    `self.shape`.

    For example,

    >>> from Bio import Align
    >>> aligner = Align.PairwiseAligner()
    >>> aligner.mode = "local"

    >>> alignments = aligner.align("GAACTGG", "AATG")
    >>> alignment = alignments[0]
    >>> print(alignment)
    target            1 AACTG 6
                      0 ||-|| 5
    query             0 AA-TG 4
    <BLANKLINE>
    >>> alignment.indices
    array([[ 1,  2,  3,  4,  5],
           [ 0,  1, -1,  2,  3]])
    >>> alignment = alignments[1]
    >>> print(alignment)
    target            1 AACTGG 7
                      0 ||-|-| 6
    query             0 AA-T-G 4
    <BLANKLINE>
    >>> alignment.indices
    array([[ 1,  2,  3,  4,  5,  6],
           [ 0,  1, -1,  2, -1,  3]])

    >>> alignments = aligner.align("GAACTGG", "CATT", strand="-")
    >>> alignment = alignments[0]
    >>> print(alignment)
    target            1 AACTG 6
                      0 ||-|| 5
    query             4 AA-TG 0
    <BLANKLINE>
    >>> alignment.indices
    array([[ 1,  2,  3,  4,  5],
           [ 3,  2, -1,  1,  0]])
    >>> alignment = alignments[1]
    >>> print(alignment)
    target            1 AACTGG 7
                      0 ||-|-| 6
    query             4 AA-T-G 0
    <BLANKLINE>
    >>> alignment.indices
    array([[ 1,  2,  3,  4,  5,  6],
           [ 3,  2, -1,  1, -1,  0]])

    Raises a ValueError if a row moves both forward and backward in its
    aligned steps.
    """
    table = numpy.full(self.shape, -1, int)  # -1 marks a gap
    steps = numpy.diff(self.coordinates, 1)
    aligned = sum(steps != 0, 0) > 1
    # True for steps in which at least two sequences align, False if a gap
    reverse = []  # for each row, whether it runs along the reverse strand
    for i, row in enumerate(steps[:, aligned]):
        if (row >= 0).all():
            reverse.append(False)
        elif (row <= 0).all():
            reverse.append(True)
        else:
            raise ValueError(f"Inconsistent steps in row {i}")
    left = 0  # first alignment column of the current step
    right = 0  # alignment column just past the current step
    points = self.coordinates.transpose()
    for starts, ends in zip(points[:-1], points[1:]):
        for row, start, end, backward in zip(table, starts, ends, reverse):
            if backward:
                if start > end:
                    right = left + start - end
                    row[left:right] = numpy.arange(start - 1, end - 1, -1)
            elif start < end:
                right = left + end - start
                row[left:right] = numpy.arange(start, end)
        left = right
    return table
def inverse_indices(self):
    """Return the alignment column index for each letter in each sequence.

    This property returns a list of 1D numpy arrays; the number of arrays
    is equal to the number of aligned sequences, and the length of each
    array is equal to the length of the corresponding sequence. For each
    letter in each sequence, the array contains the corresponding column
    index in the alignment. Letters not included in the alignment are
    indicated by -1.

    For example,

    >>> from Bio import Align
    >>> aligner = Align.PairwiseAligner()
    >>> aligner.mode = "local"

    >>> alignments = aligner.align("GAACTGG", "AATG")
    >>> alignment = alignments[0]
    >>> print(alignment)
    target            1 AACTG 6
                      0 ||-|| 5
    query             0 AA-TG 4
    <BLANKLINE>
    >>> alignment.inverse_indices
    [array([-1,  0,  1,  2,  3,  4, -1]), array([0, 1, 3, 4])]
    >>> alignment = alignments[1]
    >>> print(alignment)
    target            1 AACTGG 7
                      0 ||-|-| 6
    query             0 AA-T-G 4
    <BLANKLINE>
    >>> alignment.inverse_indices
    [array([-1,  0,  1,  2,  3,  4,  5]), array([0, 1, 3, 5])]
    >>> alignments = aligner.align("GAACTGG", "CATT", strand="-")
    >>> alignment = alignments[0]
    >>> print(alignment)
    target            1 AACTG 6
                      0 ||-|| 5
    query             4 AA-TG 0
    <BLANKLINE>
    >>> alignment.inverse_indices
    [array([-1,  0,  1,  2,  3,  4, -1]), array([4, 3, 1, 0])]
    >>> alignment = alignments[1]
    >>> print(alignment)
    target            1 AACTGG 7
                      0 ||-|-| 6
    query             4 AA-T-G 0
    <BLANKLINE>
    >>> alignment.inverse_indices
    [array([-1,  0,  1,  2,  3,  4,  5]), array([5, 3, 1, 0])]

    Raises a ValueError if a row moves both forward and backward in its
    aligned steps.
    """
    # one array per sequence; -1 marks letters outside the alignment
    lookup = [numpy.full(len(sequence), -1, int) for sequence in self.sequences]
    steps = numpy.diff(self.coordinates, 1)
    aligned = sum(steps != 0, 0) > 1
    # True for steps in which at least two sequences align, False if a gap
    reverse = []  # for each row, whether it runs along the reverse strand
    for i, row in enumerate(steps[:, aligned]):
        if (row >= 0).all():
            reverse.append(False)
        elif (row <= 0).all():
            reverse.append(True)
        else:
            raise ValueError(f"Inconsistent steps in row {i}")
    left = 0  # first alignment column of the current step
    right = 0  # alignment column just past the current step
    points = self.coordinates.transpose()
    for starts, ends in zip(points[:-1], points[1:]):
        for row, start, end, backward in zip(lookup, starts, ends, reverse):
            if backward:
                if start > end:
                    right = left + start - end
                    columns = numpy.arange(left, right)
                    if end > 0:
                        row[start - 1 : end - 1 : -1] = columns
                    elif start > 0:
                        # the slice runs down to the first letter; a stop
                        # of end - 1 == -1 would mean the last letter
                        row[start - 1 :: -1] = columns
            elif start < end:
                right = left + end - start
                row[start:end] = numpy.arange(left, right)
        left = right
    return lookup
def sort(self, key=None, reverse=False):
    """Sort the sequences of the alignment in place.

    By default, this sorts the sequences alphabetically using their id
    attribute if available, or by their sequence contents otherwise.
    For example,

    >>> from Bio.Align import PairwiseAligner
    >>> aligner = PairwiseAligner()
    >>> aligner.gap_score = -1
    >>> alignments = aligner.align("AATAA", "AAGAA")
    >>> len(alignments)
    1
    >>> alignment = alignments[0]
    >>> print(alignment)
    target            0 AATAA 5
                      0 ||.|| 5
    query             0 AAGAA 5
    <BLANKLINE>
    >>> alignment.sort()
    >>> print(alignment)
    target            0 AAGAA 5
                      0 ||.|| 5
    query             0 AATAA 5
    <BLANKLINE>

    Alternatively, a key function can be supplied that maps each sequence
    to a sort value.  For example, you could sort on the GC content of each
    sequence.

    >>> from Bio.SeqUtils import gc_fraction
    >>> alignment.sort(key=gc_fraction)
    >>> print(alignment)
    target            0 AATAA 5
                      0 ||.|| 5
    query             0 AAGAA 5
    <BLANKLINE>

    You can reverse the sort order by passing `reverse=True`:

    >>> alignment.sort(key=gc_fraction, reverse=True)
    >>> print(alignment)
    target            0 AAGAA 5
                      0 ||.|| 5
    query             0 AATAA 5
    <BLANKLINE>

    The sequences are now sorted by decreasing GC content value.
    """
    sequences = self.sequences
    if key is not None:
        values = [key(sequence) for sequence in sequences]
    else:
        try:
            values = [sequence.id for sequence in sequences]
        except AttributeError:
            # at least one sequence has no id: compare the sequences
            values = sequences
    # sort the row numbers, so that the sequences and their coordinates
    # are reordered in the same way
    order = sorted(
        range(len(sequences)), key=lambda index: values[index], reverse=reverse
    )
    self.sequences = [sequences[index] for index in order]
    self.coordinates = self.coordinates.take(order, 0)
def map(self, alignment):
    r"""Map the alignment to self.target and return the resulting alignment.

    Here, self.query and alignment.target are the same sequence.

    A typical example is where self is the pairwise alignment between a
    chromosome and a transcript, the argument is the pairwise alignment
    between the transcript and a sequence (e.g., as obtained by RNA-seq),
    and we want to find the alignment of the sequence to the chromosome:

    >>> from Bio import Align
    >>> aligner = Align.PairwiseAligner()
    >>> aligner.mode = 'local'
    >>> aligner.open_gap_score = -1
    >>> aligner.extend_gap_score = 0
    >>> chromosome = "AAAAAAAACCCCCCCAAAAAAAAAAAGGGGGGAAAAAAAA"
    >>> transcript = "CCCCCCCGGGGGG"
    >>> alignments1 = aligner.align(chromosome, transcript)
    >>> len(alignments1)
    1
    >>> alignment1 = alignments1[0]
    >>> print(alignment1)
    target            8 CCCCCCCAAAAAAAAAAAGGGGGG 32
                      0 |||||||-----------|||||| 24
    query             0 CCCCCCC-----------GGGGGG 13
    <BLANKLINE>
    >>> sequence = "CCCCGGGG"
    >>> alignments2 = aligner.align(transcript, sequence)
    >>> len(alignments2)
    1
    >>> alignment2 = alignments2[0]
    >>> print(alignment2)
    target            3 CCCCGGGG 11
                      0 ||||||||  8
    query             0 CCCCGGGG  8
    <BLANKLINE>
    >>> alignment = alignment1.map(alignment2)
    >>> print(alignment)
    target           11 CCCCAAAAAAAAAAAGGGG 30
                      0 ||||-----------|||| 19
    query             0 CCCC-----------GGGG  8
    <BLANKLINE>
    >>> format(alignment, "psl")
    '8\t0\t0\t0\t0\t0\t1\t11\t+\tquery\t8\t0\t8\ttarget\t40\t11\t30\t2\t4,4,\t0,4,\t11,26,\n'

    Mapping the alignment does not depend on the sequence contents. If we
    delete the sequence contents, the same alignment is found in PSL format
    (though we obviously lose the ability to print the sequence alignment):

    >>> alignment1.target = Seq(None, len(alignment1.target))
    >>> alignment1.query = Seq(None, len(alignment1.query))
    >>> alignment2.target = Seq(None, len(alignment2.target))
    >>> alignment2.query = Seq(None, len(alignment2.query))
    >>> alignment = alignment1.map(alignment2)
    >>> format(alignment, "psl")
    '8\t0\t0\t0\t0\t0\t1\t11\t+\tquery\t8\t0\t8\ttarget\t40\t11\t30\t2\t4,4,\t0,4,\t11,26,\n'

    Raises a ValueError if the length of self.query differs from the length
    of alignment.target, if the rows of either alignment run in the same
    direction in some steps and in opposite directions in others, or if
    the rows of either alignment advance by different non-zero amounts in
    the same step.
    """
    alignment1, alignment2 = self, alignment
    if len(alignment1.query) != len(alignment2.target):
        raise ValueError(
            "length of alignment1 query sequence (%d) != length of alignment2 target sequence (%d)"
            % (len(alignment1.query), len(alignment2.target))
        )
    target = alignment1.target
    query = alignment2.query
    coordinates1 = alignment1.coordinates
    coordinates2 = alignment2.coordinates
    n1 = len(alignment1.query)
    n2 = len(alignment2.query)
    # Per step, the product of the signs of the steps of the rows is
    # positive if the rows run in the same direction, negative if they run
    # in opposite directions, and zero if one of them has a gap.
    steps1 = numpy.diff(coordinates1, 1)
    row = numpy.prod(numpy.sign(steps1), 0)
    if (row >= 0).all():
        strand1 = "+"
    elif (row <= 0).all():
        strand1 = "-"
    else:
        raise ValueError("Inconsistent steps in the first alignment")
    steps2 = numpy.diff(coordinates2, 1)
    row = numpy.prod(numpy.sign(steps2), 0)
    if (row >= 0).all():
        strand2 = "+"
    elif (row <= 0).all():
        strand2 = "-"
    else:
        raise ValueError("Inconsistent steps in the second alignment")
    # Mirror (and where needed reverse the column order of) the coordinates
    # of reverse-strand rows; this is done on copies, so that the
    # coordinates of the two alignments themselves are not modified.
    if strand1 == "+":
        if strand2 == "-":  # mapped to reverse strand
            coordinates2 = coordinates2.copy()
            coordinates2[1, :] = n2 - coordinates2[1, :]
    else:  # mapped to reverse strand
        coordinates1 = coordinates1.copy()
        coordinates1[1, :] = n1 - coordinates1[1, :]
        coordinates2 = coordinates2.copy()
        coordinates2[0, :] = n1 - coordinates2[0, ::-1]
        if strand2 == "+":
            coordinates2[1, :] = n2 - coordinates2[1, ::-1]
        else:  # mapped to reverse strand
            coordinates2[1, :] = coordinates2[1, ::-1]
    # within a step, all rows that advance must advance by the same amount
    steps1 = numpy.diff(coordinates1, 1)
    gaps1 = steps1.max(0)
    if not ((steps1 == gaps1) | (steps1 <= 0)).all():
        raise ValueError("Unequal step sizes in first alignment")
    steps2 = numpy.diff(coordinates2, 1)
    gaps2 = steps2.max(0)
    if not ((steps2 == gaps2) | (steps2 <= 0)).all():
        raise ValueError("Unequal step sizes in second alignment")
    path = []
    # sys.maxsize as the previous end keeps the "gap in both" test below
    # false until the first block has been added to the path
    tEnd, qEnd = sys.maxsize, sys.maxsize
    # an explicit iterator, because the loop further down resumes it
    coordinates1 = iter(coordinates1.transpose())
    # Advance to the first step of alignment1 in which both rows move.
    # With sys.maxsize as the start values, the first column can never
    # satisfy the test and only sets the start coordinates.
    tStart1, qStart1 = sys.maxsize, sys.maxsize
    for tEnd1, qEnd1 in coordinates1:
        if tStart1 < tEnd1 and qStart1 < qEnd1:
            break
        tStart1, qStart1 = tEnd1, qEnd1
    tStart2, qStart2 = sys.maxsize, sys.maxsize
    for tEnd2, qEnd2 in coordinates2.transpose():
        # a step of alignment2 in which both rows move is consumed piece by
        # piece; each pass through this loop handles `size` letters of it
        while qStart2 < qEnd2 and tStart2 < tEnd2:
            while True:
                if tStart2 < qStart1:
                    # the piece starts before the current block of
                    # alignment1: skip up to the block, or to the end of
                    # this step if the block is not reached at all
                    if tEnd2 < qStart1:
                        size = tEnd2 - tStart2
                    else:
                        size = qStart1 - tStart2
                    break
                elif tStart2 < qEnd1:
                    # the piece starts inside the current block of
                    # alignment1: map it, up to the end of the block
                    offset = tStart2 - qStart1
                    if tEnd2 > qEnd1:
                        size = qEnd1 - tStart2
                    else:
                        size = tEnd2 - tStart2
                    qStart = qStart2
                    tStart = tStart1 + offset
                    if tStart > tEnd and qStart > qEnd:
                        # adding a gap both in target and in query;
                        # add gap to target first:
                        path.append([tStart, qEnd])
                    qEnd = qStart2 + size
                    tEnd = tStart + size
                    path.append([tStart, qStart])
                    path.append([tEnd, qEnd])
                    break
                # the piece starts past the current block of alignment1:
                # advance to its next step in which both rows move
                tStart1, qStart1 = sys.maxsize, sys.maxsize
                for tEnd1, qEnd1 in coordinates1:
                    if tStart1 < tEnd1 and qStart1 < qEnd1:
                        break
                    tStart1, qStart1 = tEnd1, qEnd1
                else:
                    # alignment1 is exhausted: skip the rest of this step
                    size = qEnd2 - qStart2
                    break
            qStart2 += size
            tStart2 += size
        tStart2, qStart2 = tEnd2, qEnd2
    coordinates = numpy.array(path).transpose()
    if strand1 != strand2:
        # the alignments are on different strands, so the query of the
        # result is reported in reverse-strand coordinates
        coordinates[1, :] = n2 - coordinates[1, :]
    sequences = [target, query]
    alignment = Alignment(sequences, coordinates)
    return alignment
def substitutions(self):
    """Return an Array with the number of substitutions of letters in the alignment.

    As an example, consider a sequence alignment of two RNA sequences:

    >>> from Bio.Align import PairwiseAligner
    >>> target = "ATACTTACCTGGCAGGGGAGATACCATGATCACGAAGGTGGTTTTCCCAGGGCGAGGCTTATCCATTGCACTCCGGATGTGCTGACCCCTGCGATTTCCCCAAATGTGGGAAACTCGACTGCATAATTTGTGGTAGTGGGGGACTGCGTTCGCGCTTTCCCCTG"  # human spliceosomal small nuclear RNA U1
    >>> query = "ATACTTACCTGACAGGGGAGGCACCATGATCACACAGGTGGTCCTCCCAGGGCGAGGCTCTTCCATTGCACTGCGGGAGGGTTGACCCCTGCGATTTCCCCAAATGTGGGAAACTCGACTGTATAATTTGTGGTAGTGGGGGACTGCGTTCGCGCTATCCCCCG"  # sea lamprey spliceosomal small RNA U1
    >>> aligner = PairwiseAligner()
    >>> aligner.gap_score = -10
    >>> alignments = aligner.align(target, query)
    >>> len(alignments)
    1
    >>> alignment = alignments[0]
    >>> print(alignment)
    target            0 ATACTTACCTGGCAGGGGAGATACCATGATCACGAAGGTGGTTTTCCCAGGGCGAGGCTT
                      0 |||||||||||.||||||||..|||||||||||..|||||||..|||||||||||||||.
    query             0 ATACTTACCTGACAGGGGAGGCACCATGATCACACAGGTGGTCCTCCCAGGGCGAGGCTC
    <BLANKLINE>
    target           60 ATCCATTGCACTCCGGATGTGCTGACCCCTGCGATTTCCCCAAATGTGGGAAACTCGACT
                     60 .|||||||||||.|||..|.|.||||||||||||||||||||||||||||||||||||||
    query            60 TTCCATTGCACTGCGGGAGGGTTGACCCCTGCGATTTCCCCAAATGTGGGAAACTCGACT
    <BLANKLINE>
    target          120 GCATAATTTGTGGTAGTGGGGGACTGCGTTCGCGCTTTCCCCTG 164
                    120 |.||||||||||||||||||||||||||||||||||.|||||.| 164
    query           120 GTATAATTTGTGGTAGTGGGGGACTGCGTTCGCGCTATCCCCCG 164
    <BLANKLINE>
    >>> m = alignment.substitutions
    >>> print(m)
         A    C    G    T
    A 28.0  1.0  2.0  1.0
    C  0.0 39.0  1.0  2.0
    G  2.0  0.0 45.0  0.0
    T  2.0  5.0  1.0 35.0
    <BLANKLINE>

    Note that the matrix is not symmetric: rows correspond to the target
    sequence, and columns to the query sequence. For example, the number
    of T's in the target sequence that are aligned to a C in the query
    sequence is

    >>> m['T', 'C']
    5.0

    and the number of C's in the target sequence that are aligned to a T in
    the query sequence is

    >>> m['C', 'T']
    2.0

    For some applications (for example, to define a scoring matrix from
    the substitution matrix), a symmetric matrix may be preferred, which
    can be calculated as follows:

    >>> m += m.transpose()
    >>> m /= 2.0
    >>> print(m)
         A    C    G    T
    A 28.0  0.5  2.0  1.5
    C  0.5 39.0  0.5  3.5
    G  2.0  0.5 45.0  0.5
    T  1.5  3.5  0.5 35.0
    <BLANKLINE>

    The matrix is now symmetric, with counts divided equally on both sides
    of the diagonal:

    >>> m['C', 'T']
    3.5
    >>> m['T', 'C']
    3.5

    The total number of substitutions between T's and C's in the alignment
    is 3.5 + 3.5 = 7.

    Raises ValueError if a row of the coordinates mixes forward and
    backward steps, or if two sequences advance by different amounts within
    a step in which both are aligned.
    """
    # Work on copies so that rows on the reverse strand can be flipped
    # without modifying the alignment itself.
    coordinates = self.coordinates.copy()
    sequences = list(self.sequences)
    steps = numpy.diff(self.coordinates, 1)
    # True for steps in which at least two sequences align, False if a gap
    aligned = (steps != 0).sum(0) > 1
    for i, sequence in enumerate(sequences):
        row = steps[i, aligned]
        if (row >= 0).all():
            continue
        if (row <= 0).all():
            # Decreasing coordinates: use the reverse complement and mirror
            # the coordinates so that they increase along the alignment.
            sequences[i] = reverse_complement(sequence, inplace=False)
            coordinates[i, :] = len(sequence) - coordinates[i, :]
        else:
            raise ValueError(f"Inconsistent steps in row {i}")
    # Collect the alphabet from the letters that actually occur.
    letters = set()
    for sequence in sequences:
        try:
            s = set(sequence)
        except UndefinedSequenceError:
            # Partially defined sequence: only scan the defined stretches.
            try:
                sequence = sequence.seq  # SeqRecord confusion
            except AttributeError:
                pass
            for start, end in sequence.defined_ranges:
                s = set(sequence[start:end])
                letters.update(s)
        else:
            letters.update(s)
    letters = "".join(sorted(letters))
    m = substitution_matrices.Array(letters, dims=2)
    n = len(sequences)
    for i1 in range(n):
        sequence1 = sequences[i1]
        coordinates1 = coordinates[i1, :]
        for i2 in range(i1 + 1, n):
            sequence2 = sequences[i2]
            coordinates2 = coordinates[i2, :]
            # sys.maxsize as the initial start makes the first column of
            # coordinates act purely as a starting point.
            start1, start2 = sys.maxsize, sys.maxsize
            for end1, end2 in zip(coordinates1, coordinates2):
                if start1 < end1 and start2 < end2:  # aligned
                    segment1 = sequence1[start1:end1]
                    segment2 = sequence2[start2:end2]
                    if len(segment1) != len(segment2):
                        raise ValueError("Unequal step sizes in alignment")
                    for c1, c2 in zip(segment1, segment2):
                        m[c1, c2] += 1.0
                start1, start2 = end1, end2
    return m
def counts(self):
    """Return number of identities, mismatches, and gaps, summed over all sequence pairs.

    >>> aligner = PairwiseAligner(mode='global', match_score=2, mismatch_score=-1)
    >>> for alignment in aligner.align("TACCG", "ACG"):
    ...     print("Score = %.1f:" % alignment.score)
    ...     c = alignment.counts()  # namedtuple
    ...     print(f"{c.gaps} gaps, {c.identities} identities, {c.mismatches} mismatches")
    ...     print(alignment)
    ...
    Score = 6.0:
    2 gaps, 3 identities, 0 mismatches
    target            0 TACCG 5
                      0 -||-| 5
    query             0 -AC-G 3
    <BLANKLINE>
    Score = 6.0:
    2 gaps, 3 identities, 0 mismatches
    target            0 TACCG 5
                      0 -|-|| 5
    query             0 -A-CG 3
    <BLANKLINE>

    This classifies each pair of letters in a pairwise alignment into gaps,
    perfect matches, or mismatches. It has been defined as a method (not a
    property) so that it may in future take optional argument(s) allowing
    the behaviour to be customised. These three values are returned as a
    namedtuple. This is calculated for all the pairs of sequences in the
    alignment.
    """
    gaps = identities = mismatches = 0
    # Materialise the rows once; the nested loops below would otherwise
    # iterate over the alignment again for every row.
    rows = list(self)
    for i, seq1 in enumerate(rows):
        # Only pair each row with the rows before it, so that seq1 vs seq2
        # and seq2 vs seq1 are not both counted.
        for seq2 in rows[:i]:
            for a, b in zip(seq1, seq2):
                if a == "-" or b == "-":
                    gaps += 1
                elif a == b:
                    identities += 1
                else:
                    mismatches += 1
    return AlignmentCounts(gaps, identities, mismatches)
class PairwiseAlignments:
    """Implements an iterator over pairwise alignments returned by the aligner.

    This class also supports indexing, which is fast for increasing indices,
    but may be slow for random access of a large number of alignments.

    Note that pairwise aligners can return an astronomical number of alignments,
    even for relatively short sequences, if they align poorly to each other. We
    therefore recommend to first check the number of alignments, accessible as
    len(alignments), which can be calculated quickly even if the number of
    alignments is very large.
    """

    def __init__(self, seqA, seqB, score, paths):
        """Initialize a new PairwiseAlignments object.

        Arguments:
         - seqA  - The first sequence, as a plain string, without gaps.
         - seqB  - The second sequence, as a plain string, without gaps.
         - score - The alignment score.
         - paths - An iterator over the paths in the traceback matrix;
                   each path defines one alignment.

        You would normally obtain a PairwiseAlignments object by calling
        aligner.align(seqA, seqB), where aligner is a PairwiseAligner object.
        """
        self.sequences = [seqA, seqB]
        self.score = score
        self._paths = paths
        # Index of the most recently generated alignment; -1 means that no
        # alignment has been generated since the last reset.
        self._index = -1

    def __len__(self):
        """Return the number of alignments."""
        return len(self._paths)

    def __getitem__(self, index):
        """Return the alignment at the given integer index.

        Negative indices count from the end. Raises TypeError if index is
        not an integer, and IndexError if it is out of range.
        """
        if not isinstance(index, int):
            raise TypeError(f"index must be an integer, not {index.__class__.__name__}")
        if index < 0:
            index += len(self._paths)
            if index < 0:
                # Without this check, a still-negative index could match
                # the initial self._index of -1 and try to return an
                # alignment that was never generated.
                raise IndexError("index out of range")
        if index == self._index:
            return self._alignment
        if index < self._index:
            # The paths can only be walked forwards; start again.
            self._paths.reset()
            self._index = -1
        while True:
            try:
                alignment = next(self)
            except StopIteration:
                raise IndexError("index out of range") from None
            if self._index == index:
                break
        return alignment

    def __iter__(self):
        """Restart the iteration from the first alignment and return self."""
        self._paths.reset()
        self._index = -1
        return self

    def __next__(self):
        """Build and return the next alignment, remembering it for indexing."""
        path = next(self._paths)
        self._index += 1
        coordinates = numpy.array(path)
        alignment = Alignment(self.sequences, coordinates)
        alignment.score = self.score
        self._alignment = alignment
        return alignment
class PairwiseAligner(_aligners.PairwiseAligner):
    """Performs pairwise sequence alignment using dynamic programming.

    This provides functions to get global and local alignments between two
    sequences. A global alignment finds the best concordance between all
    characters in two sequences. A local alignment finds just the
    subsequences that align the best.

    To perform a pairwise sequence alignment, first create a PairwiseAligner
    object. This object stores the match and mismatch scores, as well as the
    gap scores. Typically, match scores are positive, while mismatch scores
    and gap scores are negative or zero. By default, the match score is 1,
    and the mismatch and gap scores are zero. Based on the values of the gap
    scores, a PairwiseAligner object automatically chooses the appropriate
    alignment algorithm (the Needleman-Wunsch, Smith-Waterman, Gotoh, or
    Waterman-Smith-Beyer global or local alignment algorithm).

    Calling the "score" method on the aligner with two sequences as arguments
    will calculate the alignment score between the two sequences.
    Calling the "align" method on the aligner with two sequences as arguments
    will return a PairwiseAlignments object, an iterator over the alignments
    between the two sequences.

    Some examples:

    >>> from Bio import Align
    >>> aligner = Align.PairwiseAligner()
    >>> alignments = aligner.align("TACCG", "ACG")
    >>> for alignment in sorted(alignments):
    ...     print("Score = %.1f:" % alignment.score)
    ...     print(alignment)
    ...
    Score = 3.0:
    target            0 TACCG 5
                      0 -|-|| 5
    query             0 -A-CG 3
    <BLANKLINE>
    Score = 3.0:
    target            0 TACCG 5
                      0 -||-| 5
    query             0 -AC-G 3
    <BLANKLINE>

    Specify the aligner mode as local to generate local alignments:

    >>> aligner.mode = 'local'
    >>> alignments = aligner.align("TACCG", "ACG")
    >>> for alignment in sorted(alignments):
    ...     print("Score = %.1f:" % alignment.score)
    ...     print(alignment)
    ...
    Score = 3.0:
    target            1 ACCG 5
                      0 |-|| 4
    query             0 A-CG 3
    <BLANKLINE>
    Score = 3.0:
    target            1 ACCG 5
                      0 ||-| 4
    query             0 AC-G 3
    <BLANKLINE>

    Do a global alignment. Identical characters are given 2 points,
    1 point is deducted for each non-identical character.

    >>> aligner.mode = 'global'
    >>> aligner.match_score = 2
    >>> aligner.mismatch_score = -1
    >>> for alignment in aligner.align("TACCG", "ACG"):
    ...     print("Score = %.1f:" % alignment.score)
    ...     print(alignment)
    ...
    Score = 6.0:
    target            0 TACCG 5
                      0 -||-| 5
    query             0 -AC-G 3
    <BLANKLINE>
    Score = 6.0:
    target            0 TACCG 5
                      0 -|-|| 5
    query             0 -A-CG 3
    <BLANKLINE>

    Same as above, except now 0.5 points are deducted when opening a
    gap, and 0.1 points are deducted when extending it.

    >>> aligner.open_gap_score = -0.5
    >>> aligner.extend_gap_score = -0.1
    >>> aligner.target_end_gap_score = 0.0
    >>> aligner.query_end_gap_score = 0.0
    >>> for alignment in aligner.align("TACCG", "ACG"):
    ...     print("Score = %.1f:" % alignment.score)
    ...     print(alignment)
    ...
    Score = 5.5:
    target            0 TACCG 5
                      0 -|-|| 5
    query             0 -A-CG 3
    <BLANKLINE>
    Score = 5.5:
    target            0 TACCG 5
                      0 -||-| 5
    query             0 -AC-G 3
    <BLANKLINE>

    The alignment function can also use known matrices already included in
    Biopython:

    >>> from Bio.Align import substitution_matrices
    >>> aligner = Align.PairwiseAligner()
    >>> aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
    >>> alignments = aligner.align("KEVLA", "EVL")
    >>> alignments = list(alignments)
    >>> print("Number of alignments: %d" % len(alignments))
    Number of alignments: 1
    >>> alignment = alignments[0]
    >>> print("Score = %.1f" % alignment.score)
    Score = 13.0
    >>> print(alignment)
    target            0 KEVLA 5
                      0 -|||- 5
    query             0 -EVL- 3
    <BLANKLINE>

    You can also set the value of attributes directly during construction
    of the PairwiseAligner object by providing them as keyword arguments:

    >>> aligner = Align.PairwiseAligner(mode='global', match_score=2, mismatch_score=-1)
    >>> for alignment in aligner.align("TACCG", "ACG"):
    ...     print("Score = %.1f:" % alignment.score)
    ...     print(alignment)
    ...
    Score = 6.0:
    target            0 TACCG 5
                      0 -||-| 5
    query             0 -AC-G 3
    <BLANKLINE>
    Score = 6.0:
    target            0 TACCG 5
                      0 -|-|| 5
    query             0 -A-CG 3
    <BLANKLINE>

    """

    # Attributes stored as-is by __getstate__ and restored, in this order,
    # by __setstate__. The match/mismatch scores and the substitution matrix
    # are handled separately because only one of the two is pickled.
    _pickled_attributes = (
        "wildcard",
        "target_internal_open_gap_score",
        "target_internal_extend_gap_score",
        "target_left_open_gap_score",
        "target_left_extend_gap_score",
        "target_right_open_gap_score",
        "target_right_extend_gap_score",
        "query_internal_open_gap_score",
        "query_internal_extend_gap_score",
        "query_left_open_gap_score",
        "query_left_extend_gap_score",
        "query_right_open_gap_score",
        "query_right_extend_gap_score",
        "mode",
    )

    def __init__(self, scoring=None, **kwargs):
        """Initialize a new PairwiseAligner as specified by the keyword arguments.

        If scoring is None, use the default scoring scheme match = 1.0,
        mismatch = 0.0, gap score = 0.0
        If scoring is "blastn", "megablast", or "blastp", use the default
        substitution matrix and gap scores for BLASTN, MEGABLAST, or BLASTP,
        respectively. Any other value raises a ValueError.

        Loops over the remaining keyword arguments and sets them as attributes
        on the object.
        """
        super().__init__()
        if scoring is None:
            # use default values:
            # match = 1.0
            # mismatch = 0.0
            # gap_score = 0.0
            pass
        elif scoring == "blastn":
            self.substitution_matrix = substitution_matrices.load("BLASTN")
            self.open_gap_score = -7.0
            self.extend_gap_score = -2.0
        elif scoring == "megablast":
            self.substitution_matrix = substitution_matrices.load("MEGABLAST")
            self.open_gap_score = -2.5
            self.extend_gap_score = -2.5
        elif scoring == "blastp":
            self.substitution_matrix = substitution_matrices.load("BLASTP")
            self.open_gap_score = -12.0
            self.extend_gap_score = -1.0
        else:
            # f-string rather than %-formatting, so that a tuple passed by
            # mistake is reported instead of breaking the message itself.
            raise ValueError(f"Unknown scoring scheme '{scoring}'")
        for name, value in kwargs.items():
            setattr(self, name, value)

    def __setattr__(self, key, value):
        """Set an attribute, rejecting names the base aligner does not define."""
        if key not in dir(_aligners.PairwiseAligner):
            # To prevent confusion, don't allow users to create new attributes.
            # On CPython, __slots__ can be used for this, but currently
            # __slots__ does not behave the same way on PyPy at least.
            raise AttributeError("'PairwiseAligner' object has no attribute '%s'" % key)
        _aligners.PairwiseAligner.__setattr__(self, key, value)

    def align(self, seqA, seqB, strand="+"):
        """Return the alignments of two sequences using PairwiseAligner.

        Arguments:
         - seqA   - The first (target) sequence.
         - seqB   - The second (query) sequence.
         - strand - "+" (default) to align seqB as given; for any other
                    value seqB is reverse-complemented before aligning.

        Returns a PairwiseAlignments object holding the original seqA and
        seqB together with the score and the alignment paths.
        """
        # Seq, MutableSeq and SeqRecord objects are passed to the underlying
        # aligner as bytes; other objects are passed through unchanged.
        if isinstance(seqA, (Seq, MutableSeq, SeqRecord)):
            sA = bytes(seqA)
        else:
            sA = seqA
        if strand == "+":
            sB = seqB
        else:  # strand == "-":
            sB = reverse_complement(seqB, inplace=False)
        if isinstance(seqB, (Seq, MutableSeq, SeqRecord)):
            sB = bytes(sB)
        score, paths = _aligners.PairwiseAligner.align(self, sA, sB, strand)
        alignments = PairwiseAlignments(seqA, seqB, score, paths)
        return alignments

    def score(self, seqA, seqB, strand="+"):
        """Return the alignment score of two sequences using PairwiseAligner.

        Arguments:
         - seqA   - The first (target) sequence.
         - seqB   - The second (query) sequence.
         - strand - "+" (default) to score seqB as given, "-" to score its
                    reverse complement.
        """
        if isinstance(seqA, (Seq, MutableSeq, SeqRecord)):
            seqA = bytes(seqA)
        if strand == "-":
            seqB = reverse_complement(seqB, inplace=False)
        if isinstance(seqB, (Seq, MutableSeq, SeqRecord)):
            seqB = bytes(seqB)
        return _aligners.PairwiseAligner.score(self, seqA, seqB, strand)

    def __getstate__(self):
        """Return the aligner settings as a dictionary for pickling."""
        state = {name: getattr(self, name) for name in self._pickled_attributes}
        if self.substitution_matrix is None:
            state["match_score"] = self.match_score
            state["mismatch_score"] = self.mismatch_score
        else:
            state["substitution_matrix"] = self.substitution_matrix
        return state

    def __setstate__(self, state):
        """Restore the aligner settings from a dictionary made by __getstate__."""
        for name in self._pickled_attributes:
            setattr(self, name, state[name])
        substitution_matrix = state.get("substitution_matrix")
        if substitution_matrix is None:
            self.match_score = state["match_score"]
            self.mismatch_score = state["mismatch_score"]
        else:
            self.substitution_matrix = substitution_matrix
class PairwiseAlignment(Alignment):
    """Represents a pairwise sequence alignment.

    Internally, the pairwise alignment is stored as the path through
    the traceback matrix, i.e. a tuple of pairs of indices corresponding
    to the vertices of the path in the traceback matrix.

    This class is deprecated; creating an instance emits a
    BiopythonDeprecationWarning. Use the Alignment class instead.
    """

    def __init__(self, target, query, path, score):
        """Initialize a new PairwiseAlignment object.

        Arguments:
         - target  - The first sequence, as a plain string, without gaps.
         - query   - The second sequence, as a plain string, without gaps.
         - path    - The path through the traceback matrix, defining an
                     alignment.
         - score   - The alignment score.

        You would normally obtain a PairwiseAlignment object by iterating
        over a PairwiseAlignments object.
        """
        warnings.warn(
            "The PairwiseAlignment class is deprecated; please use the "
            "Alignment class instead.  Note that the coordinates attribute of "
            "an Alignment object is a numpy array and the transpose of the "
            "path attribute of a PairwiseAlignment object.",
            BiopythonDeprecationWarning,
            # Attribute the warning to the code creating the object rather
            # than to this constructor.
            stacklevel=2,
        )
        sequences = [target, query]
        # The Alignment base class takes one row of coordinates per
        # sequence, i.e. the transpose of the list of path vertices.
        coordinates = numpy.array(path).transpose()
        super().__init__(sequences, coordinates)
        self.score = score
# Names of the supported alignment file formats. _load() accepts only these
# names (after lowercasing) and imports the submodule Bio.Align.<name>.
# fmt: off
formats = (
    "a2m",        # A2M files created by align2model or hmmscore
    "bed",        # BED (Browser Extensible Data) files
    "bigbed",     # bigBed format
    "bigmaf",     # MAF file saved as a bigBed file
    "bigpsl",     # PSL file saved as a bigBed file
    "clustal",    # clustal output from CLUSTAL W and other tools.
    "emboss",     # emboss output from EMBOSS tools such as needle, water
    "exonerate",  # Exonerate pairwise alignment output
    "fasta",      # FASTA format with gaps represented by dashes
    "hhr",        # hhr files generated by HHsearch, HHblits in HH-suite
    "maf",        # MAF (Multiple Alignment Format) format.
    "mauve",      # xmfa output from Mauve/ProgressiveMauve
    "msf",        # MSF format produced by GCG PileUp and LocalPileUp
    "nexus",      # Nexus file format
    "phylip",     # Alignment format for input files for PHYLIP tools
    "psl",        # Pattern Space Layout (PSL) format generated by Blat
    "sam",        # Sequence Alignment/Map (SAM) format
    "stockholm",  # Stockholm file format used by PFAM and others
    "tabular",    # Tabular output from BLAST or FASTA
)
# fmt: on

# Cache of format modules that have already been imported, keyed by the
# lowercase format name; filled in by _load().
_modules = {}
def _load(fmt):
    """Return the Bio.Align submodule implementing the given file format.

    The format name is case-insensitive. Each module is imported on first
    use and then kept in the module-level _modules cache. A ValueError is
    raised if the lowercased name is not listed in formats.
    """
    key = fmt.lower()
    cached = _modules.get(key)
    if cached is not None:
        return cached
    if key not in formats:
        raise ValueError("Unknown file format %s" % key)
    loaded = importlib.import_module(f"Bio.Align.{key}")
    _modules[key] = loaded
    return loaded
def write(alignments, target, fmt, *args, **kwargs):
    """Write alignments to a file.

    Arguments:
     - alignments - List (or iterator) of Alignment objects, or a single
       Alignment.
     - target     - File or file-like object to write to, or filename as string.
     - fmt        - String describing the file format (case-insensitive).

    Any additional positional and keyword arguments are passed on to the
    AlignmentWriter of the chosen format.

    Note if providing a file or file-like object, your code should close the
    target after calling this function, or call .flush(), to ensure the data
    gets flushed to disk.

    Returns the number of alignments written (as an integer).

    Raises ValueError if the format is unknown, or if its module does not
    provide an AlignmentWriter.
    """
    if isinstance(alignments, Alignment):
        alignments = [alignments]
    module = _load(fmt)
    try:
        writer = module.AlignmentWriter
    except AttributeError:
        # The AttributeError is an implementation detail; hide it from the
        # traceback, as done for the other re-raised errors in this module.
        raise ValueError(
            f"File writing has not yet been implemented for the {fmt} format"
        ) from None
    return writer(target, *args, **kwargs).write_file(alignments)
def parse(source, fmt):
    """Parse an alignment file and return an iterator over alignments.

    Arguments:
     - source - File or file-like object to read from, or filename as string.
     - fmt    - String describing the file format (case-insensitive).

    Typical usage, opening a file to read in, and looping over the alignments:

    >>> from Bio import Align
    >>> filename = "Exonerate/exn_22_m_ner_cigar.exn"
    >>> for alignment in Align.parse(filename, "exonerate"):
    ...     print("Number of sequences in alignment", len(alignment))
    ...     print("Alignment score:", alignment.score)
    Number of sequences in alignment 2
    Alignment score: 6150.0
    Number of sequences in alignment 2
    Alignment score: 502.0
    Number of sequences in alignment 2
    Alignment score: 440.0

    For lazy-loading file formats such as bigMaf, for which the file contents
    is read on demand only, ensure that the file remains open while extracting
    alignment data.

    You can use the Bio.Align.read(...) function when the file contains only
    one alignment.
    """
    # Each format module supplies its own AlignmentIterator class.
    iterator_class = _load(fmt).AlignmentIterator
    return iterator_class(source)
def read(handle, fmt):
    """Parse a file containing one alignment, and return it.

    Arguments:
     - handle - File or file-like object to read from, or filename as string.
     - fmt    - String describing the file format (case-insensitive).

    This function is for use parsing alignment files containing exactly one
    alignment.  For example, reading a Clustal file:

    >>> from Bio import Align
    >>> alignment = Align.read("Clustalw/opuntia.aln", "clustal")
    >>> print("Alignment shape:", alignment.shape)
    Alignment shape: (7, 156)
    >>> for sequence in alignment.sequences:
    ...     print(sequence.id, len(sequence))
    gi|6273285|gb|AF191659.1|AF191 146
    gi|6273284|gb|AF191658.1|AF191 148
    gi|6273287|gb|AF191661.1|AF191 146
    gi|6273286|gb|AF191660.1|AF191 146
    gi|6273290|gb|AF191664.1|AF191 150
    gi|6273289|gb|AF191663.1|AF191 150
    gi|6273291|gb|AF191665.1|AF191 156

    If the file contains no records, or more than one record, an exception is
    raised.  For example:

    >>> from Bio import Align
    >>> filename = "Exonerate/exn_22_m_ner_cigar.exn"
    >>> alignment = Align.read(filename, "exonerate")
    Traceback (most recent call last):
        ...
    ValueError: More than one alignment found in file

    Use the Bio.Align.parse function if you want to read a file containing
    more than one alignment.
    """
    iterator = parse(handle, fmt)
    try:
        only = next(iterator)
    except StopIteration:
        raise ValueError("No alignments found in file") from None
    # A second successful next() means the file holds more than one
    # alignment; running out of alignments here is the expected case.
    try:
        next(iterator)
    except StopIteration:
        return only
    raise ValueError("More than one alignment found in file")
if __name__ == "__main__":
    # Executed only when this file is run as a script: call Biopython's
    # run_doctest helper (presumably runs the docstring examples above).
    from Bio._utils import run_doctest

    run_doctest()