Spaces:
No application file
No application file
| # Copyright 2000-2003 Jeff Chang. | |
| # Copyright 2001-2008 Brad Chapman. | |
| # Copyright 2005-2016 by Peter Cock. | |
| # Copyright 2006-2009 Michiel de Hoon. | |
| # All rights reserved. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Represent a Sequence Feature holding info about a part of a sequence. | |
| This is heavily modeled after the Biocorba SeqFeature objects, and | |
| may be pretty biased towards GenBank stuff since I'm writing it | |
| for the GenBank parser output... | |
| What's here: | |
| Base class to hold a Feature | |
| ---------------------------- | |
| Classes: | |
| - SeqFeature | |
| Hold information about a Reference | |
| ---------------------------------- | |
| This is an attempt to create a General class to hold Reference type | |
| information. | |
| Classes: | |
| - Reference | |
| Specify locations of a feature on a Sequence | |
| -------------------------------------------- | |
| This aims to handle, in Ewan Birney's words, 'the dreaded fuzziness issue'. | |
| This has the advantages of allowing us to handle fuzzy stuff in case anyone | |
| needs it, and also be compatible with BioPerl etc and BioSQL. | |
| Classes: | |
| - Location - abstract base class of SimpleLocation and CompoundLocation. | |
| - SimpleLocation - Specify the start and end location of a feature. | |
| - CompoundLocation - Collection of SimpleLocation objects (for joins etc). | |
| - Position - abstract base class of ExactPosition, WithinPosition, | |
| BetweenPosition, AfterPosition, OneOfPosition, UncertainPosition, and | |
| UnknownPosition. | |
| - ExactPosition - Specify the position as being exact. | |
| - WithinPosition - Specify a position occurring within some range. | |
| - BetweenPosition - Specify a position occurring between a range (OBSOLETE?). | |
| - BeforePosition - Specify the position as being found before some base. | |
| - AfterPosition - Specify the position as being found after some base. | |
| - OneOfPosition - Specify a position consisting of multiple alternative positions. | |
| - UncertainPosition - Specify a specific position which is uncertain. | |
| - UnknownPosition - Represents missing information like '?' in UniProt. | |
| Exceptions: | |
| - LocationParserError - Exception indicating a failure to parse a location | |
| string. | |
| """ | |
| import functools | |
| import re | |
| import warnings | |
| from abc import ABC, abstractmethod | |
| from Bio import BiopythonParserWarning | |
| from Bio import BiopythonDeprecationWarning | |
| from Bio.Seq import MutableSeq | |
| from Bio.Seq import reverse_complement | |
| from Bio.Seq import Seq | |
# Regular expressions for location parsing.
#
# Several names below (_within_position, _oneof_position, _oneof_location)
# are intentionally bound twice: first to a group-free fragment that is
# embedded in a larger pattern, then to a capturing variant that is compiled
# on its own.  The order of the statements therefore matters.
#
# The bare ``assert`` statements are import-time self-checks of the patterns.

_reference = r"(?:[a-zA-Z][a-zA-Z0-9_\.\|]*[a-zA-Z0-9]?\:)"
# A one-of position lists two or more comma separated integers.  Each extra
# alternative must be ",<digits>"; a character class such as [,\d+] would
# also accept input like "one-of(36)" or "one-of(3,+)".
_oneof_position = r"one\-of\(\d+(?:,\d+)+\)"
_oneof_location = rf"[<>]?(?:\d+|{_oneof_position})\.\.[<>]?(?:\d+|{_oneof_position})"
_any_location = rf"({_reference}?{_oneof_location}|complement\({_oneof_location}\)|[^,]+|complement\([^,]+\))"

# Splitting on a pattern with one capturing group interleaves the separators
# with the captured locations, so the locations sit at the odd indices
# (hence the ``[1::2]`` slices used with this helper).
_split = re.compile(_any_location).split

assert _split("123..145")[1::2] == ["123..145"]
assert _split("123..145,200..209")[1::2] == ["123..145", "200..209"]
assert _split("one-of(200,203)..300")[1::2] == ["one-of(200,203)..300"]
assert _split("complement(123..145),200..209")[1::2] == [
    "complement(123..145)",
    "200..209",
]
assert _split("123..145,one-of(200,203)..209")[1::2] == [
    "123..145",
    "one-of(200,203)..209",
]
assert _split("123..145,one-of(200,203)..one-of(209,211),300")[1::2] == [
    "123..145",
    "one-of(200,203)..one-of(209,211)",
    "300",
]
assert _split("123..145,complement(one-of(200,203)..one-of(209,211)),300")[1::2] == [
    "123..145",
    "complement(one-of(200,203)..one-of(209,211))",
    "300",
]
assert _split("123..145,200..one-of(209,211),300")[1::2] == [
    "123..145",
    "200..one-of(209,211)",
    "300",
]
assert _split("123..145,200..one-of(209,211)")[1::2] == [
    "123..145",
    "200..one-of(209,211)",
]
assert _split(
    "complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905"
)[1::2] == [
    "complement(149815..150200)",
    "complement(293787..295573)",
    "NC_016402.1:6618..6676",
    "181647..181905",
]

_pair_location = r"[<>]?-?\d+\.\.[<>]?-?\d+"

_between_location = r"\d+\^\d+"

# Group-free form first (embedded in _within_location) ...
_within_position = r"\(\d+\.\d+\)"
_within_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" % (
    _within_position,
    _within_position,
)
# ... then the capturing form, which exposes the two bounds as groups.
_within_position = r"\((\d+)\.(\d+)\)"
_re_within_position = re.compile(_within_position)
assert _re_within_position.match("(3.9)")

# Built from the group-free _oneof_position defined above ...
_oneof_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" % (_oneof_position, _oneof_position)
# ... then rebind to the capturing form; group 1 is the comma separated list.
_oneof_position = r"one\-of\((\d+(?:,\d+)+)\)"
_re_oneof_position = re.compile(_oneof_position)
assert _re_oneof_position.match("one-of(6,9)")
assert not _re_oneof_position.match("one-of(3)")
assert not _re_oneof_position.match("one-of(36)")
assert _re_oneof_position.match("one-of(3,6)")
assert _re_oneof_position.match("one-of(3,6,9)")

_solo_location = r"[<>]?\d+"
_solo_bond = r"bond\(%s\)" % _solo_location

# NOTE(review): "|" binds more loosely than the anchors, so "^" applies only
# to the <pair> alternative and "$" only to the <solo> alternative.  The
# pattern is kept as-is because its consumers are outside this section;
# confirm before tightening it.
_re_location_category = re.compile(
    r"^(?P<pair>%s)|(?P<between>%s)|(?P<within>%s)|(?P<oneof>%s)|(?P<bond>%s)|(?P<solo>%s)$"
    % (
        _pair_location,
        _between_location,
        _within_location,
        _oneof_location,
        _solo_bond,
        _solo_location,
    )
)
class LocationParserError(ValueError):
    """Signal that a feature location string could not be parsed.

    This is a ValueError subclass, so handlers written for ValueError
    also catch it.
    """
class SeqFeature:
    """Represent a Sequence Feature on an object.

    Attributes:
     - location - the location of the feature on the sequence (SimpleLocation)
     - type - the specified type of the feature (ie. CDS, exon, repeat...)
     - location_operator - a string specifying how this SeqFeature may
       be related to others. For example, in the example GenBank feature
       shown below, the location_operator would be "join". This is a proxy
       for feature.location.operator and only applies to compound locations.
     - strand - A value specifying on which strand (of a DNA sequence, for
       instance) the feature deals with. 1 indicates the plus strand, -1
       indicates the minus strand, 0 indicates stranded but unknown (? in GFF3),
       while the default of None indicates that strand doesn't apply (dot in GFF3,
       e.g. features on proteins). Note this is a shortcut for accessing the
       strand property of the feature's location.
     - id - A string identifier for the feature.
     - ref - A reference to another sequence. This could be an accession
       number for some different sequence. Note this is a shortcut for the
       reference property of the feature's location.
     - ref_db - A different database for the reference accession number.
       Note this is a shortcut for the reference property of the location
     - qualifiers - A dictionary of qualifiers on the feature. These are
       analogous to the qualifiers from a GenBank feature table. The keys of
       the dictionary are qualifier names, the values are the qualifier
       values.

    """

    def __init__(
        self,
        location=None,
        type="",
        location_operator="",
        strand=None,
        id="<unknown id>",
        qualifiers=None,
        sub_features=None,
        ref=None,
        ref_db=None,
    ):
        """Initialize a SeqFeature on a sequence.

        location can either be a SimpleLocation (with strand argument also
        given if required), or None.

        e.g. With no strand, on the forward strand, and on the reverse strand:

        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> f1 = SeqFeature(SimpleLocation(5, 10), type="domain")
        >>> f1.strand == f1.location.strand == None
        True
        >>> f2 = SeqFeature(SimpleLocation(7, 110, strand=1), type="CDS")
        >>> f2.strand == f2.location.strand == +1
        True
        >>> f3 = SeqFeature(SimpleLocation(9, 108, strand=-1), type="CDS")
        >>> f3.strand == f3.location.strand == -1
        True

        For exact start/end positions, an integer can be used (as shown above)
        as shorthand for the ExactPosition object. For non-exact locations, the
        SimpleLocation must be specified via the appropriate position objects.

        Note that the strand, ref and ref_db arguments to the SeqFeature are
        now deprecated and will later be removed. Set them via the location
        object instead.

        Note that location_operator and sub_features arguments can no longer
        be used, instead do this via the CompoundLocation object.

        Raises TypeError if location is neither None, a SimpleLocation nor a
        CompoundLocation, or if sub_features is given. Setting strand, ref,
        ref_db or location_operator goes through the matching property and
        may raise ValueError (e.g. when there is no location to store the
        value on).
        """
        if location is not None and not isinstance(
            location, (SimpleLocation, CompoundLocation)
        ):
            raise TypeError(
                "SimpleLocation, CompoundLocation (or None) required for the location"
            )
        self.location = location
        self.type = type
        # The deprecated arguments below are only forwarded (with a warning)
        # when explicitly supplied, so the common case stays silent.
        if location_operator:
            warnings.warn(
                "Using the location_operator argument is deprecated, and will be removed in a future release. "
                "Please do this via the CompoundLocation object instead.",
                BiopythonDeprecationWarning,
            )
            self.location_operator = location_operator
        if strand is not None:
            warnings.warn(
                "Using the strand argument is deprecated, and will be removed in a future release. "
                "Please set it via the location object instead.",
                BiopythonDeprecationWarning,
            )
            self.strand = strand
        self.id = id
        # Always store a fresh dict so the caller's mapping is never shared.
        self.qualifiers = {}
        if qualifiers is not None:
            self.qualifiers.update(qualifiers)
        if sub_features is not None:
            raise TypeError("Rather than sub_features, use a CompoundLocation")
        if ref is not None:
            warnings.warn(
                "Using the ref argument is deprecated, and will be removed in a future release. "
                "Please set it via the location object instead.",
                BiopythonDeprecationWarning,
            )
            self.ref = ref
        if ref_db is not None:
            warnings.warn(
                "Using the ref_db argument is deprecated, and will be removed in a future release. "
                "Please set it via the location object instead.",
                BiopythonDeprecationWarning,
            )
            self.ref_db = ref_db

    def _get_strand(self):
        """Get function for the strand property (PRIVATE)."""
        return self.location.strand

    def _set_strand(self, value):
        """Set function for the strand property (PRIVATE)."""
        try:
            self.location.strand = value
        except AttributeError:
            if self.location is None:
                # Setting None on a missing location is a harmless no-op.
                if value is not None:
                    raise ValueError("Can't set strand without a location.") from None
            else:
                raise

    strand = property(
        fget=_get_strand,
        fset=_set_strand,
        doc="""Feature's strand

                          This is a shortcut for feature.location.strand
                          """,
    )

    def _get_ref(self):
        """Get function for the reference property (PRIVATE)."""
        try:
            return self.location.ref
        except AttributeError:
            return None

    def _set_ref(self, value):
        """Set function for the reference property (PRIVATE)."""
        try:
            self.location.ref = value
        except AttributeError:
            if self.location is None:
                # Setting None on a missing location is a harmless no-op.
                if value is not None:
                    raise ValueError("Can't set ref without a location.") from None
            else:
                raise

    ref = property(
        fget=_get_ref,
        fset=_set_ref,
        doc="""Feature location reference (e.g. accession).

                       This is a shortcut for feature.location.ref
                       """,
    )

    def _get_ref_db(self):
        """Get function for the database reference property (PRIVATE)."""
        try:
            return self.location.ref_db
        except AttributeError:
            return None

    def _set_ref_db(self, value):
        """Set function for the database reference property (PRIVATE).

        Behaves like the strand and ref setters: assigning None when the
        feature has no location is silently accepted, while assigning any
        other value without a location raises ValueError.
        """
        try:
            self.location.ref_db = value
        except AttributeError:
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set ref_db without a location.") from None
            else:
                raise

    ref_db = property(
        fget=_get_ref_db,
        fset=_set_ref_db,
        doc="""Feature location reference's database.

                          This is a shortcut for feature.location.ref_db
                          """,
    )

    def _get_location_operator(self):
        """Get function for the location operator property (PRIVATE)."""
        try:
            return self.location.operator
        except AttributeError:
            return None

    def _set_location_operator(self, value):
        """Set function for the location operator property (PRIVATE)."""
        # Falsy values (None, "") are ignored rather than stored.
        if value:
            if isinstance(self.location, CompoundLocation):
                self.location.operator = value
            elif self.location is None:
                raise ValueError(
                    f"Location is None so can't set its operator (to {value!r})"
                )
            else:
                raise ValueError(f"Only CompoundLocation gets an operator ({value!r})")

    location_operator = property(
        fget=_get_location_operator,
        fset=_set_location_operator,
        doc="Location operator for compound locations (e.g. join).",
    )

    def __eq__(self, other):
        """Check if two SeqFeature objects should be considered equal."""
        return (
            isinstance(other, SeqFeature)
            and self.id == other.id
            and self.type == other.type
            and self.location == other.location
            and self.qualifiers == other.qualifiers
        )

    def __repr__(self):
        """Represent the feature as a string for debugging."""
        answer = f"{self.__class__.__name__}({self.location!r}"
        if self.type:
            answer += f", type={self.type!r}"
        if self.location_operator:
            answer += f", location_operator={self.location_operator!r}"
        if self.id and self.id != "<unknown id>":
            answer += f", id={self.id!r}"
        if self.qualifiers:
            # Qualifier contents are elided to keep the repr short.
            answer += ", qualifiers=..."
        if self.ref:
            answer += f", ref={self.ref!r}"
        if self.ref_db:
            answer += f", ref_db={self.ref_db!r}"
        answer += ")"
        return answer

    def __str__(self):
        """Return the full feature as a python string."""
        lines = [f"type: {self.type}\n", f"location: {self.location}\n"]
        if self.id and self.id != "<unknown id>":
            lines.append(f"id: {self.id}\n")
        lines.append("qualifiers:\n")
        # Sorted keys give a stable, reproducible listing.
        lines.extend(
            f" Key: {qual_key}, Value: {self.qualifiers[qual_key]}\n"
            for qual_key in sorted(self.qualifiers)
        )
        return "".join(lines)

    def _shift(self, offset):
        """Return a copy of the feature with its location shifted (PRIVATE).

        The annotation qualifiers are copied.
        """
        return SeqFeature(
            location=self.location._shift(offset),
            type=self.type,
            id=self.id,
            qualifiers=self.qualifiers.copy(),
        )

    def _flip(self, length):
        """Return a copy of the feature with its location flipped (PRIVATE).

        The argument length gives the length of the parent sequence. For
        example a location 0..20 (+1 strand) with parent length 30 becomes
        after flipping 10..30 (-1 strand). Strandless (None) or unknown
        strand (0) remain like that - just their end points are changed.

        The annotation qualifiers are copied.
        """
        return SeqFeature(
            location=self.location._flip(length),
            type=self.type,
            id=self.id,
            qualifiers=self.qualifiers.copy(),
        )

    def extract(self, parent_sequence, references=None):
        """Extract the feature's sequence from supplied parent sequence.

        The parent_sequence can be a Seq like object or a string, and will
        generally return an object of the same type. The exception to this is
        a MutableSeq as the parent sequence will return a Seq object.

        This should cope with complex locations including complements, joins
        and fuzzy positions. Even mixed strand features should work! This
        also covers features on protein sequences (e.g. domains), although
        here reverse strand features are not permitted. If the
        location refers to other records, they must be supplied in the
        optional dictionary references.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
        >>> f = SeqFeature(SimpleLocation(8, 15), type="domain")
        >>> f.extract(seq)
        Seq('VALIVIC')

        If the SimpleLocation is None, e.g. when parsing invalid locus
        locations in the GenBank parser, extract() will raise a ValueError.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SeqFeature
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
        >>> f = SeqFeature(None, type="domain")
        >>> f.extract(seq)
        Traceback (most recent call last):
           ...
        ValueError: The feature's .location is None. Check the sequence file for a valid location.

        Note - currently only compound features of type "join" are supported.
        """
        if self.location is None:
            raise ValueError(
                "The feature's .location is None. Check the "
                "sequence file for a valid location."
            )
        return self.location.extract(parent_sequence, references=references)

    def translate(
        self,
        parent_sequence,
        table="Standard",
        start_offset=None,
        stop_symbol="*",
        to_stop=False,
        cds=None,
        gap=None,
    ):
        """Get a translation of the feature's sequence.

        This method is intended for CDS or other features that code proteins
        and is a shortcut that will both extract the feature and
        translate it, taking into account the codon_start and transl_table
        qualifiers, if they are present. If they are not present the
        value of the arguments "table" and "start_offset" are used.

        The "cds" parameter is set to "True" if the feature is of type
        "CDS" but can be overridden by giving an explicit argument.

        The arguments stop_symbol, to_stop and gap have the same meaning
        as Seq.translate, refer to that documentation for further information.

        Arguments:
         - parent_sequence - A DNA or RNA sequence.
         - table - Which codon table to use if there is no transl_table
           qualifier for this feature. This can be either a name
           (string), an NCBI identifier (integer), or a CodonTable
           object (useful for non-standard genetic codes).  This
           defaults to the "Standard" table.
         - start_offset - offset at which the first complete codon of a
           coding feature can be found, relative to the first base of
           that feature. Has a valid value of 0, 1 or 2. NOTE: this
           uses python's 0-based numbering whereas the codon_start
           qualifier in files from NCBI use 1-based numbering.
           Will override a codon_start qualifier

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> seq = Seq("GGTTACACTTACCGATAATGTCTCTGATGA")
        >>> f = SeqFeature(SimpleLocation(0, 30), type="CDS")
        >>> f.qualifiers['transl_table'] = [11]

        Note that features of type CDS are subject to the usual
        checks at translation. But you can override this behavior
        by giving explicit arguments:

        >>> f.translate(seq, cds=False)
        Seq('GYTYR*CL**')

        Now use the start_offset argument to change the frame. Note
        this uses python 0-based numbering.

        >>> f.translate(seq, start_offset=1, cds=False)
        Seq('VTLTDNVSD')

        Alternatively use the codon_start qualifier to do the same
        thing. Note: this uses 1-based numbering, which is found
        in files from NCBI.

        >>> f.qualifiers['codon_start'] = [2]
        >>> f.translate(seq, cds=False)
        Seq('VTLTDNVSD')
        """
        # see if this feature should be translated in a different
        # frame using the "codon_start" qualifier
        if start_offset is None:
            try:
                # codon_start is 1-based, start_offset is 0-based.
                start_offset = int(self.qualifiers["codon_start"][0]) - 1
            except KeyError:
                start_offset = 0

        if start_offset not in [0, 1, 2]:
            raise ValueError(
                "The start_offset must be 0, 1, or 2. "
                f"The supplied value is {start_offset}. "
                "Check the value of either the codon_start qualifier "
                "or the start_offset argument"
            )

        feat_seq = self.extract(parent_sequence)[start_offset:]
        # A transl_table qualifier takes precedence over the table argument.
        codon_table = self.qualifiers.get("transl_table", [table])[0]

        if cds is None:
            cds = self.type == "CDS"

        return feat_seq.translate(
            table=codon_table,
            stop_symbol=stop_symbol,
            to_stop=to_stop,
            cds=cds,
            gap=gap,
        )

    def __bool__(self):
        """Boolean value of an instance of this class (True).

        This behavior is for backwards compatibility, since until the
        __len__ method was added, a SeqFeature always evaluated as True.

        Note that in comparison, Seq objects, strings, lists, etc, will all
        evaluate to False if they have length zero.

        WARNING: The SeqFeature may in future evaluate to False when its
        length is zero (in order to better match normal python behavior)!
        """
        return True

    def __len__(self):
        """Return the length of the region where the feature is located.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
        >>> f = SeqFeature(SimpleLocation(8, 15), type="domain")
        >>> len(f)
        7
        >>> f.extract(seq)
        Seq('VALIVIC')
        >>> len(f.extract(seq))
        7

        This is a proxy for taking the length of the feature's location:

        >>> len(f.location)
        7

        For simple features this is the same as the region spanned (end
        position minus start position using Pythonic counting). However, for
        a compound location (e.g. a CDS as the join of several exons) the
        gaps are not counted (e.g. introns). This ensures that len(f) matches
        len(f.extract(parent_seq)), and also makes sure things work properly
        with features wrapping the origin etc.
        """
        return len(self.location)

    def __iter__(self):
        """Iterate over the parent positions within the feature.

        The iteration order is strand aware, and can be thought of as moving
        along the feature using the parent sequence coordinates:

        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> f = SeqFeature(SimpleLocation(5, 10, strand=-1), type="domain")
        >>> len(f)
        5
        >>> for i in f: print(i)
        9
        8
        7
        6
        5
        >>> list(f)
        [9, 8, 7, 6, 5]

        This is a proxy for iterating over the location,

        >>> list(f.location)
        [9, 8, 7, 6, 5]
        """
        return iter(self.location)

    def __contains__(self, value):
        """Check if an integer position is within the feature.

        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> f = SeqFeature(SimpleLocation(5, 10, strand=-1), type="domain")
        >>> len(f)
        5
        >>> [i for i in range(15) if i in f]
        [5, 6, 7, 8, 9]

        For example, to see which features include a SNP position, you could
        use this:

        >>> from Bio import SeqIO
        >>> record = SeqIO.read("GenBank/NC_000932.gb", "gb")
        >>> for f in record.features:
        ...     if 1750 in f:
        ...         print("%s %s" % (f.type, f.location))
        source [0:154478](+)
        gene [1716:4347](-)
        tRNA join{[4310:4347](-), [1716:1751](-)}

        Note that for a feature defined as a join of several subfeatures (e.g.
        the union of several exons) the gaps are not checked (e.g. introns).
        In this example, the tRNA location is defined in the GenBank file as
        complement(join(1717..1751,4311..4347)), so that position 1760 falls
        in the gap:

        >>> for f in record.features:
        ...     if 1760 in f:
        ...         print("%s %s" % (f.type, f.location))
        source [0:154478](+)
        gene [1716:4347](-)

        Note that additional care may be required with fuzzy locations, for
        example just before a BeforePosition:

        >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
        >>> from Bio.SeqFeature import BeforePosition
        >>> f = SeqFeature(SimpleLocation(BeforePosition(3), 8), type="domain")
        >>> len(f)
        5
        >>> [i for i in range(10) if i in f]
        [3, 4, 5, 6, 7]

        Note that this is a proxy for testing membership on the location.

        >>> [i for i in range(10) if i in f.location]
        [3, 4, 5, 6, 7]
        """
        return value in self.location
| # --- References | |
| # TODO -- Will this hold PubMed and Medline information decently? | |
class Reference:
    """Represent a Generic Reference object.

    Attributes:
     - location - A list of Location objects specifying regions of
       the sequence that the references correspond to. If no locations are
       specified, the entire sequence is assumed.
     - authors - A big old string, or a list split by author, of authors
       for the reference.
     - consrtm - Initialized to an empty string and printed by __str__ only
       when non-empty (presumably the consortium name - confirm with the
       parsers that populate it).
     - title - The title of the reference.
     - journal - Journal the reference was published in.
     - medline_id - A medline reference for the article.
     - pubmed_id - A pubmed reference for the article.
     - comment - A place to stick any comments about the reference.

    """

    def __init__(self):
        """Initialize the class."""
        self.location = []
        self.authors = ""
        self.consrtm = ""
        self.title = ""
        self.journal = ""
        self.medline_id = ""
        self.pubmed_id = ""
        self.comment = ""

    def __str__(self):
        """Return the full Reference object as a python string."""
        lines = [f"location: {single_location}\n" for single_location in self.location]
        lines.append(f"authors: {self.authors}\n")
        # The consortium line is omitted entirely when it is empty.
        if self.consrtm:
            lines.append(f"consrtm: {self.consrtm}\n")
        lines.append(f"title: {self.title}\n")
        lines.append(f"journal: {self.journal}\n")
        lines.append(f"medline id: {self.medline_id}\n")
        lines.append(f"pubmed id: {self.pubmed_id}\n")
        lines.append(f"comment: {self.comment}\n")
        return "".join(lines)

    def __repr__(self):
        """Represent the Reference object as a string for debugging."""
        # TODO - Update this if __init__ later accepts values
        return f"{self.__class__.__name__}(title={self.title!r}, ...)"

    def __eq__(self, other):
        """Check if two Reference objects should be considered equal.

        Note prior to Biopython 1.70 the location was not compared, as
        until then __eq__ for the SimpleLocation class was not defined.

        Comparing against an object that is not a Reference returns
        NotImplemented, so ``==`` evaluates to False instead of raising
        AttributeError.
        """
        if not isinstance(other, Reference):
            return NotImplemented
        return (
            self.authors == other.authors
            and self.consrtm == other.consrtm
            and self.title == other.title
            and self.journal == other.journal
            and self.medline_id == other.medline_id
            and self.pubmed_id == other.pubmed_id
            and self.comment == other.comment
            and self.location == other.location
        )
| # --- Handling feature locations | |
class Location(ABC):
    """Abstract base class representing a location."""

    def __repr__(self):
        """Represent the Location object as a string for debugging."""
        return f"{self.__class__.__name__}(...)"

    @staticmethod
    def fromstring(text, length=None, circular=False, stranded=True):
        """Create a Location object from a string.

        This should accept any valid location string in the INSDC Feature Table
        format (https://www.insdc.org/submitting-standards/feature-table/) as
        used in GenBank, DDBJ and EMBL files.

        Arguments:
         - text - the location string to parse.
         - length - passed on to SimpleLocation.fromstring; the expected
           sequence length (see the circular between-position example below).
         - circular - passed on to SimpleLocation.fromstring.
         - stranded - if true, a location not wrapped in complement(...) gets
           strand +1, otherwise strand None.

        Returns the object produced by SimpleLocation.fromstring for a simple
        location (or a one-part operator), otherwise a CompoundLocation.
        Raises ValueError for an unterminated complement(...), and
        LocationParserError (a ValueError subclass) for other failures
        detected here.

        Simple examples:

        >>> Location.fromstring("123..456", 1000)
        SimpleLocation(ExactPosition(122), ExactPosition(456), strand=1)
        >>> Location.fromstring("complement(<123..>456)", 1000)
        SimpleLocation(BeforePosition(122), AfterPosition(456), strand=-1)

        A more complex location using within positions,

        >>> Location.fromstring("(9.10)..(20.25)", 1000)
        SimpleLocation(WithinPosition(8, left=8, right=9), WithinPosition(25, left=20, right=25), strand=1)

        Notice how that will act as though it has overall start 8 and end 25.

        Zero length between feature,

        >>> Location.fromstring("123^124", 1000)
        SimpleLocation(ExactPosition(123), ExactPosition(123), strand=1)

        The expected sequence length is needed for a special case, a between
        position at the start/end of a circular genome:

        >>> Location.fromstring("1000^1", 1000)
        SimpleLocation(ExactPosition(1000), ExactPosition(1000), strand=1)

        Apart from this special case, between positions P^Q must have P+1==Q,

        >>> Location.fromstring("123^456", 1000)
        Traceback (most recent call last):
           ...
        Bio.SeqFeature.LocationParserError: invalid feature location '123^456'

        You can optionally provide a reference name:

        >>> Location.fromstring("AL391218.9:105173..108462", 2000000)
        SimpleLocation(ExactPosition(105172), ExactPosition(108462), strand=1, ref='AL391218.9')

        >>> Location.fromstring("<2644..159", 2868, "circular")
        CompoundLocation([SimpleLocation(BeforePosition(2643), ExactPosition(2868), strand=1), SimpleLocation(ExactPosition(0), ExactPosition(159), strand=1)], 'join')
        """
        if text.startswith("complement("):
            if text[-1] != ")":
                raise ValueError(f"closing bracket missing in '{text}'")
            # Strip the "complement(" prefix (11 characters) and the ")".
            text = text[11:-1]
            strand = -1
        elif stranded:
            strand = 1
        else:
            strand = None

        # Determine if we have a simple location or a compound location
        if text.startswith("join("):
            operator = "join"
            parts = _split(text[5:-1])[1::2]
        elif text.startswith("order("):
            operator = "order"
            parts = _split(text[6:-1])[1::2]
        elif text.startswith("bond("):
            operator = "bond"
            parts = _split(text[5:-1])[1::2]
        else:
            loc = SimpleLocation.fromstring(text, length, circular)
            loc.strand = strand
            if strand == -1:
                loc.parts.reverse()
            return loc

        locs = []
        for part in parts:
            loc = SimpleLocation.fromstring(part, length, circular)
            if loc is None:
                # Unparseable part: leave the loop without running its
                # ``else`` clause and fall through to the error handling.
                break
            if loc.strand == -1:
                if strand == -1:
                    raise LocationParserError(f"double complement in '{text}'?")
            else:
                # Parts without their own complement() take the outer strand.
                loc.strand = strand
            locs.extend(loc.parts)
        else:
            if len(locs) == 1:
                return loc
            # Historically a join on the reverse strand has been represented
            # in Biopython with both the parent SeqFeature and its children
            # (the exons for a CDS) all given a strand of -1.  Likewise, for
            # a join feature on the forward strand they all have strand +1.
            # However, we must also consider evil mixed strand examples like
            # this, join(complement(69611..69724),139856..140087,140625..140650)
            if strand == -1:
                # Whole thing was wrapped in complement(...)
                for loc in locs:
                    assert loc.strand == -1
                # Reverse the backwards order used in GenBank files
                # with complement(join(...))
                locs = locs[::-1]
            return CompoundLocation(locs, operator=operator)

        # Not recognized
        if "order" in text and "join" in text:
            # See Bug 3197
            raise LocationParserError(
                f"failed to parse feature location '{text}' containing a combination of 'join' and 'order' (nested operators) are illegal"
            )

        # See issue #937. Note that NCBI has already fixed this record.
        if ",)" in text:
            warnings.warn(
                "Dropping trailing comma in malformed feature location",
                BiopythonParserWarning,
            )
            text = text.replace(",)", ")")
            if strand == -1:
                # The complement() wrapper was stripped above; restore it so
                # the retry still yields a reverse-strand location.
                text = f"complement({text})"
            # Retry with the caller's original parsing context.
            return Location.fromstring(text, length, circular, stranded)

        raise LocationParserError(f"failed to parse feature location '{text}'")
class SimpleLocation(Location):
    """Specify the location of a feature along a sequence.

    The SimpleLocation is used for simple continuous features, which can
    be described as running from a start position to an end position
    (optionally with a strand and reference information). More complex
    locations made up from several non-continuous parts (e.g. a coding
    sequence made up of several exons) are described using a SeqFeature
    with a CompoundLocation.

    Note that the start and end location numbering follow Python's scheme,
    thus a GenBank entry of 123..150 (one based counting) becomes a location
    of [122:150] (zero based counting).

    >>> from Bio.SeqFeature import SimpleLocation
    >>> f = SimpleLocation(122, 150)
    >>> print(f)
    [122:150]
    >>> print(f.start)
    122
    >>> print(f.end)
    150
    >>> print(f.strand)
    None

    Note the strand defaults to None. If you are working with nucleotide
    sequences you'd want to be explicit if it is the forward strand:

    >>> from Bio.SeqFeature import SimpleLocation
    >>> f = SimpleLocation(122, 150, strand=+1)
    >>> print(f)
    [122:150](+)
    >>> print(f.strand)
    1

    Note that for a parent sequence of length n, the SimpleLocation
    start and end must satisfy the inequality 0 <= start <= end <= n.
    This means even for features on the reverse strand of a nucleotide
    sequence, we expect the 'start' coordinate to be less than the
    'end'.

    >>> from Bio.SeqFeature import SimpleLocation
    >>> r = SimpleLocation(122, 150, strand=-1)
    >>> print(r)
    [122:150](-)
    >>> print(r.start)
    122
    >>> print(r.end)
    150
    >>> print(r.strand)
    -1

    i.e. Rather than thinking of the 'start' and 'end' biologically in a
    strand aware manner, think of them as the 'left most' or 'minimum'
    boundary, and the 'right most' or 'maximum' boundary of the region
    being described. This is particularly important with compound
    locations describing non-continuous regions.

    In the example above we have used standard exact positions, but there
    are also specialised position objects used to represent fuzzy positions
    as well, for example a GenBank location like complement(<123..150)
    would use a BeforePosition object for the start.
    """

    def __init__(self, start, end, strand=None, ref=None, ref_db=None):
        """Initialize the class.

        start and end arguments specify the values where the feature begins
        and ends. These can either be any of the ``*Position`` objects that
        inherit from Position, or can just be integers specifying the position.
        In the case of integers, the values are assumed to be exact and are
        converted in ExactPosition arguments. This is meant to make it easy
        to deal with non-fuzzy ends.

        i.e. Short form:

        >>> from Bio.SeqFeature import SimpleLocation
        >>> loc = SimpleLocation(5, 10, strand=-1)
        >>> print(loc)
        [5:10](-)

        Explicit form:

        >>> from Bio.SeqFeature import SimpleLocation, ExactPosition
        >>> loc = SimpleLocation(ExactPosition(5), ExactPosition(10), strand=-1)
        >>> print(loc)
        [5:10](-)

        Other fuzzy positions are used similarly,

        >>> from Bio.SeqFeature import SimpleLocation
        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
        >>> loc2 = SimpleLocation(BeforePosition(5), AfterPosition(10), strand=-1)
        >>> print(loc2)
        [<5:>10](-)

        For nucleotide features you will also want to specify the strand,
        use 1 for the forward (plus) strand, -1 for the reverse (negative)
        strand, 0 for stranded but strand unknown (? in GFF3), or None for
        when the strand does not apply (dot in GFF3), e.g. features on
        proteins.

        >>> loc = SimpleLocation(5, 10, strand=+1)
        >>> print(loc)
        [5:10](+)
        >>> print(loc.strand)
        1

        Normally feature locations are given relative to the parent
        sequence you are working with, but an explicit accession can
        be given with the optional ref and ref_db strings:

        >>> loc = SimpleLocation(105172, 108462, ref="AL391218.9", strand=1)
        >>> print(loc)
        AL391218.9[105172:108462](+)
        >>> print(loc.ref)
        AL391218.9

        Raises TypeError if start or end is neither a Position nor an int,
        and ValueError if both are integer-like and start is after end, or
        if the strand is not one of +1, -1, 0 or None.
        """
        # TODO - Check 0 <= start <= end (<= length of reference)
        if isinstance(start, Position):
            self._start = start
        elif isinstance(start, int):
            self._start = ExactPosition(start)
        else:
            raise TypeError(f"start={start!r} {type(start)}")
        if isinstance(end, Position):
            self._end = end
        elif isinstance(end, int):
            self._end = ExactPosition(end)
        else:
            raise TypeError(f"end={end!r} {type(end)}")
        # Only compare when both ends are integer-like; position objects which
        # are not int instances are skipped by the isinstance checks.
        if (
            isinstance(self._start, int)
            and isinstance(self._end, int)
            and self._start > self._end
        ):
            raise ValueError(
                f"End location ({self._end}) must be greater than "
                f"or equal to start location ({self._start})"
            )
        self.strand = strand  # goes via the property setter, which validates
        self.ref = ref
        self.ref_db = ref_db

    @staticmethod
    def fromstring(text, length=None, circular=False):
        """Create a SimpleLocation object from a string.

        The text is a GenBank/INSDC style location such as "123..150" or
        "complement(<123..150)", optionally prefixed with "reference:".

        The length argument is the parent sequence length; it is used for a
        between location N^1 at the origin, and as the end of the first part
        when a location wraps the origin. The circular argument states that
        the parent sequence is circular, which is required for accepting a
        location whose start is after its end.

        Returns None if the text is not recognised as a simple location.
        For a location wrapping the origin of a circular sequence, a two
        part CompoundLocation is returned instead (with a warning).

        Raises LocationParserError for an origin spanning location on a
        sequence not flagged as circular, for an invalid between location,
        or for a negative start position.
        """
        if text.startswith("complement("):
            # Strip the wrapper: len("complement(") == 11, plus closing ")".
            text = text[11:-1]
            strand = -1
        else:
            strand = None

        # Try simple cases first for speed
        try:
            s, e = text.split("..")
            s = int(s) - 1  # one based inclusive start -> zero based
            e = int(e)
        except ValueError:
            pass
        else:
            if 0 <= s <= e:
                return SimpleLocation(s, e, strand)

        # Try general case
        try:
            ref, text = text.split(":")
        except ValueError:
            ref = None
        m = _re_location_category.match(text)
        if m is None:
            return None
        # Find which named category of the regular expression matched.
        for key, value in m.groupdict().items():
            if value is not None:
                break
        assert value == text
        if key == "bond":
            # e.g. bond(196)
            warnings.warn(
                "Dropping bond qualifier in feature location",
                BiopythonParserWarning,
            )
            text = text[5:-1]
            s_pos = Position.fromstring(text, -1)
            e_pos = Position.fromstring(text)
        elif key == "solo":
            # e.g. "123"
            s_pos = Position.fromstring(text, -1)
            e_pos = Position.fromstring(text)
        elif key in ("pair", "within", "oneof"):
            s, e = text.split("..")
            # Attempt to fix features that span the origin
            s_pos = Position.fromstring(s, -1)
            e_pos = Position.fromstring(e)
            if s_pos > e_pos:
                # There is likely a problem with origin wrapping.
                # Create a CompoundLocation of the wrapped feature,
                # consisting of two SimpleLocation objects to extend to
                # the list of feature locations.
                if not circular:
                    raise LocationParserError(
                        f"it appears that '{text}' is a feature that spans the origin, but the sequence topology is undefined"
                    )
                warnings.warn(
                    "Attempting to fix invalid location %r as "
                    "it looks like incorrect origin wrapping. "
                    "Please fix input file, this could have "
                    "unintended behavior." % text,
                    BiopythonParserWarning,
                )
                f1 = SimpleLocation(s_pos, length, strand)
                f2 = SimpleLocation(0, e_pos, strand)
                if strand == -1:
                    # For complementary features spanning the origin
                    return f2 + f1
                else:
                    return f1 + f2
        elif key == "between":
            # A between location like "67^68" (one based counting) is a
            # special case (note it has zero length). In python slice
            # notation this is 67:67, a zero length slice. See Bug 2622
            # Further more, on a circular genome of length N you can have
            # a location N^1 meaning the junction at the origin. See Bug 3098.
            # NOTE - We can imagine between locations like "2^4", but this
            # is just "3". Similarly, "2^5" is just "3..4"
            s, e = text.split("^")
            s = int(s)
            e = int(e)
            if s + 1 == e or (s == length and e == 1):
                s_pos = ExactPosition(s)
                e_pos = s_pos
            else:
                raise LocationParserError(f"invalid feature location '{text}'")
        if s_pos < 0:
            raise LocationParserError(
                f"negative starting position in feature location '{text}'"
            )
        return SimpleLocation(s_pos, e_pos, strand, ref=ref)

    def _get_strand(self):
        """Get function for the strand property (PRIVATE)."""
        return self._strand

    def _set_strand(self, value):
        """Set function for the strand property (PRIVATE)."""
        if value not in [+1, -1, 0, None]:
            raise ValueError(f"Strand should be +1, -1, 0 or None, not {value!r}")
        self._strand = value

    strand = property(
        fget=_get_strand,
        fset=_set_strand,
        doc="Strand of the location (+1, -1, 0 or None).",
    )

    def __str__(self):
        """Return a representation of the SimpleLocation object (with python counting).

        For the simple case this uses the python splicing syntax, [122:150]
        (zero based counting) which GenBank would call 123..150 (one based
        counting).
        """
        answer = f"[{self._start}:{self._end}]"
        if self.ref and self.ref_db:
            answer = f"{self.ref_db}:{self.ref}{answer}"
        elif self.ref:
            answer = self.ref + answer
        # Is ref_db without ref meaningful?
        if self.strand is None:
            return answer
        elif self.strand == +1:
            return answer + "(+)"
        elif self.strand == -1:
            return answer + "(-)"
        else:
            # strand = 0, stranded but strand unknown, ? in GFF3
            return answer + "(?)"

    def __repr__(self):
        """Represent the SimpleLocation object as a string for debugging."""
        optional = ""
        if self.strand is not None:
            optional += f", strand={self.strand!r}"
        if self.ref is not None:
            optional += f", ref={self.ref!r}"
        if self.ref_db is not None:
            optional += f", ref_db={self.ref_db!r}"
        return f"{self.__class__.__name__}({self.start!r}, {self.end!r}{optional})"

    def __add__(self, other):
        """Combine location with another SimpleLocation object, or shift it.

        You can add two feature locations to make a join CompoundLocation:

        >>> from Bio.SeqFeature import SimpleLocation
        >>> f1 = SimpleLocation(5, 10)
        >>> f2 = SimpleLocation(20, 30)
        >>> combined = f1 + f2
        >>> print(combined)
        join{[5:10], [20:30]}

        This is thus equivalent to:

        >>> from Bio.SeqFeature import CompoundLocation
        >>> join = CompoundLocation([f1, f2])
        >>> print(join)
        join{[5:10], [20:30]}

        You can also use sum(...) in this way:

        >>> join = sum([f1, f2])
        >>> print(join)
        join{[5:10], [20:30]}

        Furthermore, you can combine a SimpleLocation with a CompoundLocation
        in this way.

        Separately, adding an integer will give a new SimpleLocation with
        its start and end offset by that amount. For example:

        >>> print(f1)
        [5:10]
        >>> print(f1 + 100)
        [105:110]
        >>> print(200 + f1)
        [205:210]

        This can be useful when editing annotation.
        """
        if isinstance(other, SimpleLocation):
            return CompoundLocation([self, other])
        elif isinstance(other, int):
            return self._shift(other)
        else:
            # This will allow CompoundLocation's __radd__ to be called:
            return NotImplemented

    def __radd__(self, other):
        """Return a SimpleLocation object by shifting the location by an integer amount."""
        if isinstance(other, int):
            return self._shift(other)
        else:
            return NotImplemented

    def __sub__(self, other):
        """Subtracting an integer will shift the start and end by that amount.

        >>> from Bio.SeqFeature import SimpleLocation
        >>> f1 = SimpleLocation(105, 150)
        >>> print(f1)
        [105:150]
        >>> print(f1 - 100)
        [5:50]

        This can be useful when editing annotation. You can also add an integer
        to a feature location (which shifts in the opposite direction).
        """
        if isinstance(other, int):
            return self._shift(-other)
        else:
            return NotImplemented

    def __bool__(self):
        """Return True regardless of the length of the feature.

        This behavior is for backwards compatibility, since until the
        __len__ method was added, a SimpleLocation always evaluated as True.

        Note that in comparison, Seq objects, strings, lists, etc, will all
        evaluate to False if they have length zero.

        WARNING: The SimpleLocation may in future evaluate to False when its
        length is zero (in order to better match normal python behavior)!
        """
        return True

    # Python 3 consults __bool__ (then __len__) for truth testing; the old
    # Python 2 name is kept as an alias for any code calling it explicitly.
    __nonzero__ = __bool__

    def __len__(self):
        """Return the length of the region described by the SimpleLocation object.

        Note that extra care may be needed for fuzzy locations, e.g.

        >>> from Bio.SeqFeature import SimpleLocation
        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
        >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10))
        >>> len(loc)
        5
        """
        return int(self._end) - int(self._start)

    def __contains__(self, value):
        """Check if an integer position is within the SimpleLocation object.

        Note that extra care may be needed for fuzzy locations, e.g.

        >>> from Bio.SeqFeature import SimpleLocation
        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
        >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10))
        >>> len(loc)
        5
        >>> [i for i in range(15) if i in loc]
        [5, 6, 7, 8, 9]

        Raises ValueError if value is not an integer.
        """
        if not isinstance(value, int):
            raise ValueError(
                "Currently we only support checking for integer "
                "positions being within a SimpleLocation."
            )
        # Half open interval, start <= value < end
        return not (value < self._start or value >= self._end)

    def __iter__(self):
        """Iterate over the parent positions within the SimpleLocation object.

        >>> from Bio.SeqFeature import SimpleLocation
        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
        >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10))
        >>> len(loc)
        5
        >>> for i in loc: print(i)
        5
        6
        7
        8
        9
        >>> list(loc)
        [5, 6, 7, 8, 9]
        >>> [i for i in range(15) if i in loc]
        [5, 6, 7, 8, 9]

        Note this is strand aware:

        >>> loc = SimpleLocation(BeforePosition(5), AfterPosition(10), strand = -1)
        >>> list(loc)
        [9, 8, 7, 6, 5]
        """
        if self.strand == -1:
            yield from range(self._end - 1, self._start - 1, -1)
        else:
            yield from range(self._start, self._end)

    def __eq__(self, other):
        """Implement equality by comparing all the location attributes."""
        if not isinstance(other, SimpleLocation):
            return False
        return (
            self._start == other.start
            and self._end == other.end
            and self._strand == other.strand
            and self.ref == other.ref
            and self.ref_db == other.ref_db
        )

    def _shift(self, offset):
        """Return a copy of the SimpleLocation shifted by an offset (PRIVATE).

        Returns self when location is relative to an external reference.
        """
        # TODO - What if offset is a fuzzy position?
        if self.ref or self.ref_db:
            return self
        return SimpleLocation(
            start=self._start + offset,
            end=self._end + offset,
            strand=self.strand,
        )

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE).

        Returns self when location is relative to an external reference.
        """
        if self.ref or self.ref_db:
            return self
        # Note this will flip the start and end too!
        if self.strand == +1:
            flip_strand = -1
        elif self.strand == -1:
            flip_strand = +1
        else:
            # 0 or None
            flip_strand = self.strand
        return SimpleLocation(
            start=self._end._flip(length),
            end=self._start._flip(length),
            strand=flip_strand,
        )

    @property
    def parts(self):
        """Read only list of sections (always one, the SimpleLocation object).

        This is a convenience property allowing you to write code handling
        both SimpleLocation objects (with one part) and more complex
        CompoundLocation objects (with multiple parts) interchangeably.
        """
        return [self]

    @property
    def start(self):
        """Start location - left most (minimum) value, regardless of strand.

        Read only, returns an integer like position object, possibly a fuzzy
        position.
        """
        return self._start

    @property
    def end(self):
        """End location - right most (maximum) value, regardless of strand.

        Read only, returns an integer like position object, possibly a fuzzy
        position.
        """
        return self._end

    @property
    def nofuzzy_start(self):
        """Start position (integer, approximated if fuzzy, read only) (DEPRECATED).

        This is now an alias for int(feature.start), which should be
        used in preference -- unless you are trying to support old
        versions of Biopython.
        """
        warnings.warn(
            "Use int(feature.start) rather than feature.nofuzzy_start",
            BiopythonDeprecationWarning,
        )
        try:
            return int(self._start)
        except TypeError:
            # An unknown position has no integer value; report it as None.
            if isinstance(self._start, UnknownPosition):
                return None
            raise

    @property
    def nofuzzy_end(self):
        """End position (integer, approximated if fuzzy, read only) (DEPRECATED).

        This is now an alias for int(feature.end), which should be
        used in preference -- unless you are trying to support old
        versions of Biopython.
        """
        warnings.warn(
            "Use int(feature.end) rather than feature.nofuzzy_end",
            BiopythonDeprecationWarning,
        )
        try:
            return int(self._end)
        except TypeError:
            # An unknown position has no integer value; report it as None.
            if isinstance(self._end, UnknownPosition):
                return None
            raise

    def extract(self, parent_sequence, references=None):
        """Extract the sequence from supplied parent sequence using the SimpleLocation object.

        The parent_sequence can be a Seq like object or a string, and will
        generally return an object of the same type. The exception to this is
        a MutableSeq as the parent sequence will return a Seq object.
        If the location refers to other records, they must be supplied
        in the optional dictionary references.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SimpleLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
        >>> feature_loc = SimpleLocation(8, 15)
        >>> feature_loc.extract(seq)
        Seq('VALIVIC')

        Raises ValueError if the location has a reference but no references
        were given, or the reference is not among them.
        """
        if self.ref or self.ref_db:
            if not references:
                raise ValueError(
                    f"Feature references another sequence ({self.ref}),"
                    " references mandatory"
                )
            elif self.ref not in references:
                # KeyError?
                raise ValueError(
                    f"Feature references another sequence ({self.ref}),"
                    " not found in references"
                )
            parent_sequence = references[self.ref]
        f_seq = parent_sequence[int(self.start) : int(self.end)]
        if isinstance(f_seq, MutableSeq):
            f_seq = Seq(f_seq)
        if self.strand == -1:
            f_seq = reverse_complement(
                f_seq, inplace=False
            )  # TODO: remove inplace=False
        return f_seq
FeatureLocation = SimpleLocation  # OBSOLETE; for backward compatibility only.
| class CompoundLocation(Location): | |
| """For handling joins etc where a feature location has several parts.""" | |
| def __init__(self, parts, operator="join"): | |
| """Initialize the class. | |
| >>> from Bio.SeqFeature import SimpleLocation, CompoundLocation | |
| >>> f1 = SimpleLocation(10, 40, strand=+1) | |
| >>> f2 = SimpleLocation(50, 59, strand=+1) | |
| >>> f = CompoundLocation([f1, f2]) | |
| >>> len(f) == len(f1) + len(f2) == 39 == len(list(f)) | |
| True | |
| >>> print(f.operator) | |
| join | |
| >>> 5 in f | |
| False | |
| >>> 15 in f | |
| True | |
| >>> f.strand | |
| 1 | |
| Notice that the strand of the compound location is computed | |
| automatically - in the case of mixed strands on the sub-locations | |
| the overall strand is set to None. | |
| >>> f = CompoundLocation([SimpleLocation(3, 6, strand=+1), | |
| ... SimpleLocation(10, 13, strand=-1)]) | |
| >>> print(f.strand) | |
| None | |
| >>> len(f) | |
| 6 | |
| >>> list(f) | |
| [3, 4, 5, 12, 11, 10] | |
| The example above doing list(f) iterates over the coordinates within the | |
| feature. This allows you to use max and min on the location, to find the | |
| range covered: | |
| >>> min(f) | |
| 3 | |
| >>> max(f) | |
| 12 | |
| More generally, you can use the compound location's start and end which | |
| give the full span covered, 0 <= start <= end <= full sequence length. | |
| >>> f.start == min(f) | |
| True | |
| >>> f.end == max(f) + 1 | |
| True | |
| This is consistent with the behavior of the SimpleLocation for a single | |
| region, where again the 'start' and 'end' do not necessarily give the | |
| biological start and end, but rather the 'minimal' and 'maximal' | |
| coordinate boundaries. | |
| Note that adding locations provides a more intuitive method of | |
| construction: | |
| >>> f = SimpleLocation(3, 6, strand=+1) + SimpleLocation(10, 13, strand=-1) | |
| >>> len(f) | |
| 6 | |
| >>> list(f) | |
| [3, 4, 5, 12, 11, 10] | |
| """ | |
| self.operator = operator | |
| self.parts = list(parts) | |
| for loc in self.parts: | |
| if not isinstance(loc, SimpleLocation): | |
| raise ValueError( | |
| "CompoundLocation should be given a list of " | |
| "SimpleLocation objects, not %s" % loc.__class__ | |
| ) | |
| if len(parts) < 2: | |
| raise ValueError( | |
| f"CompoundLocation should have at least 2 parts, not {parts!r}" | |
| ) | |
| def __str__(self): | |
| """Return a representation of the CompoundLocation object (with python counting).""" | |
| return "%s{%s}" % (self.operator, ", ".join(str(loc) for loc in self.parts)) | |
| def __repr__(self): | |
| """Represent the CompoundLocation object as string for debugging.""" | |
| return f"{self.__class__.__name__}({self.parts!r}, {self.operator!r})" | |
| def _get_strand(self): | |
| """Get function for the strand property (PRIVATE).""" | |
| # Historically a join on the reverse strand has been represented | |
| # in Biopython with both the parent SeqFeature and its children | |
| # (the exons for a CDS) all given a strand of -1. Likewise, for | |
| # a join feature on the forward strand they all have strand +1. | |
| # However, we must also consider evil mixed strand examples like | |
| # this, join(complement(69611..69724),139856..140087,140625..140650) | |
| if len({loc.strand for loc in self.parts}) == 1: | |
| return self.parts[0].strand | |
| else: | |
| return None # i.e. mixed strands | |
| def _set_strand(self, value): | |
| """Set function for the strand property (PRIVATE).""" | |
| # Should this be allowed/encouraged? | |
| for loc in self.parts: | |
| loc.strand = value | |
| strand = property( | |
| fget=_get_strand, | |
| fset=_set_strand, | |
| doc="""Overall strand of the compound location. | |
| If all the parts have the same strand, that is returned. Otherwise | |
| for mixed strands, this returns None. | |
| >>> from Bio.SeqFeature import SimpleLocation, CompoundLocation | |
| >>> f1 = SimpleLocation(15, 17, strand=1) | |
| >>> f2 = SimpleLocation(20, 30, strand=-1) | |
| >>> f = f1 + f2 | |
| >>> f1.strand | |
| 1 | |
| >>> f2.strand | |
| -1 | |
| >>> f.strand | |
| >>> f.strand is None | |
| True | |
| If you set the strand of a CompoundLocation, this is applied to | |
| all the parts - use with caution: | |
| >>> f.strand = 1 | |
| >>> f1.strand | |
| 1 | |
| >>> f2.strand | |
| 1 | |
| >>> f.strand | |
| 1 | |
| """, | |
| ) | |
| def __add__(self, other): | |
| """Combine locations, or shift the location by an integer offset. | |
| >>> from Bio.SeqFeature import SimpleLocation | |
| >>> f1 = SimpleLocation(15, 17) + SimpleLocation(20, 30) | |
| >>> print(f1) | |
| join{[15:17], [20:30]} | |
| You can add another SimpleLocation: | |
| >>> print(f1 + SimpleLocation(40, 50)) | |
| join{[15:17], [20:30], [40:50]} | |
| >>> print(SimpleLocation(5, 10) + f1) | |
| join{[5:10], [15:17], [20:30]} | |
| You can also add another CompoundLocation: | |
| >>> f2 = SimpleLocation(40, 50) + SimpleLocation(60, 70) | |
| >>> print(f2) | |
| join{[40:50], [60:70]} | |
| >>> print(f1 + f2) | |
| join{[15:17], [20:30], [40:50], [60:70]} | |
| Also, as with the SimpleLocation, adding an integer shifts the | |
| location's coordinates by that offset: | |
| >>> print(f1 + 100) | |
| join{[115:117], [120:130]} | |
| >>> print(200 + f1) | |
| join{[215:217], [220:230]} | |
| >>> print(f1 + (-5)) | |
| join{[10:12], [15:25]} | |
| """ | |
| if isinstance(other, SimpleLocation): | |
| return CompoundLocation(self.parts + [other], self.operator) | |
| elif isinstance(other, CompoundLocation): | |
| if self.operator != other.operator: | |
| # Handle join+order -> order as a special case? | |
| raise ValueError( | |
| f"Mixed operators {self.operator} and {other.operator}" | |
| ) | |
| return CompoundLocation(self.parts + other.parts, self.operator) | |
| elif isinstance(other, int): | |
| return self._shift(other) | |
| else: | |
| raise NotImplementedError | |
| def __radd__(self, other): | |
| """Add a feature to the left.""" | |
| if isinstance(other, SimpleLocation): | |
| return CompoundLocation([other] + self.parts, self.operator) | |
| elif isinstance(other, int): | |
| return self._shift(other) | |
| else: | |
| raise NotImplementedError | |
| def __contains__(self, value): | |
| """Check if an integer position is within the CompoundLocation object.""" | |
| for loc in self.parts: | |
| if value in loc: | |
| return True | |
| return False | |
| def __nonzero__(self): | |
| """Return True regardless of the length of the feature. | |
| This behavior is for backwards compatibility, since until the | |
| __len__ method was added, a SimpleLocation always evaluated as True. | |
| Note that in comparison, Seq objects, strings, lists, etc, will all | |
| evaluate to False if they have length zero. | |
| WARNING: The SimpleLocation may in future evaluate to False when its | |
| length is zero (in order to better match normal python behavior)! | |
| """ | |
| return True | |
| def __len__(self): | |
| """Return the length of the CompoundLocation object.""" | |
| return sum(len(loc) for loc in self.parts) | |
| def __iter__(self): | |
| """Iterate over the parent positions within the CompoundLocation object.""" | |
| for loc in self.parts: | |
| yield from loc | |
| def __eq__(self, other): | |
| """Check if all parts of CompoundLocation are equal to all parts of other CompoundLocation.""" | |
| if not isinstance(other, CompoundLocation): | |
| return False | |
| if len(self.parts) != len(other.parts): | |
| return False | |
| if self.operator != other.operator: | |
| return False | |
| for self_part, other_part in zip(self.parts, other.parts): | |
| if self_part != other_part: | |
| return False | |
| return True | |
| def _shift(self, offset): | |
| """Return a copy of the CompoundLocation shifted by an offset (PRIVATE).""" | |
| return CompoundLocation( | |
| [loc._shift(offset) for loc in self.parts], self.operator | |
| ) | |
| def _flip(self, length): | |
| """Return a copy of the locations after the parent is reversed (PRIVATE). | |
| Note that the order of the parts is NOT reversed too. Consider a CDS | |
| on the forward strand with exons small, medium and large (in length). | |
| Once we change the frame of reference to the reverse complement strand, | |
| the start codon is still part of the small exon, and the stop codon | |
| still part of the large exon - so the part order remains the same! | |
| Here is an artificial example, were the features map to the two upper | |
| case regions and the lower case runs of n are not used: | |
| >>> from Bio.Seq import Seq | |
| >>> from Bio.SeqFeature import SimpleLocation | |
| >>> dna = Seq("nnnnnAGCATCCTGCTGTACnnnnnnnnGAGAMTGCCATGCCCCTGGAGTGAnnnnn") | |
| >>> small = SimpleLocation(5, 20, strand=1) | |
| >>> large = SimpleLocation(28, 52, strand=1) | |
| >>> location = small + large | |
| >>> print(small) | |
| [5:20](+) | |
| >>> print(large) | |
| [28:52](+) | |
| >>> print(location) | |
| join{[5:20](+), [28:52](+)} | |
| >>> for part in location.parts: | |
| ... print(len(part)) | |
| ... | |
| 15 | |
| 24 | |
| As you can see, this is a silly example where each "exon" is a word: | |
| >>> print(small.extract(dna).translate()) | |
| SILLY | |
| >>> print(large.extract(dna).translate()) | |
| EXAMPLE* | |
| >>> print(location.extract(dna).translate()) | |
| SILLYEXAMPLE* | |
| >>> for part in location.parts: | |
| ... print(part.extract(dna).translate()) | |
| ... | |
| SILLY | |
| EXAMPLE* | |
| Now, let's look at this from the reverse strand frame of reference: | |
| >>> flipped_dna = dna.reverse_complement() | |
| >>> flipped_location = location._flip(len(dna)) | |
| >>> print(flipped_location.extract(flipped_dna).translate()) | |
| SILLYEXAMPLE* | |
| >>> for part in flipped_location.parts: | |
| ... print(part.extract(flipped_dna).translate()) | |
| ... | |
| SILLY | |
| EXAMPLE* | |
| The key point here is the first part of the CompoundFeature is still the | |
| small exon, while the second part is still the large exon: | |
| >>> for part in flipped_location.parts: | |
| ... print(len(part)) | |
| ... | |
| 15 | |
| 24 | |
| >>> print(flipped_location) | |
| join{[37:52](-), [5:29](-)} | |
| Notice the parts are not reversed. However, there was a bug here in older | |
| versions of Biopython which would have given join{[5:29](-), [37:52](-)} | |
| and the translation would have wrongly been "EXAMPLE*SILLY" instead. | |
| """ | |
| return CompoundLocation( | |
| [loc._flip(length) for loc in self.parts], self.operator | |
| ) | |
| def start(self): | |
| """Start location - left most (minimum) value, regardless of strand. | |
| Read only, returns an integer like position object, possibly a fuzzy | |
| position. | |
| For the special case of a CompoundLocation wrapping the origin of a | |
| circular genome, this will return zero. | |
| """ | |
| return min(loc.start for loc in self.parts) | |
| def end(self): | |
| """End location - right most (maximum) value, regardless of strand. | |
| Read only, returns an integer like position object, possibly a fuzzy | |
| position. | |
| For the special case of a CompoundLocation wrapping the origin of | |
| a circular genome this will match the genome length (minus one | |
| given how Python counts from zero). | |
| """ | |
| return max(loc.end for loc in self.parts) | |
| def nofuzzy_start(self): | |
| """Start position (integer, approximated if fuzzy, read only) (DEPRECATED). | |
| This is an alias for int(feature.start), which should be used in | |
| preference -- unless you are trying to support old versions of | |
| Biopython. | |
| """ | |
| warnings.warn( | |
| "Use int(feature.start) rather than feature.nofuzzy_start", | |
| BiopythonDeprecationWarning, | |
| ) | |
| try: | |
| return int(self.start) | |
| except TypeError: | |
| if isinstance(self.start, UnknownPosition): | |
| return None | |
| raise | |
| def nofuzzy_end(self): | |
| """End position (integer, approximated if fuzzy, read only) (DEPRECATED). | |
| This is an alias for int(feature.end), which should be used in | |
| preference -- unless you are trying to support old versions of | |
| Biopython. | |
| """ | |
| warnings.warn( | |
| "Use int(feature.end) rather than feature.nofuzzy_end", | |
| BiopythonDeprecationWarning, | |
| ) | |
| try: | |
| return int(self.end) | |
| except TypeError: | |
| if isinstance(self.end, UnknownPosition): | |
| return None | |
| raise | |
def ref(self):
    """Return None; a CompoundLocation carries no reference of its own.

    Dummy method kept only for API compatibility.
    """
    # NOTE(review): presumably mirrors a ``ref`` attribute on the simple
    # location class - confirm whether this should be a property there.
    return None
def ref_db(self):
    """Return None; a CompoundLocation carries no reference database.

    Dummy method kept only for API compatibility.
    """
    # NOTE(review): presumably mirrors a ``ref_db`` attribute on the simple
    # location class - confirm whether this should be a property there.
    return None
def extract(self, parent_sequence, references=None):
    """Extract the sequence from supplied parent sequence using the CompoundLocation object.

    Each part of the location is extracted in turn (in the order the parts
    are stored) and the pieces are concatenated with ``+``.

    The parent_sequence can be a Seq like object or a string, and will
    generally return an object of the same type. The exception to this is
    a MutableSeq as the parent sequence will return a Seq object.
    If the location refers to other records, they must be supplied
    in the optional dictionary references.

    >>> from Bio.Seq import Seq
    >>> from Bio.SeqFeature import SimpleLocation, CompoundLocation
    >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
    >>> fl1 = SimpleLocation(2, 8)
    >>> fl2 = SimpleLocation(10, 15)
    >>> fl3 = CompoundLocation([fl1,fl2])
    >>> fl3.extract(seq)
    Seq('QHKAMILIVIC')

    """
    # Delegating to each part copes with mixed strand features and with
    # features that lie entirely on the reverse strand.
    pieces = []
    for part in self.parts:
        pieces.append(part.extract(parent_sequence, references=references))
    return functools.reduce(lambda joined, piece: joined + piece, pieces)
class Position(ABC):
    """Abstract base class representing a position."""

    def __repr__(self):
        """Represent the Position object as a string for debugging."""
        return f"{self.__class__.__name__}(...)"

    # ``position`` and ``extension`` are legacy read-only attributes, so they
    # are exposed as properties rather than as methods that must be called.
    @property
    def position(self):
        """Legacy attribute to get (left-most) position as an integer (DEPRECATED)."""
        warnings.warn(
            "Alias location.position is deprecated and will be removed in a future "
            "release. Use location directly, or int(location). However, that will "
            "fail for UnknownPosition, and for OneOfPosition and WithinPosition "
            "will give the default rather than left-most value.",
            BiopythonDeprecationWarning,
        )
        return int(self)

    @property
    def extension(self):
        """Legacy attribute to get the position's 'width' as an integer, typically zero (DEPRECATED)."""
        warnings.warn(
            "Alias location.extension is deprecated and will be removed in a "
            "future release. It was undefined or zero except for OneOfPosition, "
            "WithinPosition and BetweenPosition which must now be handled "
            "explicitly instead.",
            BiopythonDeprecationWarning,
        )
        return 0

    # No ``self``/``cls`` is used, so this is a static method; it can be
    # called on the class or on any instance.
    @staticmethod
    def fromstring(text, offset=0):
        """Build a Position object from the text string.

        Arguments:
         - text - The position as text, e.g. "5", "<5", ">5", "?", "?5",
           "(8.10)" or "one-of(5,8,11)".
         - offset - Added to every integer parsed from the text; must be 0
           or -1, otherwise a ValueError is raised.

        For an end position, leave offset as zero (default):

        >>> Position.fromstring("5")
        ExactPosition(5)

        For a start position, set offset to minus one (for Python counting):

        >>> Position.fromstring("5", -1)
        ExactPosition(4)

        This also covers fuzzy positions:

        >>> p = Position.fromstring("<5")
        >>> p
        BeforePosition(5)
        >>> print(p)
        <5
        >>> int(p)
        5

        >>> Position.fromstring(">5")
        AfterPosition(5)

        By default assumes an end position, so note the integer behavior:

        >>> p = Position.fromstring("one-of(5,8,11)")
        >>> p
        OneOfPosition(11, choices=[ExactPosition(5), ExactPosition(8), ExactPosition(11)])
        >>> print(p)
        one-of(5,8,11)
        >>> int(p)
        11

        >>> Position.fromstring("(8.10)")
        WithinPosition(10, left=8, right=10)

        Fuzzy start positions:

        >>> p = Position.fromstring("<5", -1)
        >>> p
        BeforePosition(4)
        >>> print(p)
        <4
        >>> int(p)
        4

        Notice how the integer behavior changes too!

        >>> p = Position.fromstring("one-of(5,8,11)", -1)
        >>> p
        OneOfPosition(4, choices=[ExactPosition(4), ExactPosition(7), ExactPosition(10)])
        >>> print(p)
        one-of(4,7,10)
        >>> int(p)
        4

        """
        if offset not in (0, -1):
            raise ValueError(
                "To convert one-based indices to zero-based indices, offset must be either 0 (for end positions) or -1 (for start positions)."
            )
        # A bare "?" is fully unknown; "?123" is a known but uncertain value,
        # so the exact match has to be tested before the prefix.
        if text == "?":
            return UnknownPosition()
        if text.startswith("?"):
            return UncertainPosition(int(text[1:]) + offset)
        if text.startswith("<"):
            return BeforePosition(int(text[1:]) + offset)
        if text.startswith(">"):
            return AfterPosition(int(text[1:]) + offset)
        m = _re_within_position.match(text)
        if m is not None:
            s, e = m.groups()
            s = int(s) + offset
            e = int(e) + offset
            # A start position (offset -1) behaves like the left bound, an
            # end position like the right bound.
            default = s if offset == -1 else e
            return WithinPosition(default, left=s, right=e)
        m = _re_oneof_position.match(text)
        if m is not None:
            positions = m.groups()[0]
            parts = [ExactPosition(int(pos) + offset) for pos in positions.split(",")]
            # Same rule as above: smallest choice for a start, largest for an end.
            pick = min if offset == -1 else max
            default = pick(int(pos) for pos in parts)
            return OneOfPosition(default, choices=parts)
        return ExactPosition(int(text) + offset)
class ExactPosition(int, Position):
    """Specify the specific position of a boundary.

    Arguments:
     - position - The position of the boundary.
     - extension - An optional argument which must be zero since we don't
       have an extension. The argument is provided so that the same number
       of arguments can be passed to all position types.

    In this case, there is no fuzziness associated with the position.

    >>> p = ExactPosition(5)
    >>> p
    ExactPosition(5)
    >>> print(p)
    5

    >>> isinstance(p, Position)
    True
    >>> isinstance(p, int)
    True

    Integer comparisons and operations should work as expected:

    >>> p == 5
    True
    >>> p < 6
    True
    >>> p <= 5
    True
    >>> p + 10
    ExactPosition(15)

    """

    def __new__(cls, position, extension=0):
        """Create an ExactPosition object."""
        if extension == 0:
            return int.__new__(cls, position)
        raise AttributeError(f"Non-zero extension {extension} for exact position.")

    # Must define this on Python 3.8 onwards because we redefine __repr__
    def __str__(self):
        """Return a representation of the ExactPosition object (with python counting)."""
        value = int(self)
        return str(value)

    def __repr__(self):
        """Represent the ExactPosition object as a string for debugging."""
        return f"{self.__class__.__name__}({int(self)})"

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        # Build the result through the instance's own class so that any
        # subclass is preserved.
        shifted = int(self) + offset
        return self.__class__(shifted)

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE)."""
        # Build the result through the instance's own class so that any
        # subclass is preserved.
        mirrored = length - int(self)
        return self.__class__(mirrored)
class UncertainPosition(ExactPosition):
    """Specify a specific position which is uncertain.

    This is used in UniProt, e.g. ?222 for uncertain position 222, or in the
    XML format explicitly marked as uncertain. Does not apply to GenBank/EMBL.

    The class adds nothing to ExactPosition; it exists so that the
    uncertainty is recorded in the type of the object.
    """
class UnknownPosition(Position):
    """Specify a specific position which is unknown (has no position).

    This is used in UniProt, e.g. ? or in the XML as unknown.
    """

    def __repr__(self):
        """Represent the UnknownPosition object as a string for debugging."""
        return f"{self.__class__.__name__}()"

    def __hash__(self):
        """Return the hash value of the UnknownPosition object."""
        # Every instance hashes like None; equality is not overridden here.
        return hash(None)

    # Legacy read-only attribute, hence a property rather than a method.
    @property
    def position(self):
        """Legacy attribute to get location (None) (DEPRECATED).

        In general you can use the location directly as with the exception of
        UnknownPosition it subclasses int, or use int(location), rather than
        this location.position legacy attribute.

        However, the UnknownPosition cannot be cast to an integer, and thus
        does not subclass int, and int(...) will fail. The legacy attribute
        would return None instead.

        Note that while None == None, UnknownPosition() != UnknownPosition()
        which is like the behaviour for NaN.
        """
        warnings.warn(
            "Alias location.position is deprecated and will be removed in a future release. "
            "In general use position directly, but note that for UnknownPosition "
            "int(location) will fail. Use try/except or isinstance(location, UnknownPosition).",
            BiopythonDeprecationWarning,
        )
        return None

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        # There is no coordinate to shift, so the same object is returned.
        return self

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE)."""
        # There is no coordinate to mirror, so the same object is returned.
        return self
class WithinPosition(int, Position):
    """Specify the position of a boundary within some coordinates.

    Arguments:
     - position - The default integer position
     - left - The start (left) position of the boundary
     - right - The end (right) position of the boundary

    This allows dealing with a location like ((11.14)..100). This
    indicates that the start of the sequence is somewhere between 11
    and 14. Since this is a start coordinate, it should act like
    it is at position 11 (or in Python counting, 10).

    >>> p = WithinPosition(10, 10, 13)
    >>> p
    WithinPosition(10, left=10, right=13)
    >>> print(p)
    (10.13)
    >>> int(p)
    10

    Basic integer comparisons and operations should work as though
    this were a plain integer:

    >>> p == 10
    True
    >>> p in [9, 10, 11]
    True
    >>> p < 11
    True
    >>> p + 10
    WithinPosition(20, left=20, right=23)

    >>> isinstance(p, WithinPosition)
    True
    >>> isinstance(p, Position)
    True
    >>> isinstance(p, int)
    True

    Note this also applies for comparison to other position objects,
    where again the integer behavior is used:

    >>> p == 10
    True
    >>> p == ExactPosition(10)
    True
    >>> p == BeforePosition(10)
    True
    >>> p == AfterPosition(10)
    True

    If this were an end point, you would want the position to be 13
    (the right/larger value, not the left/smaller value as above):

    >>> p2 = WithinPosition(13, 10, 13)
    >>> p2
    WithinPosition(13, left=10, right=13)
    >>> print(p2)
    (10.13)
    >>> int(p2)
    13
    >>> p2 == 13
    True
    >>> p2 == ExactPosition(13)
    True

    """

    def __new__(cls, position, left, right):
        """Create a WithinPosition object.

        Raises RuntimeError unless position equals left or right.
        """
        if not (position == left or position == right):
            raise RuntimeError(
                "WithinPosition: %r should match left %r or "
                "right %r" % (position, left, right)
            )
        obj = int.__new__(cls, position)
        obj._left = left
        obj._right = right
        return obj

    def __getnewargs__(self):
        """Return the arguments accepted by __new__.

        Necessary to allow pickling and unpickling of class instances.
        """
        return (int(self), self._left, self._right)

    def __repr__(self):
        """Represent the WithinPosition object as a string for debugging."""
        return "%s(%i, left=%i, right=%i)" % (
            self.__class__.__name__,
            int(self),
            self._left,
            self._right,
        )

    def __str__(self):
        """Return a representation of the WithinPosition object (with python counting)."""
        return f"({self._left}.{self._right})"

    # ``position`` and ``extension`` are legacy read-only attributes, so they
    # are exposed as properties rather than as methods that must be called.
    @property
    def position(self):
        """Legacy attribute to get (left) position as integer (DEPRECATED)."""
        warnings.warn(
            "Alias location.position is deprecated and will be removed in a future release. "
            "Use location directly, or int(location) which will return the preferred location "
            "defined for WithinPosition (which may not be the left-most position).",
            BiopythonDeprecationWarning,
        )
        return self._left

    @property
    def extension(self):
        """Legacy attribute to get the within-position's 'width' as an integer (DEPRECATED)."""
        warnings.warn(
            "Alias location.extension is deprecated and will be removed in a future release. "
            "This is usually zero, but there is no neat replacement for the WithinPosition object.",
            BiopythonDeprecationWarning,
        )
        return self._right - self._left

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted."""
        # The default value and both bounds move together.
        return self.__class__(
            int(self) + offset, self._left + offset, self._right + offset
        )

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE)."""
        # Mirroring swaps the roles of the bounds: the old right bound
        # becomes the new left bound and vice versa.
        return self.__class__(
            length - int(self), length - self._right, length - self._left
        )
class BetweenPosition(int, Position):
    """Specify the position of a boundary between two coordinates (OBSOLETE?).

    Arguments:
     - position - The default integer position
     - left - The start (left) position of the boundary
     - right - The end (right) position of the boundary

    This allows dealing with a position like 123^456. This
    indicates that the start of the sequence is somewhere between
    123 and 456. It is up to the parser to set the position argument
    to either boundary point (depending on if this is being used as
    a start or end of the feature). For example as a feature end:

    >>> p = BetweenPosition(456, 123, 456)
    >>> p
    BetweenPosition(456, left=123, right=456)
    >>> print(p)
    (123^456)
    >>> int(p)
    456

    Integer equality and comparison use the given position,

    >>> p == 456
    True
    >>> p in [455, 456, 457]
    True
    >>> p > 300
    True

    The old legacy properties of position and extension give the
    starting/lower/left position as an integer, and the distance
    to the ending/higher/right position as an integer. Note that
    the position object will act like either the left or the right
    end-point depending on how it was created:

    >>> p2 = BetweenPosition(123, left=123, right=456)
    >>> int(p) == int(p2)
    False
    >>> p == 456
    True
    >>> p2 == 123
    True

    Note this potentially surprising behavior:

    >>> BetweenPosition(123, left=123, right=456) == ExactPosition(123)
    True
    >>> BetweenPosition(123, left=123, right=456) == BeforePosition(123)
    True
    >>> BetweenPosition(123, left=123, right=456) == AfterPosition(123)
    True

    i.e. For equality (and sorting) the position objects behave like
    integers.

    """

    def __new__(cls, position, left, right):
        """Create a new instance in BetweenPosition object.

        Raises AssertionError unless position equals left or right.
        """
        # Raised explicitly (rather than via ``assert``) so the check is
        # still performed when Python runs with -O; the exception type is
        # the one the assert statement used to raise.
        if not (position == left or position == right):
            raise AssertionError(
                "BetweenPosition: %r should match left %r or "
                "right %r" % (position, left, right)
            )
        # TODO - public API for getting left/right, especially the unknown one
        obj = int.__new__(cls, position)
        obj._left = left
        obj._right = right
        return obj

    def __getnewargs__(self):
        """Return the arguments accepted by __new__.

        Necessary to allow pickling and unpickling of class instances.
        """
        return (int(self), self._left, self._right)

    def __repr__(self):
        """Represent the BetweenPosition object as a string for debugging."""
        return "%s(%i, left=%i, right=%i)" % (
            self.__class__.__name__,
            int(self),
            self._left,
            self._right,
        )

    def __str__(self):
        """Return a representation of the BetweenPosition object (with python counting)."""
        return f"({self._left}^{self._right})"

    # ``position`` and ``extension`` are legacy read-only attributes, so they
    # are exposed as properties rather than as methods that must be called.
    @property
    def position(self):
        """Legacy attribute to get (left) position as integer (DEPRECATED)."""
        warnings.warn(
            "Alias location.position is deprecated and will be removed in a future release. "
            "Use location directly, or int(location) which will return the preferred location "
            "defined for a BetweenPosition (which may not be the left-most position).",
            BiopythonDeprecationWarning,
        )
        return self._left

    @property
    def extension(self):
        """Legacy attribute to get the between-position's 'width' as an integer (DEPRECATED)."""
        warnings.warn(
            "Alias location.extension is deprecated and will be removed in a future release. "
            "This is usually zero, but there is no neat replacement for the BetweenPosition object.",
            BiopythonDeprecationWarning,
        )
        return self._right - self._left

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        # The default value and both bounds move together.
        return self.__class__(
            int(self) + offset, self._left + offset, self._right + offset
        )

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE)."""
        # Mirroring swaps the roles of the bounds: the old right bound
        # becomes the new left bound and vice versa.
        return self.__class__(
            length - int(self), length - self._right, length - self._left
        )
class BeforePosition(int, Position):
    """Specify a position where the actual location occurs before it.

    Arguments:
     - position - The upper boundary of where the location can occur.
     - extension - An optional argument which must be zero since we don't
       have an extension. The argument is provided so that the same number
       of arguments can be passed to all position types.

    This is used to specify positions like (<10..100) where the location
    occurs somewhere before position 10.

    >>> p = BeforePosition(5)
    >>> p
    BeforePosition(5)
    >>> print(p)
    <5
    >>> int(p)
    5
    >>> p + 10
    BeforePosition(15)

    Note this potentially surprising behavior:

    >>> p == ExactPosition(5)
    True
    >>> p == AfterPosition(5)
    True

    Just remember that for equality and sorting the position objects act
    like integers.

    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, extension=0):
        """Create a new instance in BeforePosition object.

        Raises AttributeError if extension is not zero.
        """
        if extension != 0:
            raise AttributeError(f"Non-zero extension {extension} for before position.")
        return int.__new__(cls, position)

    def __repr__(self):
        """Represent the location as a string for debugging."""
        return "%s(%i)" % (self.__class__.__name__, int(self))

    def __str__(self):
        """Return a representation of the BeforePosition object (with python counting)."""
        return f"<{int(self)}"

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        return self.__class__(int(self) + offset)

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE)."""
        # "Before" on the original becomes "after" once the parent is reversed.
        return AfterPosition(length - int(self))
class AfterPosition(int, Position):
    """Specify a position where the actual location is found after it.

    Arguments:
     - position - The lower boundary of where the location can occur.
     - extension - An optional argument which must be zero since we don't
       have an extension. The argument is provided so that the same number
       of arguments can be passed to all position types.

    This is used to specify positions like (>10..100) where the location
    occurs somewhere after position 10.

    >>> p = AfterPosition(7)
    >>> p
    AfterPosition(7)
    >>> print(p)
    >7
    >>> int(p)
    7
    >>> p + 10
    AfterPosition(17)

    >>> isinstance(p, AfterPosition)
    True
    >>> isinstance(p, Position)
    True
    >>> isinstance(p, int)
    True

    Note this potentially surprising behavior:

    >>> p == ExactPosition(7)
    True
    >>> p == BeforePosition(7)
    True

    Just remember that for equality and sorting the position objects act
    like integers.

    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, extension=0):
        """Create a new instance of the AfterPosition object.

        Raises AttributeError if extension is not zero.
        """
        if extension != 0:
            raise AttributeError(f"Non-zero extension {extension} for after position.")
        return int.__new__(cls, position)

    def __repr__(self):
        """Represent the location as a string for debugging."""
        return "%s(%i)" % (self.__class__.__name__, int(self))

    def __str__(self):
        """Return a representation of the AfterPosition object (with python counting)."""
        return f">{int(self)}"

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        return self.__class__(int(self) + offset)

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE)."""
        # "After" on the original becomes "before" once the parent is reversed.
        return BeforePosition(length - int(self))
class OneOfPosition(int, Position):
    """Specify a position where the location can be multiple positions.

    This models the GenBank 'one-of(1888,1901)' function, and tries
    to make this fit within the Biopython Position models. If this was
    a start position it should act like 1888, but as an end position 1901.

    >>> p = OneOfPosition(1888, [ExactPosition(1888), ExactPosition(1901)])
    >>> p
    OneOfPosition(1888, choices=[ExactPosition(1888), ExactPosition(1901)])
    >>> int(p)
    1888

    Integer comparisons and operators act like using int(p),

    >>> p == 1888
    True
    >>> p <= 1888
    True
    >>> p > 1888
    False
    >>> p + 100
    OneOfPosition(1988, choices=[ExactPosition(1988), ExactPosition(2001)])

    >>> isinstance(p, OneOfPosition)
    True
    >>> isinstance(p, Position)
    True
    >>> isinstance(p, int)
    True

    """

    def __new__(cls, position, choices):
        """Initialize with a set of possible positions.

        choices is a list of Position derived objects, specifying possible
        locations.

        position is an integer specifying the default behavior.

        Raises ValueError if position is not one of the choices.
        """
        if position not in choices:
            raise ValueError(
                f"OneOfPosition: {position!r} should match one of {choices!r}"
            )
        obj = int.__new__(cls, position)
        obj.position_choices = choices
        return obj

    def __getnewargs__(self):
        """Return the arguments accepted by __new__.

        Necessary to allow pickling and unpickling of class instances.
        """
        return (int(self), self.position_choices)

    # ``position`` and ``extension`` are legacy read-only attributes, so they
    # are exposed as properties rather than as methods that must be called.
    @property
    def position(self):
        """Legacy attribute to get (left) position as integer (DEPRECATED)."""
        warnings.warn(
            "Alias location.position is deprecated and will be removed in a future release. "
            "Use location directly, or int(location) which will return the preferred location "
            "defined for a OneOfPosition (which may not be the left-most position), or "
            "min(location.position_choices) instead.",
            BiopythonDeprecationWarning,
        )
        return min(int(pos) for pos in self.position_choices)

    @property
    def extension(self):
        """Legacy attribute to get the one-of-position's 'width' as an integer (DEPRECATED)."""
        warnings.warn(
            "Alias location.extension is deprecated and will be removed in a future release. "
            "This is usually zero, but for a OneOfPosition you can use "
            "max(position.position_choices) - min(position.position_choices)",
            BiopythonDeprecationWarning,
        )
        positions = [int(pos) for pos in self.position_choices]
        return max(positions) - min(positions)

    def __repr__(self):
        """Represent the OneOfPosition object as a string for debugging."""
        return "%s(%i, choices=%r)" % (
            self.__class__.__name__,
            int(self),
            self.position_choices,
        )

    def __str__(self):
        """Return a representation of the OneOfPosition object (with python counting)."""
        choices = ",".join(str(position) for position in self.position_choices)
        return f"one-of({choices})"

    def __add__(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        # Shift the default value and every individual choice by the same amount.
        return self.__class__(
            int(self) + offset, [p + offset for p in self.position_choices]
        )

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE)."""
        # The choices are mirrored individually and listed in reverse order.
        return self.__class__(
            length - int(self), [p._flip(length) for p in self.position_choices[::-1]]
        )
class PositionGap:
    """Simple class to hold information about a gap between positions (DEPRECATED)."""

    def __init__(self, gap_size):
        """Store the gap size; creating an instance issues a deprecation warning."""
        self.gap_size = gap_size
        message = (
            "The PositionGap class is deprecated and will be removed in a future release. "
            "It has not been used in Biopython for over ten years."
        )
        warnings.warn(message, BiopythonDeprecationWarning)

    def __repr__(self):
        """Represent the position gap as a string for debugging."""
        class_name = self.__class__.__name__
        return "%s(%r)" % (class_name, self.gap_size)

    def __str__(self):
        """Return a representation of the PositionGap object (with python counting)."""
        return "gap({})".format(self.gap_size)
if __name__ == "__main__":
    # Only when executed as a script: hand over to the package's doctest
    # helper (presumably it runs the doctests embedded in the docstrings).
    from Bio import _utils

    _utils.run_doctest()