Spaces:
No application file
No application file
| # Copyright 2000-2002 Andrew Dalke. All rights reserved. | |
| # Copyright 2002-2004 Brad Chapman. All rights reserved. | |
| # Copyright 2006-2020 by Peter Cock. All rights reserved. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Represent a Sequence Record, a sequence with annotation.""" | |
| # NEEDS TO BE IN SYNC WITH THE REST OF BIOPYTHON AND BIOPERL | |
| # In particular, the SeqRecord and BioSQL.BioSeq.DBSeqRecord classes | |
| # need to be in sync (this is the BioSQL "Database SeqRecord"). | |
| from io import StringIO | |
| import numbers | |
| from Bio import StreamModeError | |
| from Bio.Seq import UndefinedSequenceError | |
# Message raised (as NotImplementedError) by every SeqRecord rich-comparison
# method, which are deliberately left unimplemented.
_NO_SEQRECORD_COMPARISON = (
    "SeqRecord comparison is deliberately not implemented. "
    "Explicitly compare the attributes of interest."
)
class _RestrictedDict(dict):
    """Dict which only allows sequences of given length as values (PRIVATE).

    This simple subclass of the Python dictionary is used in the SeqRecord
    object for holding per-letter-annotations.  This class is intended to
    prevent simple errors by only allowing python sequences (e.g. lists,
    strings and tuples) to be stored, and only if their length matches that
    expected (the length of the SeqRecord's seq object).  It cannot however
    prevent the entries being edited in situ (for example appending entries
    to a list).

    >>> x = _RestrictedDict(5)
    >>> x["test"] = "hello"
    >>> x
    {'test': 'hello'}

    Adding entries which don't have the expected length are blocked:

    >>> x["test"] = "hello world"
    Traceback (most recent call last):
    ...
    TypeError: We only allow python sequences (lists, tuples or strings) of length 5.

    The expected length is stored as a private attribute,

    >>> x._length
    5

    In order that the SeqRecord (and other objects using this class) can be
    pickled, for example for use in the multiprocessing library, we need to
    be able to pickle the restricted dictionary objects.

    Using the default pickle protocol,

    >>> import pickle
    >>> y = pickle.loads(pickle.dumps(x))
    >>> y
    {'test': 'hello'}
    >>> y._length
    5

    Using the highest available pickle protocol,

    >>> import pickle
    >>> z = pickle.loads(pickle.dumps(x, pickle.HIGHEST_PROTOCOL))
    >>> z
    {'test': 'hello'}
    >>> z._length
    5
    """

    def __init__(self, length):
        """Create an EMPTY restricted dictionary.

        Arguments:
         - length - Required length of every value; converted with int()
           and stored as the private attribute ``_length``.
        """
        dict.__init__(self)
        self._length = int(length)

    def __setitem__(self, key, value):
        """Store value under key if it is a sequence of the expected length.

        Raises TypeError if value lacks ``__len__`` or ``__getitem__``, or if
        its length differs from the expected length.
        """
        # The _length attribute can be missing here: the original author
        # added this guard to cope with pickle protocol 2, where entries are
        # restored before the instance attributes.  In that situation the
        # length cannot be checked (nor quoted in the error message).
        expected = getattr(self, "_length", None)
        if (
            not hasattr(value, "__len__")
            or not hasattr(value, "__getitem__")
            or (expected is not None and len(value) != expected)
        ):
            message = "We only allow python sequences (lists, tuples or strings)"
            if expected is not None:
                message += f" of length {expected}"
            raise TypeError(message + ".")
        dict.__setitem__(self, key, value)

    def update(self, new_dict=(), **kwargs):
        """Add entries from new_dict and kwargs, validating each value.

        new_dict may be anything with an ``items()`` method (e.g. a dict) or
        an iterable of (key, value) pairs; keyword arguments are added
        afterwards.  Every entry goes through the strict ``__setitem__``, so
        a TypeError is raised on the first unacceptable value (entries added
        before that point are kept).
        """
        # Force this to go via our strict __setitem__ method
        pairs = new_dict.items() if hasattr(new_dict, "items") else new_dict
        for key, value in pairs:
            self[key] = value
        for key, value in kwargs.items():
            self[key] = value
| class SeqRecord: | |
| """A SeqRecord object holds a sequence and information about it. | |
| Main attributes: | |
| - id - Identifier such as a locus tag (string) | |
| - seq - The sequence itself (Seq object or similar) | |
| Additional attributes: | |
| - name - Sequence name, e.g. gene name (string) | |
| - description - Additional text (string) | |
| - dbxrefs - List of database cross references (list of strings) | |
| - features - Any (sub)features defined (list of SeqFeature objects) | |
| - annotations - Further information about the whole sequence (dictionary). | |
| Most entries are strings, or lists of strings. | |
| - letter_annotations - Per letter/symbol annotation (restricted | |
| dictionary). This holds Python sequences (lists, strings | |
| or tuples) whose length matches that of the sequence. | |
| A typical use would be to hold a list of integers | |
| representing sequencing quality scores, or a string | |
| representing the secondary structure. | |
| You will typically use Bio.SeqIO to read in sequences from files as | |
| SeqRecord objects. However, you may want to create your own SeqRecord | |
| objects directly (see the __init__ method for further details): | |
| >>> from Bio.Seq import Seq | |
| >>> from Bio.SeqRecord import SeqRecord | |
| >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF"), | |
| ... id="YP_025292.1", name="HokC", | |
| ... description="toxic membrane protein") | |
| >>> print(record) | |
| ID: YP_025292.1 | |
| Name: HokC | |
| Description: toxic membrane protein | |
| Number of features: 0 | |
| Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF') | |
| If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO | |
| for this. For the special case where you want the SeqRecord turned into | |
| a string in a particular file format there is a format method which uses | |
| Bio.SeqIO internally: | |
| >>> print(record.format("fasta")) | |
| >YP_025292.1 toxic membrane protein | |
| MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF | |
| <BLANKLINE> | |
| You can also do things like slicing a SeqRecord, checking its length, etc | |
| >>> len(record) | |
| 44 | |
| >>> edited = record[:10] + record[11:] | |
| >>> print(edited.seq) | |
| MKQHKAMIVAIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF | |
| >>> print(record.seq) | |
| MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF | |
| """ | |
def __init__(
    self,
    seq,
    id="<unknown id>",
    name="<unknown name>",
    description="<unknown description>",
    dbxrefs=None,
    features=None,
    annotations=None,
    letter_annotations=None,
):
    """Create a SeqRecord.

    Arguments:
     - seq         - Sequence, required (Seq or MutableSeq)
     - id          - Sequence identifier, recommended (string)
     - name        - Sequence name, optional (string)
     - description - Sequence description, optional (string)
     - dbxrefs     - Database cross references, optional (list of strings)
     - features    - Any (sub)features, optional (list of SeqFeature objects)
     - annotations - Dictionary of annotations for the whole sequence
     - letter_annotations - Dictionary of per-letter-annotations, values
       should be strings, list or tuples of the same length as the full
       sequence.

    Raises TypeError if an argument has the wrong type.

    You will typically use Bio.SeqIO to read in sequences from files as
    SeqRecord objects.  However, you may want to create your own SeqRecord
    objects directly.

    Note that while an id is optional, we strongly recommend you supply a
    unique id string for each record.  This is especially important
    if you wish to write your sequences to a file.

    You can create a 'blank' SeqRecord object, and then populate the
    attributes later.
    """
    # Lots of existing code uses id=None... this may be a bad idea.
    if not (id is None or isinstance(id, str)):
        raise TypeError("id argument should be a string")
    for label, text in (("name", name), ("description", description)):
        if not isinstance(text, str):
            raise TypeError(f"{label} argument should be a string")

    self._seq = seq
    self.id = id
    self.name = name
    self.description = description

    # database cross references (for the whole sequence)
    if dbxrefs is not None and not isinstance(dbxrefs, list):
        raise TypeError("dbxrefs argument should be a list (of strings)")
    self.dbxrefs = [] if dbxrefs is None else dbxrefs

    # annotations about the whole sequence
    if annotations is not None and not isinstance(annotations, dict):
        raise TypeError("annotations argument must be a dict or None")
    self.annotations = {} if annotations is None else annotations

    # annotations about each letter in the sequence
    if letter_annotations is not None:
        # Handled by the property setter, which converts the argument into
        # a _RestrictedDict and so checks every value has the right length.
        self.letter_annotations = letter_annotations
    elif seq is None:
        # Should we allow this and use a normal unrestricted dict?
        self._per_letter_annotations = _RestrictedDict(length=0)
    else:
        try:
            self._per_letter_annotations = _RestrictedDict(length=len(seq))
        except TypeError:
            raise TypeError(
                "seq argument should be a Seq object or similar"
            ) from None

    # annotations about parts of the sequence
    if features is not None and not isinstance(features, list):
        raise TypeError(
            "features argument should be a list (of SeqFeature objects)"
        )
    self.features = [] if features is None else features
# TODO - Just make this a read only property?
def _set_per_letter_annotations(self, value):
    """Replace the per-letter-annotations with a checked copy of value.

    value must be a dict; its entries are copied into a new restricted
    dictionary whose expected length is the length of the record's sequence
    (or zero if the sequence is None or not available).

    Raises TypeError if value is not a dict, or if any of its values is not
    a python sequence of the expected length.
    """
    if not isinstance(value, dict):
        raise TypeError(
            "The per-letter-annotations should be a (restricted) dictionary."
        )
    # Turn this into a restricted-dictionary (and check the entries).
    # A sequence of None is treated as length zero explicitly, because
    # len(None) raises TypeError rather than AttributeError.
    try:
        sequence = self.seq
        length = 0 if sequence is None else len(sequence)
    except AttributeError:
        length = 0
    self._per_letter_annotations = _RestrictedDict(length=length)
    self._per_letter_annotations.update(value)

letter_annotations = property(
    fget=lambda self: self._per_letter_annotations,
    fset=_set_per_letter_annotations,
    doc="""Dictionary of per-letter-annotation for the sequence.

    For example, this can hold quality scores used in FASTQ or QUAL files.
    Consider this example using Bio.SeqIO to read in an example Solexa
    variant FASTQ file as a SeqRecord:

    >>> from Bio import SeqIO
    >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa")
    >>> print("%s %s" % (record.id, record.seq))
    slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
    >>> print(list(record.letter_annotations))
    ['solexa_quality']
    >>> print(record.letter_annotations["solexa_quality"])
    [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5]

    The letter_annotations get sliced automatically if you slice the
    parent SeqRecord, for example taking the last ten bases:

    >>> sub_record = record[-10:]
    >>> print("%s %s" % (sub_record.id, sub_record.seq))
    slxa_0001_1_0001_01 ACGTNNNNNN
    >>> print(sub_record.letter_annotations["solexa_quality"])
    [4, 3, 2, 1, 0, -1, -2, -3, -4, -5]

    Any python sequence (i.e. list, tuple or string) can be recorded in
    the SeqRecord's letter_annotations dictionary as long as the length
    matches that of the SeqRecord's sequence.  e.g.

    >>> len(sub_record.letter_annotations)
    1
    >>> sub_record.letter_annotations["dummy"] = "abcdefghij"
    >>> len(sub_record.letter_annotations)
    2

    You can delete entries from the letter_annotations dictionary as usual:

    >>> del sub_record.letter_annotations["solexa_quality"]
    >>> sub_record.letter_annotations
    {'dummy': 'abcdefghij'}

    You can completely clear the dictionary easily as follows:

    >>> sub_record.letter_annotations = {}
    >>> sub_record.letter_annotations
    {}

    Note that if replacing the record's sequence with a sequence of a
    different length you must first clear the letter_annotations dict.
    """,
)
def _set_seq(self, value):
    """Replace the record's sequence with value.

    If the record currently holds any per-letter-annotations, the new
    sequence must have the same length as the record, otherwise ValueError
    is raised; the existing annotations are then left unchanged.  If there
    are no per-letter-annotations, the (empty) restricted dictionary is
    recreated for the new sequence length, using zero when the new sequence
    is None or not available.
    """
    # TODO - Add a deprecation warning that the seq should be write only?
    if self._per_letter_annotations:
        if len(self) != len(value):
            # TODO - Make this a warning? Silently empty the dictionary?
            raise ValueError("You must empty the letter annotations first!")
        # Leave the existing per letter annotations unchanged:
        self._seq = value
        return
    self._seq = value
    # Reset the (empty) letter annotations dict with new length.
    # A sequence of None is treated as length zero explicitly, because
    # len(None) raises TypeError rather than AttributeError.
    try:
        sequence = self.seq
        length = 0 if sequence is None else len(sequence)
    except AttributeError:
        length = 0
    self._per_letter_annotations = _RestrictedDict(length=length)

seq = property(
    fget=lambda self: self._seq,
    fset=_set_seq,
    doc="The sequence itself, as a Seq or MutableSeq object.",
)
def __getitem__(self, index):
    """Return a sub-sequence or an individual letter.

    Slicing, e.g. my_record[5:10], returns a new SeqRecord for
    that sub-sequence with some annotation preserved as follows:

    * The name, id and description are kept as-is.
    * Any per-letter-annotations are sliced to match the requested
      sub-sequence.
    * Unless a stride is used, all those features which fall fully
      within the subsequence are included (with their locations
      adjusted accordingly). If you want to preserve any truncated
      features (e.g. GenBank/EMBL source features), you must
      explicitly add them to the new SeqRecord yourself.
    * With the exception of any molecule type, the annotations
      dictionary and the dbxrefs list are not used for the new
      SeqRecord, as in general they may not apply to the
      subsequence. If you want to preserve them, you must explicitly
      copy them to the new SeqRecord yourself.

    Using an integer index, e.g. my_record[5] is shorthand for
    extracting that letter from the sequence, my_record.seq[5].

    Any other kind of index raises ValueError, as does slicing a record
    whose sequence is None.

    For example, consider this short protein and its secondary
    structure as encoded by the PDB (e.g. H for alpha helices),
    plus a simple feature for its histidine self phosphorylation
    site:

    >>> from Bio.Seq import Seq
    >>> from Bio.SeqRecord import SeqRecord
    >>> from Bio.SeqFeature import SeqFeature, SimpleLocation
    >>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT"
    ...                     "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR"),
    ...                 id="1JOY", name="EnvZ",
    ...                 description="Homodimeric domain of EnvZ from E. coli")
    >>> rec.letter_annotations["secondary_structure"] = "  S  SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT  "
    >>> rec.features.append(SeqFeature(SimpleLocation(20, 21),
    ...                     type = "Site"))

    Now let's have a quick look at the full record,

    >>> print(rec)
    ID: 1JOY
    Name: EnvZ
    Description: Homodimeric domain of EnvZ from E. coli
    Number of features: 1
    Per letter annotation for: secondary_structure
    Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR')
    >>> rec.letter_annotations["secondary_structure"]
    '  S  SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT  '
    >>> print(rec.features[0].location)
    [20:21]

    Now let's take a sub sequence, here chosen as the first (fractured)
    alpha helix which includes the histidine phosphorylation site:

    >>> sub = rec[11:41]
    >>> print(sub)
    ID: 1JOY
    Name: EnvZ
    Description: Homodimeric domain of EnvZ from E. coli
    Number of features: 1
    Per letter annotation for: secondary_structure
    Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD')
    >>> sub.letter_annotations["secondary_structure"]
    'HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH'
    >>> print(sub.features[0].location)
    [9:10]

    You can also of course omit the start or end values, for
    example to get the first ten letters only:

    >>> print(rec[:10])
    ID: 1JOY
    Name: EnvZ
    Description: Homodimeric domain of EnvZ from E. coli
    Number of features: 0
    Per letter annotation for: secondary_structure
    Seq('MAAGVKQLAD')

    Or for the last ten letters:

    >>> print(rec[-10:])
    ID: 1JOY
    Name: EnvZ
    Description: Homodimeric domain of EnvZ from E. coli
    Number of features: 0
    Per letter annotation for: secondary_structure
    Seq('IIEQFIDYLR')

    If you omit both, then you get a copy of the original record (although
    lacking the annotations and dbxrefs):

    >>> print(rec[:])
    ID: 1JOY
    Name: EnvZ
    Description: Homodimeric domain of EnvZ from E. coli
    Number of features: 1
    Per letter annotation for: secondary_structure
    Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR')

    Finally, indexing with a simple integer is shorthand for pulling out
    that letter from the sequence directly:

    >>> rec[5]
    'K'
    >>> rec.seq[5]
    'K'
    """
    if isinstance(index, numbers.Integral):
        # NOTE - The sequence level annotation like the id, name, etc
        # do not really apply to a single character.  However, should
        # we try and expose any per-letter-annotation here?  If so how?
        return self.seq[index]
    if not isinstance(index, slice):
        raise ValueError("Invalid index")
    if self.seq is None:
        raise ValueError("If the sequence is None, we cannot slice it.")

    parent_length = len(self)

    # A BioSQL DBSeqRecord is sliced into a plain SeqRecord; any other
    # (sub)class is sliced into an instance of its own class.
    try:
        from BioSQL.BioSeq import DBSeqRecord
    except ImportError:
        record_class = self.__class__
    else:
        if isinstance(self, DBSeqRecord):
            record_class = SeqRecord
        else:
            record_class = self.__class__
    answer = record_class(
        self.seq[index],
        id=self.id,
        name=self.name,
        description=self.description,
    )
    # TODO - The description may no longer apply.
    # It would be safer to change it to something
    # generic like "edited" or the default value.

    # Don't copy the annotation dict and dbxefs list,
    # they may not apply to a subsequence.
    # answer.annotations = dict(self.annotations.items())
    # answer.dbxrefs = self.dbxrefs[:]
    # TODO - Review this in light of adding SeqRecord objects?

    if "molecule_type" in self.annotations:
        # This will still apply, and we need it for GenBank/EMBL etc output
        answer.annotations["molecule_type"] = self.annotations["molecule_type"]

    # TODO - Cope with strides by generating ambiguous locations?
    start, stop, step = index.indices(parent_length)
    if step == 1:
        import warnings

        # Select relevant features, add them with shifted locations
        # assert str(self.seq)[index] == str(self.seq)[start:stop]
        for feature in self.features:
            if feature.ref or feature.ref_db:
                # TODO - Implement this (with lots of tests)?
                warnings.warn(
                    "When slicing SeqRecord objects, any "
                    "SeqFeature referencing other sequences (e.g. "
                    "from segmented GenBank records) are ignored."
                )
                continue
            try:
                location = feature.location
                if start <= location.start and location.end <= stop:
                    answer.features.append(feature._shift(-start))
            except TypeError:
                # Will fail on UnknownPosition
                pass

    # Slice all the values to match the sliced sequence
    # (this should also work with strides, even negative strides):
    for key, value in self.letter_annotations.items():
        answer._per_letter_annotations[key] = value[index]

    return answer
def __iter__(self):
    """Iterate over the letters in the sequence.

    For example, using Bio.SeqIO to read in a protein FASTA file:

    >>> from Bio import SeqIO
    >>> record = SeqIO.read("Fasta/loveliesbleeding.pro", "fasta")
    >>> for amino in record:
    ...     print(amino)
    ...     if amino == "L": break
    X
    A
    G
    L
    >>> print(record.seq[3])
    L

    This is just a shortcut for iterating over the sequence directly:

    >>> for amino in record.seq:
    ...     print(amino)
    ...     if amino == "L": break
    X
    A
    G
    L
    >>> print(record.seq[3])
    L

    Note that this does not facilitate iteration together with any
    per-letter-annotation.  However, you can achieve that using the
    python zip function on the record (or its sequence) and the relevant
    per-letter-annotation:

    >>> from Bio import SeqIO
    >>> rec = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa")
    >>> print("%s %s" % (rec.id, rec.seq))
    slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
    >>> print(list(rec.letter_annotations))
    ['solexa_quality']
    >>> for nuc, qual in zip(rec, rec.letter_annotations["solexa_quality"]):
    ...     if qual > 35:
    ...         print("%s %i" % (nuc, qual))
    A 40
    C 39
    G 38
    T 37
    A 36

    You may agree that using zip(rec.seq, ...) is more explicit than using
    zip(rec, ...) as shown above.
    """
    # Delegate entirely to the underlying sequence object.
    sequence = self.seq
    return iter(sequence)
def __contains__(self, char):
    """Implement the 'in' keyword, searches the sequence.

    e.g.

    >>> from Bio import SeqIO
    >>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta")
    >>> "GAATTC" in record
    False
    >>> "AAA" in record
    True

    This essentially acts as a proxy for using "in" on the sequence:

    >>> "GAATTC" in record.seq
    False
    >>> "AAA" in record.seq
    True

    Note that you can also use Seq objects as the query,

    >>> from Bio.Seq import Seq
    >>> Seq("AAA") in record
    True

    See also the Seq object's __contains__ method.
    """
    # The membership test is answered by the sequence object itself.
    sequence = self.seq
    return char in sequence
def __bytes__(self):
    """Return the result of calling bytes() on the record's sequence."""
    sequence = self.seq
    return bytes(sequence)
def __str__(self):
    """Return a human readable summary of the record and its annotation (string).

    The python built in function str works by calling the object's __str__
    method.  e.g.

    >>> from Bio.Seq import Seq
    >>> from Bio.SeqRecord import SeqRecord
    >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF"),
    ...                    id="YP_025292.1", name="HokC",
    ...                    description="toxic membrane protein, small")
    >>> print(str(record))
    ID: YP_025292.1
    Name: HokC
    Description: toxic membrane protein, small
    Number of features: 0
    Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF')

    In this example you don't actually need to call str explicitly, as the
    print command does this automatically:

    >>> print(record)
    ID: YP_025292.1
    Name: HokC
    Description: toxic membrane protein, small
    Number of features: 0
    Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF')

    Note that long sequences are shown truncated.

    The id, name, description and dbxrefs lines are omitted when empty.
    If the sequence cannot be converted to bytes at all (for example a
    'blank' record whose sequence is None), its repr is shown.
    """
    lines = []
    if self.id:
        lines.append(f"ID: {self.id}")
    if self.name:
        lines.append(f"Name: {self.name}")
    if self.description:
        lines.append(f"Description: {self.description}")
    if self.dbxrefs:
        lines.append("Database cross-references: " + ", ".join(self.dbxrefs))
    lines.append(f"Number of features: {len(self.features)}")
    for key in self.annotations:
        lines.append(f"/{key}={str(self.annotations[key])}")
    if self.letter_annotations:
        lines.append(
            "Per letter annotation for: " + ", ".join(self.letter_annotations)
        )
    # bytes() is only called to find out whether the sequence contents are
    # defined; the result itself is discarded.
    try:
        bytes(self.seq)
    except TypeError:
        # e.g. seq is None, which __init__ explicitly allows; bytes(None)
        # raises TypeError, so fall back to the plain repr.
        lines.append(repr(self.seq))
    except UndefinedSequenceError:
        lines.append(f"Undefined sequence of length {len(self.seq)}")
    else:
        # Don't want to include the entire sequence
        lines.append(repr(self.seq))
    return "\n".join(lines)
def __repr__(self):
    """Return a concise summary of the record for debugging (string).

    The python built in function repr works by calling the object's __repr__
    method.  e.g.

    >>> from Bio.Seq import Seq
    >>> from Bio.SeqRecord import SeqRecord
    >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT"
    ...                     "GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ"
    ...                     "SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ"
    ...                     "PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF"),
    ...                 id="NP_418483.1", name="b4059",
    ...                 description="ssDNA-binding protein",
    ...                 dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"])
    >>> print(repr(rec))
    SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF'), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])

    At the python prompt you can also use this shorthand:

    >>> rec
    SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF'), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])

    Note that long sequences are shown truncated. Also note that any
    annotations, letter_annotations and features are not shown (as they
    would lead to a very long string).
    """
    class_name = self.__class__.__name__
    shown = (
        ("seq", self.seq),
        ("id", self.id),
        ("name", self.name),
        ("description", self.description),
        ("dbxrefs", self.dbxrefs),
    )
    arguments = ", ".join(f"{label}={value!r}" for label, value in shown)
    return f"{class_name}({arguments})"
def format(self, format):
    r"""Return the record as a string in the specified file format.

    The format should be a lower case string supported as an output
    format by Bio.SeqIO, which is used to turn the SeqRecord into a
    string.  e.g.

    >>> from Bio.Seq import Seq
    >>> from Bio.SeqRecord import SeqRecord
    >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF"),
    ...                    id="YP_025292.1", name="HokC",
    ...                    description="toxic membrane protein")
    >>> record.format("fasta")
    '>YP_025292.1 toxic membrane protein\nMKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\n'
    >>> print(record.format("fasta"))
    >YP_025292.1 toxic membrane protein
    MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
    <BLANKLINE>

    The Python print function automatically appends a new line, meaning
    in this example a blank line is shown.  If you look at the string
    representation you can see there is a trailing new line (shown as
    slash n) which is important when writing to a file or if
    concatenating multiple sequence strings together.

    Note that this method will NOT work on every possible file format
    supported by Bio.SeqIO (e.g. some are for multiple sequences only,
    and binary formats are not supported).
    """
    # All of the work happens in __format__; this is the named-method
    # spelling of the same operation.
    file_format = format
    return self.__format__(file_format)
def __format__(self, format_spec):
    r"""Return the record as a string in the specified file format.

    This method supports the Python format() function and f-strings.
    The format_spec should be a lower case string supported by
    Bio.SeqIO as a text output file format. Requesting a binary file
    format raises a ValueError. e.g.

    >>> from Bio.Seq import Seq
    >>> from Bio.SeqRecord import SeqRecord
    >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF"),
    ...                    id="YP_025292.1", name="HokC",
    ...                    description="toxic membrane protein")
    ...
    >>> format(record, "fasta")
    '>YP_025292.1 toxic membrane protein\nMKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\n'
    >>> print(f"Here is {record.id} in FASTA format:\n{record:fasta}")
    Here is YP_025292.1 in FASTA format:
    >YP_025292.1 toxic membrane protein
    MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
    <BLANKLINE>

    See also the SeqRecord's format() method.
    """
    if not format_spec:
        # Follow python convention and default to using __str__
        return str(self)

    from Bio import SeqIO

    # Easy case, can call string-building function directly
    to_string = SeqIO._FormatToString.get(format_spec)
    if to_string is not None:
        return to_string(self)

    # Harder case, make a temp handle instead
    buffer = StringIO()
    try:
        SeqIO.write(self, buffer, format_spec)
    except StreamModeError:
        raise ValueError(
            "Binary format %s cannot be used with SeqRecord format method"
            % format_spec
        ) from None
    return buffer.getvalue()
def __len__(self):
    """Return the length of the sequence.

    For example, using Bio.SeqIO to read in a FASTA nucleotide file:

    >>> from Bio import SeqIO
    >>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta")
    >>> len(record)
    309
    >>> len(record.seq)
    309
    """
    # The record itself has no length; report that of its sequence.
    sequence = self.seq
    return len(sequence)
def __lt__(self, other):
    """Refuse the less-than comparison; always raises NotImplementedError."""
    refusal = NotImplementedError(_NO_SEQRECORD_COMPARISON)
    raise refusal
def __le__(self, other):
    """Refuse the less-than-or-equal-to comparison; always raises NotImplementedError."""
    refusal = NotImplementedError(_NO_SEQRECORD_COMPARISON)
    raise refusal
def __eq__(self, other):
    """Refuse the equal-to comparison; always raises NotImplementedError."""
    refusal = NotImplementedError(_NO_SEQRECORD_COMPARISON)
    raise refusal
def __ne__(self, other):
    """Refuse the not-equal-to comparison; always raises NotImplementedError."""
    refusal = NotImplementedError(_NO_SEQRECORD_COMPARISON)
    raise refusal
def __gt__(self, other):
    """Refuse the greater-than comparison; always raises NotImplementedError."""
    refusal = NotImplementedError(_NO_SEQRECORD_COMPARISON)
    raise refusal
def __ge__(self, other):
    """Refuse the greater-than-or-equal-to comparison; always raises NotImplementedError."""
    refusal = NotImplementedError(_NO_SEQRECORD_COMPARISON)
    raise refusal
def __bool__(self):
    """Boolean value of an instance of this class (True).

    A SeqRecord is always truthy, whatever the length of its sequence.
    This behaviour is for backwards compatibility, since until the
    __len__ method was added, a SeqRecord always evaluated as True.

    Note that in comparison, a Seq object will evaluate to False if it
    has a zero length sequence.

    WARNING: The SeqRecord may in future evaluate to False when its
    sequence is of zero length (in order to better match the Seq
    object behaviour)!
    """
    # Deliberately independent of len(self); see the docstring.
    always_true = True
    return always_true
def __add__(self, other):
    """Join this record with another record, Seq or string on its right.

    The right hand operand may be a SeqRecord, a Seq (or something
    similar such as a MutableSeq), or a plain Python string. For a plain
    string or a Seq (like) object the letters are simply appended to the
    existing data, and the id, name, description, features, annotations
    and dbxrefs are carried over. The per letter annotation cannot be
    extended, so it is dropped:

    >>> from Bio import SeqIO
    >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa")
    >>> print("%s %s" % (record.id, record.seq))
    slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
    >>> print(list(record.letter_annotations))
    ['solexa_quality']

    >>> new = record + "ACT"
    >>> print("%s %s" % (new.id, new.seq))
    slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNNACT
    >>> print(list(new.letter_annotations))
    []

    When both operands are SeqRecord objects the annotation is merged
    where possible; anything ambiguous (e.g. two different names) is
    left out of the new record.

    >>> from Bio import SeqIO
    >>> with open("GenBank/pBAD30.gb") as handle:
    ...     plasmid = SeqIO.read(handle, "gb")
    >>> print("%s %i" % (plasmid.id, len(plasmid)))
    pBAD30 4923

    Now let's cut the plasmid into two pieces, and join them back up the
    other way round (i.e. shift the starting point on this plasmid, have
    a look at the annotated features in the original file to see why this
    particular split point might make sense):

    >>> left = plasmid[:3765]
    >>> right = plasmid[3765:]
    >>> new = right + left
    >>> print("%s %i" % (new.id, len(new)))
    pBAD30 4923
    >>> str(new.seq) == str(right.seq + left.seq)
    True
    >>> len(new.features) == len(left.features) + len(right.features)
    True

    The left and right records carry consistent annotation, so all of
    it survives in the combined SeqRecord:

    >>> new.id == left.id == right.id == plasmid.id
    True
    >>> new.name == left.name == right.name == plasmid.name
    True
    >>> new.description == plasmid.description
    True
    >>> new.annotations == left.annotations == right.annotations
    True
    >>> new.letter_annotations == plasmid.letter_annotations
    True
    >>> new.dbxrefs == left.dbxrefs == right.dbxrefs
    True

    However, we should point out that when we sliced the SeqRecord,
    any annotations dictionary or dbxrefs list entries were lost.
    You can explicitly copy them like this:

    >>> new.annotations = plasmid.annotations.copy()
    >>> new.dbxrefs = plasmid.dbxrefs[:]
    """
    if isinstance(other, SeqRecord):
        # Two records: start from our own features and cross references,
        # then fold in whatever the other record can contribute.
        merged = SeqRecord(
            self.seq + other.seq,
            features=self.features[:],
            dbxrefs=self.dbxrefs[:],
        )

        # Every feature is kept; those of the right hand record move
        # along by the length of the left hand record.
        offset = len(self)
        merged.features.extend(feat._shift(offset) for feat in other.features)

        # Union of the cross references, without repeats.
        for xref in other.dbxrefs:
            if xref not in merged.dbxrefs:
                merged.dbxrefs.append(xref)

        # id, name and description only survive when both sides agree.
        for attr in ("id", "name", "description"):
            mine = getattr(self, attr)
            if mine == getattr(other, attr):
                setattr(merged, attr, mine)

        # General annotation: keep the entries present and equal in both.
        merged.annotations.update(
            {
                key: value
                for key, value in self.annotations.items()
                if key in other.annotations and other.annotations[key] == value
            }
        )

        # Per-letter-annotation found under the same key in both records
        # can be concatenated.
        for key in self.letter_annotations:
            if key in other.letter_annotations:
                merged.letter_annotations[key] = (
                    self.letter_annotations[key] + other.letter_annotations[key]
                )
        return merged

    # Anything else is taken to be a string or a Seq. There are no
    # per-letter-annotations to go with the extra letters, so none are
    # transferred to the new record.
    extended = self.seq + other
    return SeqRecord(
        extended,
        id=self.id,
        name=self.name,
        description=self.description,
        features=self.features[:],
        annotations=self.annotations.copy(),
        dbxrefs=self.dbxrefs[:],
    )
def __radd__(self, other):
    """Join a Seq or string (on the left) with this record (on the right).

    This is the reflected form of addition, used when the left hand
    operand is a Seq object (or similar, e.g. MutableSeq) or a plain
    Python string and the right hand operand is a SeqRecord. The
    features of this record are shifted along by the length of the
    prepended letters, and the per letter annotation is dropped.

    See the __add__ method for more details, but for example:

    >>> from Bio import SeqIO
    >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa")
    >>> print("%s %s" % (record.id, record.seq))
    slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
    >>> print(list(record.letter_annotations))
    ['solexa_quality']

    >>> new = "ACT" + record
    >>> print("%s %s" % (new.id, new.seq))
    slxa_0001_1_0001_01 ACTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
    >>> print(list(new.letter_annotations))
    []
    """
    if not isinstance(other, SeqRecord):
        # Taken to be a string or a Seq; there are no per-letter-annotations
        # for the new leading letters, so none are transferred.
        shift = len(other)
        joined = other + self.seq
        moved_features = [feature._shift(shift) for feature in self.features]
        return SeqRecord(
            joined,
            id=self.id,
            name=self.name,
            description=self.description,
            features=moved_features,
            annotations=self.annotations.copy(),
            dbxrefs=self.dbxrefs[:],
        )
    # SeqRecord + SeqRecord is expected to be served by __add__ instead.
    raise RuntimeError(
        "This should have happened via the __add__ of "
        "the other SeqRecord being added!"
    )
def count(self, sub, start=None, end=None):
    """Count the non-overlapping occurrences of sub in seq[start:end].

    The optional start and end arguments are interpreted as in slice
    notation. The work is handed to the count method of the record's
    sequence, so this behaves like the count method of Python strings.
    """
    sequence = self.seq
    return sequence.count(sub, start, end)
def upper(self):
    """Return a copy of the record whose sequence is in upper case.

    The id, name and description are passed on as they are, and the
    new record gets copies of the dbxrefs, features, annotations and
    letter annotations, e.g.

    >>> from Bio.Seq import Seq
    >>> from Bio.SeqRecord import SeqRecord
    >>> record = SeqRecord(Seq("acgtACGT"), id="Test",
    ...                    description = "Made up for this example")
    >>> record.letter_annotations["phred_quality"] = [1, 2, 3, 4, 5, 6, 7, 8]
    >>> print(record.upper().format("fastq"))
    @Test Made up for this example
    ACGTACGT
    +
    "#$%&'()
    <BLANKLINE>

    Naturally, there is a matching lower method:

    >>> print(record.lower().format("fastq"))
    @Test Made up for this example
    acgtacgt
    +
    "#$%&'()
    <BLANKLINE>
    """
    upper_seq = self.seq.upper()
    # Containers are copied so the new record does not share them with
    # this one (the objects inside them are not duplicated).
    carried_over = {
        "id": self.id,
        "name": self.name,
        "description": self.description,
        "dbxrefs": self.dbxrefs[:],
        "features": self.features[:],
        "annotations": self.annotations.copy(),
        "letter_annotations": self.letter_annotations.copy(),
    }
    return SeqRecord(upper_seq, **carried_over)
def lower(self):
    """Return a copy of the record whose sequence is in lower case.

    The id, name and description are passed on as they are, and the
    new record gets copies of the dbxrefs, features, annotations and
    letter annotations, e.g.

    >>> from Bio import SeqIO
    >>> record = SeqIO.read("Fasta/aster.pro", "fasta")
    >>> print(record.format("fasta"))
    >gi|3298468|dbj|BAA31520.1| SAMIPF
    GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVG
    VTNALVFEIVMTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI
    <BLANKLINE>
    >>> print(record.lower().format("fasta"))
    >gi|3298468|dbj|BAA31520.1| SAMIPF
    gghvnpavtfgafvggnitllrgivyiiaqllgstvaclllkfvtndmavgvfslsagvg
    vtnalvfeivmtfglvytvyataidpkkgslgtiapiaigfivgani
    <BLANKLINE>

    To take a more annotation rich example,

    >>> from Bio import SeqIO
    >>> old = SeqIO.read("EMBL/TRBG361.embl", "embl")
    >>> len(old.features)
    3
    >>> new = old.lower()
    >>> len(old.features) == len(new.features)
    True
    >>> old.annotations["organism"] == new.annotations["organism"]
    True
    >>> old.dbxrefs == new.dbxrefs
    True
    """
    lower_seq = self.seq.lower()
    # Containers are copied so the new record does not share them with
    # this one (the objects inside them are not duplicated).
    carried_over = {
        "id": self.id,
        "name": self.name,
        "description": self.description,
        "dbxrefs": self.dbxrefs[:],
        "features": self.features[:],
        "annotations": self.annotations.copy(),
        "letter_annotations": self.letter_annotations.copy(),
    }
    return SeqRecord(lower_seq, **carried_over)
def isupper(self):
    """Report whether the record's sequence is entirely in upper case.

    Returns True if all ASCII characters in the record's sequence are
    uppercase, and False if there are no cased characters. The check is
    delegated to the isupper method of the record's sequence.
    """
    sequence = self.seq
    return sequence.isupper()
def islower(self):
    """Report whether the record's sequence is entirely in lower case.

    Returns True if all ASCII characters in the record's sequence are
    lowercase, and False if there are no cased characters. The check is
    delegated to the islower method of the record's sequence.
    """
    sequence = self.seq
    return sequence.islower()
def reverse_complement(
    self,
    id=False,
    name=False,
    description=False,
    features=True,
    annotations=False,
    letter_annotations=True,
    dbxrefs=False,
):
    """Return new SeqRecord with reverse complement sequence.

    By default the new record does NOT preserve the sequence identifier,
    name, description, general annotation or database cross-references -
    these are unlikely to apply to the reversed sequence.

    You can specify the returned record's id, name and description as
    strings, or True to keep that of the parent, or False for a default.

    You can specify the returned record's features with a list of
    SeqFeature objects, or True to keep that of the parent, or False to
    omit them. The default is to keep the original features (with the
    strand and locations adjusted).

    You can also specify both the returned record's annotations and
    letter_annotations as dictionaries, True to keep that of the parent,
    or False to omit them. The default is to omit the general
    annotations, but to keep the original per-letter annotations (with
    each one reversed).

    Likewise dbxrefs can be a list, True to copy the parent's list, or
    False (the default) to omit them.

    A ValueError is raised if the record's "molecule_type" annotation
    contains "protein".

    To show what happens to the per-letter annotations, consider an
    example Solexa variant FASTQ file with a single entry, which we'll
    read in as a SeqRecord:

    >>> from Bio import SeqIO
    >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa")
    >>> print("%s %s" % (record.id, record.seq))
    slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
    >>> print(list(record.letter_annotations))
    ['solexa_quality']
    >>> print(record.letter_annotations["solexa_quality"])
    [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5]

    Now take the reverse complement, here we explicitly give a new
    identifier (the old identifier with a suffix):

    >>> rc_record = record.reverse_complement(id=record.id + "_rc")
    >>> print("%s %s" % (rc_record.id, rc_record.seq))
    slxa_0001_1_0001_01_rc NNNNNNACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT

    Notice that the per-letter-annotations have also been reversed,
    although this may not be appropriate for all cases.

    >>> print(rc_record.letter_annotations["solexa_quality"])
    [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]

    Now for the features, we need a different example. Parsing a GenBank
    file is probably the easiest way to get an nice example with features
    in it...

    >>> from Bio import SeqIO
    >>> with open("GenBank/pBAD30.gb") as handle:
    ...     plasmid = SeqIO.read(handle, "gb")
    >>> print("%s %i" % (plasmid.id, len(plasmid)))
    pBAD30 4923
    >>> plasmid.seq
    Seq('GCTAGCGGAGTGTATACTGGCTTACTATGTTGGCACTGATGAGGGTGTCAGTGA...ATG')
    >>> len(plasmid.features)
    13

    Now, let's take the reverse complement of this whole plasmid:

    >>> rc_plasmid = plasmid.reverse_complement(id=plasmid.id+"_rc")
    >>> print("%s %i" % (rc_plasmid.id, len(rc_plasmid)))
    pBAD30_rc 4923
    >>> rc_plasmid.seq
    Seq('CATGGGCAAATATTATACGCAAGGCGACAAGGTGCTGATGCCGCTGGCGATTCA...AGC')
    >>> len(rc_plasmid.features)
    13

    Let's compare the first CDS feature - it has gone from being the
    second feature (index 1) to the second last feature (index -2), its
    strand has changed, and the location switched round.

    >>> print(plasmid.features[1])
    type: CDS
    location: [1081:1960](-)
    qualifiers:
        Key: label, Value: ['araC']
        Key: note, Value: ['araC regulator of the arabinose BAD promoter']
        Key: vntifkey, Value: ['4']
    <BLANKLINE>
    >>> print(rc_plasmid.features[-2])
    type: CDS
    location: [2963:3842](+)
    qualifiers:
        Key: label, Value: ['araC']
        Key: note, Value: ['araC regulator of the arabinose BAD promoter']
        Key: vntifkey, Value: ['4']
    <BLANKLINE>

    You can check this new location, based on the length of the plasmid:

    >>> len(plasmid) - 1081
    3842
    >>> len(plasmid) - 1960
    2963

    Note that if the SeqFeature annotation includes any strand specific
    information (e.g. base changes for a SNP), this information is not
    amended, and would need correction after the reverse complement.

    Note trying to reverse complement a protein SeqRecord raises an
    exception:

    >>> from Bio.Seq import Seq
    >>> from Bio.SeqRecord import SeqRecord
    >>> protein_rec = SeqRecord(Seq("MAIVMGR"), id="Test",
    ...                         annotations={"molecule_type": "protein"})
    >>> protein_rec.reverse_complement()
    Traceback (most recent call last):
       ...
    ValueError: Proteins do not have complements!

    If you have RNA without any U bases, it must be annotated as RNA
    otherwise it will be treated as DNA by default with A mapped to T:

    >>> from Bio.Seq import Seq
    >>> from Bio.SeqRecord import SeqRecord
    >>> rna1 = SeqRecord(Seq("ACG"), id="Test")
    >>> rna2 = SeqRecord(Seq("ACG"), id="Test", annotations={"molecule_type": "RNA"})
    >>> print(rna1.reverse_complement(id="RC", description="unk").format("fasta"))
    >RC unk
    CGT
    <BLANKLINE>
    >>> print(rna2.reverse_complement(id="RC", description="RNA").format("fasta"))
    >RC RNA
    CGU
    <BLANKLINE>

    Also note you can reverse complement a SeqRecord using a MutableSeq:

    >>> from Bio.Seq import MutableSeq
    >>> from Bio.SeqRecord import SeqRecord
    >>> rec = SeqRecord(MutableSeq("ACGT"), id="Test")
    >>> rec.seq[0] = "T"
    >>> print("%s %s" % (rec.id, rec.seq))
    Test TCGT
    >>> rc = rec.reverse_complement(id=True)
    >>> print("%s %s" % (rc.id, rc.seq))
    Test ACGA
    """
    from Bio.Seq import Seq, MutableSeq  # Lazy to avoid circular imports

    molecule_type = self.annotations.get("molecule_type", "")
    if "protein" in molecule_type:
        raise ValueError("Proteins do not have complements!")
    if "RNA" in molecule_type:
        seq = self.seq.reverse_complement_rna(
            inplace=False
        )  # TODO: remove inplace=False
    else:
        # Default to DNA
        seq = self.seq.reverse_complement(
            inplace=False
        )  # TODO: remove inplace=False
    if isinstance(self.seq, MutableSeq):
        # The parent used a MutableSeq; wrap the result in a Seq.
        seq = Seq(seq)
    answer = SeqRecord(seq)
    if isinstance(id, str):
        answer.id = id
    elif id:
        answer.id = self.id
    if isinstance(name, str):
        answer.name = name
    elif name:
        answer.name = self.name
    if isinstance(description, str):
        answer.description = description
    elif description:
        answer.description = self.description
    if isinstance(dbxrefs, list):
        answer.dbxrefs = dbxrefs
    elif dbxrefs:
        # Copy the old dbxrefs
        answer.dbxrefs = self.dbxrefs[:]
    if isinstance(features, list):
        answer.features = features
    elif features:
        # Copy the old features, adjusting location and strand
        length = len(answer)
        answer.features = [f._flip(length) for f in self.features]
        # The old list should have been sorted by start location,
        # reversing it will leave it sorted by what is now the end position,
        # so we need to resort in case of overlapping features.
        # NOTE - In the common case of gene before CDS (and similar) with
        # the exact same locations, this will still maintain gene before CDS

        def key_fun(f):
            """Sort on start position, with unknown starts placed first.

            A tuple is returned because None cannot be ordered against
            an integer (or another None) in Python 3, which would make
            the sort itself raise a TypeError.
            """
            try:
                return (1, int(f.location.start))
            except TypeError:  # Expected for UnknownPosition
                return (0, 0)

        answer.features.sort(key=key_fun)
    if isinstance(annotations, dict):
        answer.annotations = annotations
    elif annotations:
        # Copy the old annotations,
        answer.annotations = self.annotations.copy()
    if isinstance(letter_annotations, dict):
        answer.letter_annotations = letter_annotations
    elif letter_annotations:
        # Copy the old per letter annotations, reversing them
        for key, value in self.letter_annotations.items():
            answer._per_letter_annotations[key] = value[::-1]
    return answer
def translate(
    self,
    # Seq translation arguments:
    table="Standard",
    stop_symbol="*",
    to_stop=False,
    cds=False,
    gap=None,
    # SeqRecord annotation arguments:
    id=False,
    name=False,
    description=False,
    features=False,
    annotations=False,
    letter_annotations=False,
    dbxrefs=False,
):
    """Return new SeqRecord with translated sequence.

    This calls the record's .seq.translate() method (which describes
    the translation related arguments, like table for the genetic code).

    By default the new record does NOT preserve the sequence identifier,
    name, description, general annotation or database cross-references -
    these are unlikely to apply to the translated sequence.

    You can specify the returned record's id, name and description as
    strings, or True to keep that of the parent, or False for a default.

    You can specify the returned record's features with a list of
    SeqFeature objects, or False (default) to omit them.

    You can also specify both the returned record's annotations and
    letter_annotations as dictionaries, True to keep that of the parent
    (annotations only), or False (default) to omit them.

    Likewise dbxrefs can be a list, True to copy the parent's list, or
    False (the default) to omit them.

    In all cases the "molecule_type" entry of the new record's
    annotations is set to "protein". A dictionary given as the
    annotations argument is copied first, so it is not modified.

    A ValueError is raised if this record's "molecule_type" annotation
    is "protein". A TypeError is raised if features or letter_annotations
    is given a true value which is not a list or a dictionary
    respectively (e.g. True), as the parent's values would not apply to
    the translated sequence.

    e.g. Loading a FASTA gene and translating it,

    >>> from Bio import SeqIO
    >>> gene_record = SeqIO.read("Fasta/sweetpea.nu", "fasta")
    >>> print(gene_record.format("fasta"))
    >gi|3176602|gb|U78617.1|LOU78617 Lathyrus odoratus phytochrome A (PHYA) gene, partial cds
    CAGGCTGCGCGGTTTCTATTTATGAAGAACAAGGTCCGTATGATAGTTGATTGTCATGCA
    AAACATGTGAAGGTTCTTCAAGACGAAAAACTCCCATTTGATTTGACTCTGTGCGGTTCG
    ACCTTAAGAGCTCCACATAGTTGCCATTTGCAGTACATGGCTAACATGGATTCAATTGCT
    TCATTGGTTATGGCAGTGGTCGTCAATGACAGCGATGAAGATGGAGATAGCCGTGACGCA
    GTTCTACCACAAAAGAAAAAGAGACTTTGGGGTTTGGTAGTTTGTCATAACACTACTCCG
    AGGTTTGTT
    <BLANKLINE>

    And now translating the record, specifying the new ID and description:

    >>> protein_record = gene_record.translate(table=11,
    ...                                        id="phya",
    ...                                        description="translation")
    >>> print(protein_record.format("fasta"))
    >phya translation
    QAARFLFMKNKVRMIVDCHAKHVKVLQDEKLPFDLTLCGSTLRAPHSCHLQYMANMDSIA
    SLVMAVVVNDSDEDGDSRDAVLPQKKKRLWGLVVCHNTTPRFV
    <BLANKLINE>
    """
    if "protein" == self.annotations.get("molecule_type", ""):
        raise ValueError("Proteins cannot be translated!")
    answer = SeqRecord(
        self.seq.translate(
            table=table, stop_symbol=stop_symbol, to_stop=to_stop, cds=cds, gap=gap
        )
    )
    if isinstance(id, str):
        answer.id = id
    elif id:
        answer.id = self.id
    if isinstance(name, str):
        answer.name = name
    elif name:
        answer.name = self.name
    if isinstance(description, str):
        answer.description = description
    elif description:
        answer.description = self.description
    if isinstance(dbxrefs, list):
        answer.dbxrefs = dbxrefs
    elif dbxrefs:
        # Copy the old dbxrefs
        answer.dbxrefs = self.dbxrefs[:]
    if isinstance(features, list):
        answer.features = features
    elif features:
        # Does not make sense to copy old features as locations wrong
        raise TypeError(f"Unexpected features argument {features!r}")
    if isinstance(annotations, dict):
        # Work on a copy, since molecule_type is set below and that
        # must not alter the dictionary belonging to the caller.
        answer.annotations = annotations.copy()
    elif annotations:
        # Copy the old annotations
        answer.annotations = self.annotations.copy()
    # Set/update to protein:
    answer.annotations["molecule_type"] = "protein"
    if isinstance(letter_annotations, dict):
        answer.letter_annotations = letter_annotations
    elif letter_annotations:
        # Does not make sense to copy these as length now wrong
        raise TypeError(
            f"Unexpected letter_annotations argument {letter_annotations!r}"
        )
    return answer
# When this file is executed directly as a script (rather than imported),
# hand over to the run_doctest helper from Bio._utils.
if __name__ == "__main__":
    from Bio._utils import run_doctest

    run_doctest()