Spaces:
No application file
No application file
| #!/usr/bin/env python | |
| # Copyright 2002 by Thomas Sicheritz-Ponten and Cecilia Alsmark. | |
| # Copyright 2003 Yair Benita. | |
| # Revisions copyright 2014 by Markus Piotrowski. | |
| # Revisions copyright 2014-2016 by Peter Cock. | |
| # All rights reserved. | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Miscellaneous functions for dealing with sequences.""" | |
| import re | |
| import warnings | |
| from math import pi, sin, cos, log, exp | |
| from Bio.Seq import Seq, complement, complement_rna | |
| from Bio.Data import IUPACData | |
| from Bio.Data.CodonTable import standard_dna_table | |
| from Bio import BiopythonDeprecationWarning | |
| ###################################### | |
| # DNA | |
| ###################### | |
| # { | |
| _gc_values = { | |
| "G": 1.000, | |
| "C": 1.000, | |
| "A": 0.000, | |
| "T": 0.000, | |
| "S": 1.000, # Strong interaction (3 H bonds) (G or C) | |
| "W": 0.000, # Weak interaction (2 H bonds) (A or T) | |
| "M": 0.500, # Amino (A or C) | |
| "R": 0.500, # Purine (A or G) | |
| "Y": 0.500, # Pyrimidine (T or C) | |
| "K": 0.500, # Keto (G or T) | |
| "V": 2 / 3, # Not T or U (A or C or G) | |
| "B": 2 / 3, # Not A (C or G or T) | |
| "H": 1 / 3, # Not G (A or C or T) | |
| "D": 1 / 3, # Not C (A or G or T) | |
| "X": 0.500, # Any nucleotide (A or C or G or T) | |
| "N": 0.500, # Any nucleotide (A or C or G or T) | |
| } | |
| def gc_fraction(seq, ambiguous="remove"): | |
| """Calculate G+C percentage in seq (float between 0 and 1). | |
| Copes with mixed case sequences. Ambiguous Nucleotides in this context are | |
| those different from ATCGSW (S is G or C, and W is A or T). | |
| If ambiguous equals "remove" (default), will only count GCS and will only | |
| include ACTGSW when calculating the sequence length. Equivalent to removing | |
| all characters in the set BDHKMNRVXY before calculating the GC content, as | |
| each of these ambiguous nucleotides can either be in (A,T) or in (C,G). | |
| If ambiguous equals "ignore", it will treat only unambiguous nucleotides (GCS) | |
| as counting towards the GC percentage, but will include all ambiguous and | |
| unambiguous nucleotides when calculating the sequence length. | |
| If ambiguous equals "weighted", will use a "mean" value when counting the | |
| ambiguous characters, for example, G and C will be counted as 1, N and X will | |
| be counted as 0.5, D will be counted as 0.33 etc. See Bio.SeqUtils._gc_values | |
| for a full list. | |
| Will raise a ValueError for any other value of the ambiguous parameter. | |
| >>> from Bio.SeqUtils import gc_fraction | |
| >>> seq = "ACTG" | |
| >>> print(f"GC content of {seq} : {gc_fraction(seq):.2f}") | |
| GC content of ACTG : 0.50 | |
| S and W are ambiguous for the purposes of calculating the GC content. | |
| >>> seq = "ACTGSSSS" | |
| >>> gc = gc_fraction(seq, "remove") | |
| >>> print(f"GC content of {seq} : {gc:.2f}") | |
| GC content of ACTGSSSS : 0.75 | |
| >>> gc = gc_fraction(seq, "ignore") | |
| >>> print(f"GC content of {seq} : {gc:.2f}") | |
| GC content of ACTGSSSS : 0.75 | |
| >>> gc = gc_fraction(seq, "weighted") | |
| >>> print(f"GC content with ambiguous counting: {gc:.2f}") | |
| GC content with ambiguous counting: 0.75 | |
| Some examples with ambiguous nucleotides. | |
| >>> seq = "ACTGN" | |
| >>> gc = gc_fraction(seq, "ignore") | |
| >>> print(f"GC content of {seq} : {gc:.2f}") | |
| GC content of ACTGN : 0.40 | |
| >>> gc = gc_fraction(seq, "weighted") | |
| >>> print(f"GC content with ambiguous counting: {gc:.2f}") | |
| GC content with ambiguous counting: 0.50 | |
| >>> gc = gc_fraction(seq, "remove") | |
| >>> print(f"GC content with ambiguous removing: {gc:.2f}") | |
| GC content with ambiguous removing: 0.50 | |
| Ambiguous nucleotides are also removed from the length of the sequence. | |
| >>> seq = "GDVV" | |
| >>> gc = gc_fraction(seq, "ignore") | |
| >>> print(f"GC content of {seq} : {gc:.2f}") | |
| GC content of GDVV : 0.25 | |
| >>> gc = gc_fraction(seq, "weighted") | |
| >>> print(f"GC content with ambiguous counting: {gc:.4f}") | |
| GC content with ambiguous counting: 0.6667 | |
| >>> gc = gc_fraction(seq, "remove") | |
| >>> print(f"GC content with ambiguous removing: {gc:.2f}") | |
| GC content with ambiguous removing: 1.00 | |
| Note that this will return zero for an empty sequence. | |
| """ | |
| if ambiguous not in ("weighted", "remove", "ignore"): | |
| raise ValueError(f"ambiguous value '{ambiguous}' not recognized") | |
| gc = sum(seq.count(x) for x in "CGScgs") | |
| if ambiguous == "remove": | |
| length = gc + sum(seq.count(x) for x in "ATWatw") | |
| else: | |
| length = len(seq) | |
| if ambiguous == "weighted": | |
| gc += sum( | |
| (seq.count(x) + seq.count(x.lower())) * _gc_values[x] for x in "BDHKMNRVXY" | |
| ) | |
| if length == 0: | |
| return 0 | |
| return gc / length | |
| def GC(seq): | |
| """Calculate G+C content (DEPRECATED). | |
| Use Bio.SeqUtils.gc_fraction instead. | |
| """ | |
| warnings.warn( | |
| "GC is deprecated; please use gc_fraction instead.", | |
| BiopythonDeprecationWarning, | |
| ) | |
| gc = sum(seq.count(x) for x in ["G", "C", "g", "c", "S", "s"]) | |
| try: | |
| return gc * 100.0 / len(seq) | |
| except ZeroDivisionError: | |
| return 0.0 | |
| def GC123(seq): | |
| """Calculate G+C content: total, for first, second and third positions. | |
| Returns a tuple of four floats (percentages between 0 and 100) for the | |
| entire sequence, and the three codon positions. e.g. | |
| >>> from Bio.SeqUtils import GC123 | |
| >>> GC123("ACTGTN") | |
| (40.0, 50.0, 50.0, 0.0) | |
| Copes with mixed case sequences, but does NOT deal with ambiguous | |
| nucleotides. | |
| """ | |
| d = {} | |
| for nt in ["A", "T", "G", "C"]: | |
| d[nt] = [0, 0, 0] | |
| for i in range(0, len(seq), 3): | |
| codon = seq[i : i + 3] | |
| if len(codon) < 3: | |
| codon += " " | |
| for pos in range(0, 3): | |
| for nt in ["A", "T", "G", "C"]: | |
| if codon[pos] == nt or codon[pos] == nt.lower(): | |
| d[nt][pos] += 1 | |
| gc = {} | |
| gcall = 0 | |
| nall = 0 | |
| for i in range(0, 3): | |
| try: | |
| n = d["G"][i] + d["C"][i] + d["T"][i] + d["A"][i] | |
| gc[i] = (d["G"][i] + d["C"][i]) * 100.0 / n | |
| except Exception: # TODO - ValueError? | |
| gc[i] = 0 | |
| gcall = gcall + d["G"][i] + d["C"][i] | |
| nall = nall + n | |
| gcall = 100.0 * gcall / nall | |
| return gcall, gc[0], gc[1], gc[2] | |
| def GC_skew(seq, window=100): | |
| """Calculate GC skew (G-C)/(G+C) for multiple windows along the sequence. | |
| Returns a list of ratios (floats), controlled by the length of the sequence | |
| and the size of the window. | |
| Returns 0 for windows without any G/C by handling zero division errors. | |
| Does NOT look at any ambiguous nucleotides. | |
| """ | |
| # 8/19/03: Iddo: added lowercase | |
| values = [] | |
| for i in range(0, len(seq), window): | |
| s = seq[i : i + window] | |
| g = s.count("G") + s.count("g") | |
| c = s.count("C") + s.count("c") | |
| try: | |
| skew = (g - c) / (g + c) | |
| except ZeroDivisionError: | |
| skew = 0.0 | |
| values.append(skew) | |
| return values | |
| def xGC_skew(seq, window=1000, zoom=100, r=300, px=100, py=100): | |
| """Calculate and plot normal and accumulated GC skew (GRAPHICS !!!).""" | |
| import tkinter | |
| yscroll = tkinter.Scrollbar(orient=tkinter.VERTICAL) | |
| xscroll = tkinter.Scrollbar(orient=tkinter.HORIZONTAL) | |
| canvas = tkinter.Canvas( | |
| yscrollcommand=yscroll.set, xscrollcommand=xscroll.set, background="white" | |
| ) | |
| win = canvas.winfo_toplevel() | |
| win.geometry("700x700") | |
| yscroll.config(command=canvas.yview) | |
| xscroll.config(command=canvas.xview) | |
| yscroll.pack(side=tkinter.RIGHT, fill=tkinter.Y) | |
| xscroll.pack(side=tkinter.BOTTOM, fill=tkinter.X) | |
| canvas.pack(fill=tkinter.BOTH, side=tkinter.LEFT, expand=1) | |
| canvas.update() | |
| X0, Y0 = r + px, r + py | |
| x1, x2, y1, y2 = X0 - r, X0 + r, Y0 - r, Y0 + r | |
| ty = Y0 | |
| canvas.create_text(X0, ty, text="%s...%s (%d nt)" % (seq[:7], seq[-7:], len(seq))) | |
| ty += 20 | |
| canvas.create_text(X0, ty, text=f"GC {GC(seq):3.2f}%") | |
| ty += 20 | |
| canvas.create_text(X0, ty, text="GC Skew", fill="blue") | |
| ty += 20 | |
| canvas.create_text(X0, ty, text="Accumulated GC Skew", fill="magenta") | |
| ty += 20 | |
| canvas.create_oval(x1, y1, x2, y2) | |
| acc = 0 | |
| start = 0 | |
| for gc in GC_skew(seq, window): | |
| r1 = r | |
| acc += gc | |
| # GC skew | |
| alpha = pi - (2 * pi * start) / len(seq) | |
| r2 = r1 - gc * zoom | |
| x1 = X0 + r1 * sin(alpha) | |
| y1 = Y0 + r1 * cos(alpha) | |
| x2 = X0 + r2 * sin(alpha) | |
| y2 = Y0 + r2 * cos(alpha) | |
| canvas.create_line(x1, y1, x2, y2, fill="blue") | |
| # accumulated GC skew | |
| r1 = r - 50 | |
| r2 = r1 - acc | |
| x1 = X0 + r1 * sin(alpha) | |
| y1 = Y0 + r1 * cos(alpha) | |
| x2 = X0 + r2 * sin(alpha) | |
| y2 = Y0 + r2 * cos(alpha) | |
| canvas.create_line(x1, y1, x2, y2, fill="magenta") | |
| canvas.update() | |
| start += window | |
| canvas.configure(scrollregion=canvas.bbox(tkinter.ALL)) | |
| def nt_search(seq, subseq): | |
| """Search for a DNA subseq in seq, return list of [subseq, positions]. | |
| Use ambiguous values (like N = A or T or C or G, R = A or G etc.), | |
| searches only on forward strand. | |
| """ | |
| pattern = "" | |
| for nt in subseq: | |
| value = IUPACData.ambiguous_dna_values[nt] | |
| if len(value) == 1: | |
| pattern += value | |
| else: | |
| pattern += f"[{value}]" | |
| pos = -1 | |
| result = [pattern] | |
| while True: | |
| pos += 1 | |
| s = seq[pos:] | |
| m = re.search(pattern, s) | |
| if not m: | |
| break | |
| pos += int(m.start(0)) | |
| result.append(pos) | |
| return result | |
| ###################################### | |
| # Protein | |
| ###################### | |
| def seq3(seq, custom_map=None, undef_code="Xaa"): | |
| """Convert protein sequence from one-letter to three-letter code. | |
| The single required input argument 'seq' should be a protein sequence using | |
| single letter codes, either as a Python string or as a Seq or MutableSeq | |
| object. | |
| This function returns the amino acid sequence as a string using the three | |
| letter amino acid codes. Output follows the IUPAC standard (including | |
| ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U | |
| for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an | |
| asterisk. Any unknown character (including possible gap characters), | |
| is changed into 'Xaa' by default. | |
| e.g. | |
| >>> from Bio.SeqUtils import seq3 | |
| >>> seq3("MAIVMGRWKGAR*") | |
| 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer' | |
| You can set a custom translation of the codon termination code using the | |
| dictionary "custom_map" argument (which defaults to {'*': 'Ter'}), e.g. | |
| >>> seq3("MAIVMGRWKGAR*", custom_map={"*": "***"}) | |
| 'MetAlaIleValMetGlyArgTrpLysGlyAlaArg***' | |
| You can also set a custom translation for non-amino acid characters, such | |
| as '-', using the "undef_code" argument, e.g. | |
| >>> seq3("MAIVMGRWKGA--R*", undef_code='---') | |
| 'MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer' | |
| If not given, "undef_code" defaults to "Xaa", e.g. | |
| >>> seq3("MAIVMGRWKGA--R*") | |
| 'MetAlaIleValMetGlyArgTrpLysGlyAlaXaaXaaArgTer' | |
| This function was inspired by BioPerl's seq3. | |
| """ | |
| if custom_map is None: | |
| custom_map = {"*": "Ter"} | |
| # not doing .update() on IUPACData dict with custom_map dict | |
| # to preserve its initial state (may be imported in other modules) | |
| threecode = dict( | |
| list(IUPACData.protein_letters_1to3_extended.items()) + list(custom_map.items()) | |
| ) | |
| # We use a default of 'Xaa' for undefined letters | |
| # Note this will map '-' to 'Xaa' which may be undesirable! | |
| return "".join(threecode.get(aa, undef_code) for aa in seq) | |
| def seq1(seq, custom_map=None, undef_code="X"): | |
| """Convert protein sequence from three-letter to one-letter code. | |
| The single required input argument 'seq' should be a protein sequence | |
| using three-letter codes, either as a Python string or as a Seq or | |
| MutableSeq object. | |
| This function returns the amino acid sequence as a string using the one | |
| letter amino acid codes. Output follows the IUPAC standard (including | |
| ambiguous characters "B" for "Asx", "J" for "Xle", "X" for "Xaa", "U" for | |
| "Sel", and "O" for "Pyl") plus "*" for a terminator given the "Ter" code. | |
| Any unknown character (including possible gap characters), is changed | |
| into '-' by default. | |
| e.g. | |
| >>> from Bio.SeqUtils import seq1 | |
| >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer") | |
| 'MAIVMGRWKGAR*' | |
| The input is case insensitive, e.g. | |
| >>> from Bio.SeqUtils import seq1 | |
| >>> seq1("METalaIlEValMetGLYArgtRplysGlyAlaARGTer") | |
| 'MAIVMGRWKGAR*' | |
| You can set a custom translation of the codon termination code using the | |
| dictionary "custom_map" argument (defaulting to {'Ter': '*'}), e.g. | |
| >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla***", custom_map={"***": "*"}) | |
| 'MAIVMGRWKGA*' | |
| You can also set a custom translation for non-amino acid characters, such | |
| as '-', using the "undef_code" argument, e.g. | |
| >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer", undef_code='?') | |
| 'MAIVMGRWKGA??R*' | |
| If not given, "undef_code" defaults to "X", e.g. | |
| >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer") | |
| 'MAIVMGRWKGAXXR*' | |
| """ | |
| if custom_map is None: | |
| custom_map = {"Ter": "*"} | |
| # reverse map of threecode | |
| # upper() on all keys to enable caps-insensitive input seq handling | |
| onecode = {k.upper(): v for k, v in IUPACData.protein_letters_3to1_extended.items()} | |
| # add the given termination codon code and custom maps | |
| onecode.update((k.upper(), v) for k, v in custom_map.items()) | |
| seqlist = [seq[3 * i : 3 * (i + 1)] for i in range(len(seq) // 3)] | |
| return "".join(onecode.get(aa.upper(), undef_code) for aa in seqlist) | |
| ###################################### | |
| # Mixed ??? | |
| ###################### | |
| def molecular_weight( | |
| seq, seq_type="DNA", double_stranded=False, circular=False, monoisotopic=False | |
| ): | |
| """Calculate the molecular mass of DNA, RNA or protein sequences as float. | |
| Only unambiguous letters are allowed. Nucleotide sequences are assumed to | |
| have a 5' phosphate. | |
| Arguments: | |
| - seq: string, Seq, or SeqRecord object. | |
| - seq_type: The default is to assume DNA; override this with a string | |
| "DNA", "RNA", or "protein". | |
| - double_stranded: Calculate the mass for the double stranded molecule? | |
| - circular: Is the molecule circular (has no ends)? | |
| - monoisotopic: Use the monoisotopic mass tables? | |
| >>> print("%0.2f" % molecular_weight("AGC")) | |
| 949.61 | |
| >>> print("%0.2f" % molecular_weight(Seq("AGC"))) | |
| 949.61 | |
| However, it is better to be explicit - for example with strings: | |
| >>> print("%0.2f" % molecular_weight("AGC", "DNA")) | |
| 949.61 | |
| >>> print("%0.2f" % molecular_weight("AGC", "RNA")) | |
| 997.61 | |
| >>> print("%0.2f" % molecular_weight("AGC", "protein")) | |
| 249.29 | |
| """ | |
| try: | |
| seq = seq.seq | |
| except AttributeError: # not a SeqRecord object | |
| pass | |
| seq = "".join(str(seq).split()).upper() # Do the minimum formatting | |
| if seq_type == "DNA": | |
| if monoisotopic: | |
| weight_table = IUPACData.monoisotopic_unambiguous_dna_weights | |
| else: | |
| weight_table = IUPACData.unambiguous_dna_weights | |
| elif seq_type == "RNA": | |
| if monoisotopic: | |
| weight_table = IUPACData.monoisotopic_unambiguous_rna_weights | |
| else: | |
| weight_table = IUPACData.unambiguous_rna_weights | |
| elif seq_type == "protein": | |
| if monoisotopic: | |
| weight_table = IUPACData.monoisotopic_protein_weights | |
| else: | |
| weight_table = IUPACData.protein_weights | |
| else: | |
| raise ValueError(f"Allowed seq_types are DNA, RNA or protein, not {seq_type!r}") | |
| if monoisotopic: | |
| water = 18.010565 | |
| else: | |
| water = 18.0153 | |
| try: | |
| weight = sum(weight_table[x] for x in seq) - (len(seq) - 1) * water | |
| if circular: | |
| weight -= water | |
| except KeyError as e: | |
| raise ValueError( | |
| f"'{e}' is not a valid unambiguous letter for {seq_type}" | |
| ) from None | |
| if double_stranded: | |
| if seq_type == "protein": | |
| raise ValueError("protein sequences cannot be double-stranded") | |
| elif seq_type == "DNA": | |
| seq = complement(seq, inplace=False) # TODO: remove inplace=False | |
| elif seq_type == "RNA": | |
| seq = complement_rna(seq) | |
| weight += sum(weight_table[x] for x in seq) - (len(seq) - 1) * water | |
| if circular: | |
| weight -= water | |
| return weight | |
| def six_frame_translations(seq, genetic_code=1): | |
| """Return pretty string showing the 6 frame translations and GC content. | |
| Nice looking 6 frame translation with GC content - code from xbbtools | |
| similar to DNA Striders six-frame translation | |
| >>> from Bio.SeqUtils import six_frame_translations | |
| >>> print(six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")) | |
| GC_Frame: a:5 t:0 g:8 c:5 | |
| Sequence: auggccauug ... gggccgcuga, 24 nt, 54.17 %GC | |
| <BLANKLINE> | |
| <BLANKLINE> | |
| 1/1 | |
| G H C N G P L | |
| W P L * W A A | |
| M A I V M G R * | |
| auggccauuguaaugggccgcuga 54 % | |
| uaccgguaacauuacccggcgacu | |
| A M T I P R Q | |
| H G N Y H A A S | |
| P W Q L P G S | |
| <BLANKLINE> | |
| <BLANKLINE> | |
| """ # noqa for pep8 W291 trailing whitespace | |
| from Bio.Seq import reverse_complement, reverse_complement_rna, translate | |
| if "u" in seq.lower(): | |
| anti = reverse_complement_rna(seq) | |
| else: | |
| anti = reverse_complement(seq, inplace=False) # TODO: remove inplace=False | |
| comp = anti[::-1] | |
| length = len(seq) | |
| frames = {} | |
| for i in range(0, 3): | |
| fragment_length = 3 * ((length - i) // 3) | |
| frames[i + 1] = translate(seq[i : i + fragment_length], genetic_code) | |
| frames[-(i + 1)] = translate(anti[i : i + fragment_length], genetic_code)[::-1] | |
| # create header | |
| if length > 20: | |
| short = f"{seq[:10]} ... {seq[-10:]}" | |
| else: | |
| short = seq | |
| header = "GC_Frame:" | |
| for nt in ["a", "t", "g", "c"]: | |
| header += " %s:%d" % (nt, seq.count(nt.upper())) | |
| header += "\nSequence: %s, %d nt, %0.2f %%GC\n\n\n" % ( | |
| short.lower(), | |
| length, | |
| GC(seq), | |
| ) | |
| res = header | |
| for i in range(0, length, 60): | |
| subseq = seq[i : i + 60] | |
| csubseq = comp[i : i + 60] | |
| p = i // 3 | |
| res += "%d/%d\n" % (i + 1, i / 3 + 1) | |
| res += " " + " ".join(frames[3][p : p + 20]) + "\n" | |
| res += " " + " ".join(frames[2][p : p + 20]) + "\n" | |
| res += " ".join(frames[1][p : p + 20]) + "\n" | |
| # seq | |
| res += subseq.lower() + "%5d %%\n" % int(GC(subseq)) | |
| res += csubseq.lower() + "\n" | |
| # - frames | |
| res += " ".join(frames[-2][p : p + 20]) + "\n" | |
| res += " " + " ".join(frames[-1][p : p + 20]) + "\n" | |
| res += " " + " ".join(frames[-3][p : p + 20]) + "\n\n" | |
| return res | |
| class CodonAdaptationIndex(dict): | |
| """A codon adaptation index (CAI) implementation. | |
| Implements the codon adaptation index (CAI) described by Sharp and | |
| Li (Nucleic Acids Res. 1987 Feb 11;15(3):1281-95). | |
| """ | |
| def __init__(self, sequences, table=standard_dna_table): | |
| """Generate a codon adaptiveness table from the coding DNA sequences. | |
| This calculates the relative adaptiveness of each codon (w_ij) as | |
| defined by Sharp & Li (Nucleic Acids Research 15(3): 1281-1295 (1987)) | |
| from the provided codon DNA sequences. | |
| Arguments: | |
| - sequences: An iterable over DNA sequences, which may be plain | |
| strings, Seq objects, MutableSeq objects, or SeqRecord | |
| objects. | |
| - table: A Bio.Data.CodonTable.CodonTable object defining the | |
| genetic code. By default, the standard genetic code is | |
| used. | |
| """ | |
| codons = {aminoacid: [] for aminoacid in table.protein_alphabet} | |
| for codon, aminoacid in table.forward_table.items(): | |
| codons[aminoacid].append(codon) | |
| synonymous_codons = tuple(list(codons.values()) + [table.stop_codons]) | |
| # count codon occurrences in the sequences. | |
| counts = {c1 + c2 + c3: 0 for c1 in "ACGT" for c2 in "ACGT" for c3 in "ACGT"} | |
| self.update(counts) # just to ensure that the dictionary is sorted | |
| # iterate over sequence and count the codons | |
| for sequence in sequences: | |
| try: # SeqRecord | |
| name = sequence.id | |
| sequence = sequence.seq | |
| except AttributeError: # str, Seq, or MutableSeq | |
| name = None | |
| sequence = sequence.upper() | |
| for i in range(0, len(sequence), 3): | |
| codon = sequence[i : i + 3] | |
| try: | |
| counts[codon] += 1 | |
| except KeyError: | |
| if name is None: | |
| message = f"illegal codon '{codon}'" | |
| else: | |
| message = f"illegal codon '{codon}' in gene {name}" | |
| raise ValueError(message) from None | |
| # Following the description in the original paper, we use a value | |
| # of 0.5 for codons that do not appear in the reference sequences. | |
| for codon, count in counts.items(): | |
| if count == 0: | |
| counts[codon] = 0.5 | |
| for codons in synonymous_codons: | |
| denominator = max(counts[codon] for codon in codons) | |
| for codon in codons: | |
| self[codon] = counts[codon] / denominator | |
| def calculate(self, sequence): | |
| """Calculate and return the CAI (float) for the provided DNA sequence.""" | |
| cai_value, cai_length = 0, 0 | |
| try: | |
| sequence = sequence.seq # SeqRecord | |
| except AttributeError: | |
| pass # str, Seq, or MutableSeq | |
| sequence = sequence.upper() | |
| for i in range(0, len(sequence), 3): | |
| codon = sequence[i : i + 3] | |
| if codon in ["ATG", "TGG"]: | |
| # Exclude these two codons as their index is always one. | |
| continue | |
| try: | |
| cai_value += log(self[codon]) | |
| except KeyError: | |
| if codon in ["TGA", "TAA", "TAG"]: | |
| # Stop codon, which is valid but may be missing from the index. | |
| continue | |
| raise TypeError(f"illegal codon in sequence: {codon}") from None | |
| else: | |
| cai_length += 1 | |
| return exp(cai_value / cai_length) | |
| def __str__(self): | |
| lines = [] | |
| for codon, value in self.items(): | |
| line = f"{codon}\t{value:.3f}" | |
| lines.append(line) | |
| return "\n".join(lines) + "\n" | |
| if __name__ == "__main__": | |
| from Bio._utils import run_doctest | |
| run_doctest() | |