Spaces:
Running
Running
File size: 4,597 Bytes
import math
from collections import Counter
from itertools import chain
# Codon -> amino acid table for genetic code 11, written in the compact
# layout where each position cycles through the bases in T, C, A, G order and
# the third position varies fastest.  "*" marks a stop codon; those entries
# are filtered out here and listed separately in _STOP_CODONS_11.
_FORWARD_TABLE_11 = {
    codon: amino_acid
    for codon, amino_acid in zip(
        (
            first + second + third
            for first in "TCAG"
            for second in "TCAG"
            for third in "TCAG"
        ),
        "FFLLSSSSYY**CC*W"  # codons starting with T
        "LLLLPPPPHHQQRRRR"  # codons starting with C
        "IIIMTTTTNNKKSSRR"  # codons starting with A
        "VVVVAAAADDEEGGGG",  # codons starting with G
    )
    if amino_acid != "*"
}

# The three codons that have no amino acid in the table above.
_STOP_CODONS_11 = {"TGA", "TAG", "TAA"}
def _build_synonymous_codons(forward_table):
    """Map every codon to the codons that encode the same amino acid.

    Args:
        forward_table: Mapping of codon -> amino acid.

    Returns:
        A dict with the same keys (and key order) as ``forward_table``.  Each
        value is the list of all codons translating to that codon's amino
        acid, the codon itself included.  Codons of one amino acid share the
        same list object.
    """
    # First pass: collect the codon family of each amino acid.
    families = {}
    for codon, amino_acid in forward_table.items():
        if amino_acid not in families:
            families[amino_acid] = []
        families[amino_acid].append(codon)

    # Second pass: point each codon at its amino acid's family.
    synonyms = {}
    for codon, amino_acid in forward_table.items():
        synonyms[codon] = families[amino_acid]
    return synonyms
# For each codon of genetic code 11, the full family of codons that translate
# to the same amino acid.
_SYNONYMOUS_CODONS_11 = _build_synonymous_codons(_FORWARD_TABLE_11)

# Codons whose family has a single member, i.e. the amino acid has no
# alternative codon (ATG and TGG in this table).
_NON_SYNONYMOUS_CODONS_11 = {
    codon
    for codon in _SYNONYMOUS_CODONS_11
    if len(_SYNONYMOUS_CODONS_11[codon]) == 1
}
def _require_genetic_code_11(genetic_code):
    """Raise unless ``genetic_code`` is 11, the only table bundled here.

    Args:
        genetic_code: Genetic code identifier requested by the caller.

    Raises:
        NotImplementedError: If ``genetic_code`` is anything other than 11.
    """
    if genetic_code == 11:
        return
    raise NotImplementedError(
        "This bundled CAI fallback currently supports only genetic code 11."
    )
def _geometric_mean(values):
    """Return the geometric mean of ``values``.

    Args:
        values: A sized collection (e.g. a list) of non-negative numbers.

    Returns:
        ``nan`` when ``values`` is empty, ``0.0`` when any value is zero,
        otherwise ``exp(mean(log(value)))``.

    Raises:
        ValueError: If a value is negative (raised by ``math.log``).
    """
    if not values:
        return float("nan")
    # math.log(0) raises ValueError, yet the geometric mean of a collection
    # containing a zero is well defined: the product is zero, so the mean is 0.
    if any(value == 0 for value in values):
        return 0.0
    return math.exp(sum(math.log(value) for value in values) / len(values))
def RSCU(sequences, genetic_code=11):
    """Compute relative synonymous codon usage (RSCU) over reference sequences.

    Each sequence is upper-cased and split into consecutive triplets; the
    triplets of all sequences are counted together.  A codon of the forward
    table that never occurs is given a count of 0.5 so that no value is zero.
    The RSCU of a codon is its count divided by the mean count of its
    synonymous family.

    Args:
        sequences: A list or tuple of sequences, each non-empty and with a
            length divisible by three.
        genetic_code: Genetic code identifier; only 11 is supported.

    Returns:
        A dict mapping every codon of the forward table to its RSCU value.

    Raises:
        NotImplementedError: If ``genetic_code`` is not 11.
        ValueError: If ``sequences`` is not a list/tuple, or if any sequence
            is empty or has a length not divisible by three.
    """
    _require_genetic_code_11(genetic_code)

    if not isinstance(sequences, (list, tuple)):
        raise ValueError(
            "Be sure to pass a list of sequences, not a single sequence. "
            "To find the RSCU of a single sequence, pass it as a one element list."
        )

    # Validate every sequence up front, before any counting happens.
    for seq in sequences:
        if not seq:
            raise ValueError("Input sequence cannot be empty")
        if len(seq) % 3 != 0:
            raise ValueError("Input sequence not divisible by three")

    tally = Counter()
    for seq in sequences:
        tally.update(
            seq[start : start + 3].upper() for start in range(0, len(seq), 3)
        )

    # Pseudocount for codons absent from the reference set.
    for codon in _FORWARD_TABLE_11:
        if tally[codon] == 0:
            tally[codon] = 0.5

    usage = {}
    for codon in _FORWARD_TABLE_11:
        family = _SYNONYMOUS_CODONS_11[codon]
        family_total = sum(tally[member] for member in family)
        # Same arithmetic as before (reciprocal of family size times the
        # total) so floating-point results stay bit-identical.
        usage[codon] = tally[codon] / ((len(family) ** -1) * family_total)
    return usage
def relative_adaptiveness(sequences=None, RSCUs=None, genetic_code=11):
    """Compute the relative adaptiveness (weight) of each codon.

    The weight of a codon is its RSCU value divided by the largest RSCU value
    within its synonymous family.

    Args:
        sequences: Reference sequences from which RSCU values are computed.
        RSCUs: Precomputed mapping of codon -> RSCU value.
        genetic_code: Genetic code identifier; only 11 is supported.

    Exactly one of ``sequences`` and ``RSCUs`` must be provided (truthy).

    Returns:
        A dict mapping each codon in the RSCU mapping to its weight.

    Raises:
        NotImplementedError: If ``genetic_code`` is not 11.
        TypeError: If neither or both of ``sequences`` and ``RSCUs`` are given.
    """
    _require_genetic_code_11(genetic_code)

    has_sequences = bool(sequences)
    has_rscus = bool(RSCUs)
    # Both set or both unset: the caller did not pick exactly one source.
    if has_sequences == has_rscus:
        raise TypeError("Must provide either reference sequences or RSCU dictionary")

    if has_sequences:
        RSCUs = RSCU(sequences, genetic_code=genetic_code)

    weights = {}
    for codon, value in RSCUs.items():
        family_best = max(RSCUs[member] for member in _SYNONYMOUS_CODONS_11[codon])
        weights[codon] = value / family_best
    return weights
def CAI(sequence, weights=None, RSCUs=None, reference=None, genetic_code=11):
    """Compute the codon adaptation index (CAI) of a sequence.

    The sequence is upper-cased and split into consecutive triplets.  Stop
    codons and codons without synonymous alternatives are skipped; the result
    is the geometric mean of the weights of the remaining codons.

    Args:
        sequence: Sequence to score; non-empty, length divisible by three.
        weights: Mapping of codon -> relative adaptiveness.
        RSCUs: Mapping of codon -> RSCU value, from which weights are derived.
        reference: Reference sequences from which weights are derived.
        genetic_code: Genetic code identifier; only 11 is supported.

    Exactly one of ``weights``, ``RSCUs`` and ``reference`` must be provided
    (truthy).

    Returns:
        The CAI as a float; ``nan`` when no codon contributes a weight.

    Raises:
        NotImplementedError: If ``genetic_code`` is not 11.
        TypeError: If the number of provided weight sources is not one.
        ValueError: If ``sequence`` is empty or its length is not divisible
            by three.
        KeyError: If a scored codon has no entry in the weights mapping.
    """
    _require_genetic_code_11(genetic_code)

    provided = [bool(reference), bool(RSCUs), bool(weights)]
    if provided.count(True) != 1:
        raise TypeError(
            "Must provide either reference sequences, or RSCU dictionary, or weights"
        )

    if not sequence:
        raise ValueError("Sequence cannot be empty")
    if len(sequence) % 3 != 0:
        raise ValueError("Input sequence not divisible by three")

    sequence = sequence.upper()

    # Derive weights when the caller supplied a reference set or RSCU values.
    if reference:
        weights = relative_adaptiveness(sequences=reference, genetic_code=genetic_code)
    elif RSCUs:
        weights = relative_adaptiveness(RSCUs=RSCUs, genetic_code=genetic_code)

    scored = []
    for start in range(0, len(sequence), 3):
        codon = sequence[start : start + 3]
        # Stop codons and single-codon amino acids are left out of the mean.
        if codon in _NON_SYNONYMOUS_CODONS_11 or codon in _STOP_CODONS_11:
            continue
        if codon not in weights:
            raise KeyError(
                f"Bad weights dictionary passed: missing weight for codon {codon!s}."
            )
        scored.append(weights[codon])

    return float(_geometric_mean(scored))
|