ColiFormer / CAI.py
saketh11's picture
Vendor local CAI module for startup reliability
1518606 verified
import math
from collections import Counter
from itertools import chain
# Forward translation table for genetic code 11: codon -> one-letter amino acid.
# The residue string holds one symbol per codon, with each of the three codon
# positions cycling through T, C, A, G (first position varies slowest). "*"
# marks a stop codon; stops are left out of the table and listed separately.
_FORWARD_TABLE_11 = {
    codon: residue
    for codon, residue in zip(
        (
            first + second + third
            for first in "TCAG"
            for second in "TCAG"
            for third in "TCAG"
        ),
        "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
    )
    if residue != "*"
}
# Stop codons of genetic code 11 (deliberately absent from the forward table).
_STOP_CODONS_11 = {"TAA", "TAG", "TGA"}
def _build_synonymous_codons(forward_table):
    """Map every codon to the list of codons sharing its amino acid.

    Args:
        forward_table: Mapping of codon -> amino acid.

    Returns:
        A dict keyed by codon, in the iteration order of ``forward_table``.
        Each value is the list of all codons (the key included) that
        translate to the same amino acid, in ``forward_table`` order.
        Synonymous codons share one and the same list object.
    """
    # First pass: bucket codons by the amino acid they encode.
    families = {}
    for triplet, residue in forward_table.items():
        if residue not in families:
            families[residue] = []
        families[residue].append(triplet)

    # Second pass: point each codon at its (shared) family list.
    synonyms = {}
    for triplet, residue in forward_table.items():
        synonyms[triplet] = families[residue]
    return synonyms
# Codon -> list of every codon encoding the same amino acid (itself included).
_SYNONYMOUS_CODONS_11 = _build_synonymous_codons(_FORWARD_TABLE_11)
# Codons whose amino acid has no alternative encoding: a synonym group of
# size one contains only the codon itself, so its sole member is that codon.
_NON_SYNONYMOUS_CODONS_11 = {
    family[0] for family in _SYNONYMOUS_CODONS_11.values() if len(family) == 1
}
def _require_genetic_code_11(genetic_code):
    """Reject any genetic code other than 11.

    Args:
        genetic_code: Genetic code identifier requested by the caller.

    Raises:
        NotImplementedError: If ``genetic_code`` does not compare equal to 11.
    """
    if genetic_code == 11:
        return
    raise NotImplementedError("This bundled CAI fallback currently supports only genetic code 11.")
def _geometric_mean(values):
    """Return the geometric mean of *values*.

    The mean is computed in log space (``exp`` of the average ``log``) so
    that long inputs do not underflow the way a running product would.

    Args:
        values: A sized collection (e.g. a list) of non-negative numbers.

    Returns:
        ``nan`` when ``values`` is empty, ``0.0`` when any value is zero,
        otherwise the geometric mean as a float.

    Raises:
        ValueError: If any value is negative.
    """
    if not values:
        return float("nan")
    if any(value < 0 for value in values):
        raise ValueError("Geometric mean is undefined for negative values")
    # math.log(0) raises "math domain error", yet a product containing a zero
    # factor is exactly zero, so short-circuit instead of crashing.
    if any(value == 0 for value in values):
        return 0.0
    return math.exp(sum(math.log(value) for value in values) / len(values))
def RSCU(sequences, genetic_code=11):
    """Compute a usage ratio for every sense codon across *sequences*.

    For each codon in the forward table the result is the codon's count
    divided by the mean count of its synonymous group. Codons from the table
    that never occur are given a pseudo-count of 0.5 before the ratios are
    taken, so no ratio is zero and no group total is zero.

    Args:
        sequences: A list or tuple of sequences. Each must be non-empty and
            have a length divisible by three; codons are upper-cased before
            counting.
        genetic_code: Genetic code identifier; only 11 is supported.

    Returns:
        A dict mapping each codon of the forward table to its ratio. Triplets
        that are not in the forward table (such as stop codons) do not appear.

    Raises:
        NotImplementedError: If ``genetic_code`` is not 11.
        ValueError: If ``sequences`` is not a list/tuple, or if any sequence
            is empty or its length is not a multiple of three.
    """
    _require_genetic_code_11(genetic_code)

    if not isinstance(sequences, (list, tuple)):
        raise ValueError(
            "Be sure to pass a list of sequences, not a single sequence. "
            "To find the RSCU of a single sequence, pass it as a one element list."
        )

    # Validate every sequence up front, before any counting happens.
    for seq in sequences:
        if not seq:
            raise ValueError("Input sequence cannot be empty")
        if len(seq) % 3 != 0:
            raise ValueError("Input sequence not divisible by three")

    # Tally codons over all sequences together.
    tally = Counter()
    for seq in sequences:
        tally.update(
            seq[start : start + 3].upper() for start in range(0, len(seq), 3)
        )

    # Pseudo-count for table codons that were never observed.
    for codon in _FORWARD_TABLE_11:
        if not tally[codon]:
            tally[codon] = 0.5

    usage = {}
    for codon, family in _SYNONYMOUS_CODONS_11.items():
        family_total = sum(tally[member] for member in family)
        # Same arithmetic form as count / mean-count-of-group.
        usage[codon] = tally[codon] / ((len(family) ** -1) * family_total)
    return usage
def relative_adaptiveness(sequences=None, RSCUs=None, genetic_code=11):
    """Scale each codon's usage ratio by the best ratio in its synonym group.

    Exactly one of ``sequences`` or ``RSCUs`` must be supplied (and truthy).

    Args:
        sequences: Reference sequences; if given, their usage ratios are
            computed with ``RSCU`` first.
        RSCUs: A precomputed dict of codon -> usage ratio.
        genetic_code: Genetic code identifier; only 11 is supported.

    Returns:
        A dict mapping each codon in the usage-ratio dict to its ratio divided
        by the largest ratio among its synonymous codons.

    Raises:
        NotImplementedError: If ``genetic_code`` is not 11.
        TypeError: If both or neither of ``sequences`` and ``RSCUs`` are given.
        KeyError: If ``RSCUs`` holds a codon outside the forward table or
            lacks a ratio for one of a codon's synonyms.
    """
    _require_genetic_code_11(genetic_code)

    # Both truthy or both falsy means the caller gave two sources or none.
    if bool(sequences) == bool(RSCUs):
        raise TypeError("Must provide either reference sequences or RSCU dictionary")

    if sequences:
        RSCUs = RSCU(sequences, genetic_code=genetic_code)

    weights = {}
    for codon, ratio in RSCUs.items():
        family_best = max(RSCUs[member] for member in _SYNONYMOUS_CODONS_11[codon])
        weights[codon] = ratio / family_best
    return weights
def CAI(sequence, weights=None, RSCUs=None, reference=None, genetic_code=11):
    """Score *sequence* as the geometric mean of its per-codon weights.

    Exactly one of ``weights``, ``RSCUs`` or ``reference`` must be supplied
    (and truthy). Codons without synonyms and stop codons are skipped and do
    not contribute to the mean.

    Args:
        sequence: Coding sequence; non-empty, length divisible by three.
            It is upper-cased before being split into codons.
        weights: Dict of codon -> weight, used as given.
        RSCUs: Dict of codon -> usage ratio, converted to weights with
            ``relative_adaptiveness``.
        reference: Reference sequences, converted to weights with
            ``relative_adaptiveness``.
        genetic_code: Genetic code identifier; only 11 is supported.

    Returns:
        The score as a float; ``nan`` when every codon was skipped.

    Raises:
        NotImplementedError: If ``genetic_code`` is not 11.
        TypeError: If zero or more than one weight source is given.
        ValueError: If ``sequence`` is empty or not a multiple of three long.
        KeyError: If a scored codon has no entry in the weights.
    """
    _require_genetic_code_11(genetic_code)

    sources_given = [bool(reference), bool(RSCUs), bool(weights)]
    if sources_given.count(True) != 1:
        raise TypeError(
            "Must provide either reference sequences, or RSCU dictionary, or weights"
        )

    if not sequence:
        raise ValueError("Sequence cannot be empty")
    if len(sequence) % 3 != 0:
        raise ValueError("Input sequence not divisible by three")

    sequence = sequence.upper()
    triplets = [sequence[start : start + 3] for start in range(0, len(sequence), 3)]

    # Derive weights when the caller did not hand them over directly.
    if reference:
        weights = relative_adaptiveness(sequences=reference, genetic_code=genetic_code)
    elif RSCUs:
        weights = relative_adaptiveness(RSCUs=RSCUs, genetic_code=genetic_code)

    # Codons that carry no synonymous choice are excluded from the score.
    unscored = _NON_SYNONYMOUS_CODONS_11 | _STOP_CODONS_11

    scored_weights = []
    for triplet in triplets:
        if triplet in unscored:
            continue
        if triplet not in weights:
            message = (
                "Bad weights dictionary passed: missing weight for codon "
                + str(triplet)
                + "."
            )
            raise KeyError(message)
        scored_weights.append(weights[triplet])

    return float(_geometric_mean(scored_weights))