import math
from collections import Counter
from itertools import chain

# Codon -> one-letter amino acid for the sense codons of genetic code 11.
# Stop codons are kept separately in _STOP_CODONS_11.
_FORWARD_TABLE_11 = {
    "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
    "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
    "TAT": "Y", "TAC": "Y",
    "TGT": "C", "TGC": "C", "TGG": "W",
    "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
    "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
    "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
    "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
    "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
    "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
    "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
    "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
    "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
    "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
    "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
    "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
}

_STOP_CODONS_11 = {"TAA", "TAG", "TGA"}


def _build_synonymous_codons(forward_table):
    """Map every codon to the list of codons encoding the same amino acid.

    Args:
        forward_table: dict mapping codon -> amino acid.

    Returns:
        dict mapping each codon of ``forward_table`` to the list of all codons
        (itself included) that translate to the same amino acid. Codons of one
        amino acid share the same list object.
    """
    codons_for_amino_acid = {}
    for codon, amino_acid in forward_table.items():
        codons_for_amino_acid.setdefault(amino_acid, []).append(codon)
    return {
        codon: codons_for_amino_acid[amino_acid]
        for codon, amino_acid in forward_table.items()
    }


_SYNONYMOUS_CODONS_11 = _build_synonymous_codons(_FORWARD_TABLE_11)

# Codons whose amino acid has no alternative codon (ATG and TGG in the table
# above); CAI() skips them because they carry no usage-bias information.
_NON_SYNONYMOUS_CODONS_11 = {
    codon for codon, group in _SYNONYMOUS_CODONS_11.items() if len(group) == 1
}


def _require_genetic_code_11(genetic_code):
    """Raise NotImplementedError unless ``genetic_code`` equals 11."""
    if genetic_code != 11:
        raise NotImplementedError(
            "This bundled CAI fallback currently supports only genetic code 11."
        )


def _geometric_mean(values):
    """Return the geometric mean of a sequence of non-negative numbers.

    Args:
        values: sized iterable of numbers (a list in this module).

    Returns:
        NaN for an empty input, 0.0 if any value is exactly zero, otherwise
        ``exp(mean(log(values)))``.

    Raises:
        ValueError: if any value is negative.
    """
    if not values:
        return float("nan")
    if any(value < 0 for value in values):
        raise ValueError("Geometric mean is undefined for negative values")
    # log(0) is undefined, but the geometric mean of a set containing zero is
    # zero; short-circuit instead of failing with "math domain error".
    if any(value == 0 for value in values):
        return 0.0
    return math.exp(sum(math.log(value) for value in values) / len(values))


def RSCU(sequences, genetic_code=11):
    """Compute the relative synonymous codon usage of reference sequences.

    Args:
        sequences: list or tuple of nucleotide sequences; each must be
            non-empty with a length divisible by three. Case is ignored.
        genetic_code: must be 11.

    Returns:
        dict mapping every codon of the forward table to its RSCU value: the
        codon's count divided by the mean count of its synonymous group.

    Raises:
        NotImplementedError: if ``genetic_code`` is not 11.
        ValueError: if ``sequences`` is not a list/tuple, or a sequence is
            empty or not divisible by three.
    """
    _require_genetic_code_11(genetic_code)
    if not isinstance(sequences, (list, tuple)):
        raise ValueError(
            "Be sure to pass a list of sequences, not a single sequence. "
            "To find the RSCU of a single sequence, pass it as a one element list."
        )

    for sequence in sequences:
        if not sequence:
            raise ValueError("Input sequence cannot be empty")
        if len(sequence) % 3 != 0:
            raise ValueError("Input sequence not divisible by three")

    # Lazily split every sequence into upper-cased codons and count them all.
    codon_streams = (
        (sequence[i : i + 3].upper() for i in range(0, len(sequence), 3))
        for sequence in sequences
    )
    counts = Counter(chain.from_iterable(codon_streams))

    # Codons never seen in the reference get a pseudo-count of 0.5 so that no
    # RSCU value (and therefore no derived weight) is zero.
    for codon in _FORWARD_TABLE_11:
        if counts[codon] == 0:
            counts[codon] = 0.5

    result = {}
    for codon in _FORWARD_TABLE_11:
        codon_group = _SYNONYMOUS_CODONS_11[codon]
        group_total = sum(counts[group_codon] for group_codon in codon_group)
        # Denominator is the mean count over the synonymous group.
        result[codon] = counts[codon] / ((len(codon_group) ** -1) * group_total)
    return result


def relative_adaptiveness(sequences=None, RSCUs=None, genetic_code=11):
    """Compute relative adaptiveness weights for codons.

    Exactly one of ``sequences`` and ``RSCUs`` must be provided (non-empty).

    Args:
        sequences: reference sequences, forwarded to :func:`RSCU`.
        RSCUs: dict mapping codon -> RSCU value. Every codon synonymous with a
            key must also be present.
        genetic_code: must be 11.

    Returns:
        dict mapping each codon to its RSCU divided by the largest RSCU in its
        synonymous group.

    Raises:
        NotImplementedError: if ``genetic_code`` is not 11.
        TypeError: if neither or both of ``sequences`` and ``RSCUs`` are given.
    """
    _require_genetic_code_11(genetic_code)
    if sum([bool(sequences), bool(RSCUs)]) != 1:
        raise TypeError("Must provide either reference sequences or RSCU dictionary")

    if sequences:
        RSCUs = RSCU(sequences, genetic_code=genetic_code)

    return {
        codon: value
        / max(RSCUs[group_codon] for group_codon in _SYNONYMOUS_CODONS_11[codon])
        for codon, value in RSCUs.items()
    }


def CAI(sequence, weights=None, RSCUs=None, reference=None, genetic_code=11):
    """Compute the codon adaptation index of a sequence.

    Exactly one of ``weights``, ``RSCUs`` and ``reference`` must be provided
    (non-empty).

    Args:
        sequence: nucleotide sequence, non-empty, length divisible by three.
            Case is ignored.
        weights: dict mapping codon -> relative adaptiveness.
        RSCUs: dict mapping codon -> RSCU, converted to weights.
        reference: list/tuple of reference sequences, converted to weights.
        genetic_code: must be 11.

    Returns:
        float: geometric mean of the weights of the sequence's codons, skipping
        stop codons and codons without synonyms. NaN if no codon remains, 0.0
        if any contributing weight is zero.

    Raises:
        NotImplementedError: if ``genetic_code`` is not 11.
        TypeError: if not exactly one weight source is given.
        ValueError: if ``sequence`` is empty or not divisible by three, or a
            contributing weight is negative.
        KeyError: if a contributing codon has no entry in the weights.
    """
    _require_genetic_code_11(genetic_code)
    if sum([bool(reference), bool(RSCUs), bool(weights)]) != 1:
        raise TypeError(
            "Must provide either reference sequences, or RSCU dictionary, or weights"
        )

    if not sequence:
        raise ValueError("Sequence cannot be empty")
    if len(sequence) % 3 != 0:
        raise ValueError("Input sequence not divisible by three")

    sequence = sequence.upper()
    codons = [sequence[i : i + 3] for i in range(0, len(sequence), 3)]

    if reference:
        weights = relative_adaptiveness(sequences=reference, genetic_code=genetic_code)
    elif RSCUs:
        weights = relative_adaptiveness(RSCUs=RSCUs, genetic_code=genetic_code)

    sequence_weights = []
    for codon in codons:
        # Single-codon amino acids and stop codons do not contribute.
        if codon in _NON_SYNONYMOUS_CODONS_11 or codon in _STOP_CODONS_11:
            continue
        if codon not in weights:
            raise KeyError(
                "Bad weights dictionary passed: missing weight for codon "
                + str(codon)
                + "."
            )
        sequence_weights.append(weights[codon])

    return float(_geometric_mean(sequence_weights))