File size: 4,597 Bytes
1518606
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import math
from collections import Counter
from itertools import chain


# Sense-codon translation table for genetic code 11 (presumably NCBI
# translation table 11 -- confirm): maps each of the 61 amino-acid-encoding
# DNA codons to a one-letter amino-acid symbol. The three stop codons are
# intentionally absent from this mapping and are kept in _STOP_CODONS_11.
#
# NOTE: insertion order is observable. RSCU() iterates this table to build
# its result dict, and _build_synonymous_codons() iterates it to order the
# codons inside each synonymous group, so avoid reordering entries.
_FORWARD_TABLE_11 = {
    "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
    "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
    "TAT": "Y", "TAC": "Y", "TGT": "C", "TGC": "C", "TGG": "W",
    "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
    "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
    "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
    "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
    "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
    "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
    "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
    "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
    "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
    "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
    "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
    "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
}
# Codons with no amino-acid assignment in the table above; CAI() skips them
# wherever they occur in the scored sequence.
_STOP_CODONS_11 = {"TAA", "TAG", "TGA"}


def _build_synonymous_codons(forward_table):
    """Map every codon to the list of codons encoding the same amino acid.

    Args:
        forward_table: Mapping of codon -> amino-acid symbol.

    Returns:
        A dict with the same keys (and key order) as ``forward_table``. Each
        value is the list of all codons, the key itself included, that share
        the key's amino acid, ordered as they appear in ``forward_table``.
        Codons of the same amino acid share a single list object.
    """
    # First pass: bucket codons by the amino acid they encode.
    groups = {}
    for codon in forward_table:
        amino_acid = forward_table[codon]
        if amino_acid not in groups:
            groups[amino_acid] = []
        groups[amino_acid].append(codon)

    # Second pass: point each codon at its (shared) bucket.
    synonyms_by_codon = {}
    for codon, amino_acid in forward_table.items():
        synonyms_by_codon[codon] = groups[amino_acid]
    return synonyms_by_codon


# Lookup from each sense codon to the shared list of codons (itself included)
# that encode the same amino acid.
_SYNONYMOUS_CODONS_11 = _build_synonymous_codons(_FORWARD_TABLE_11)

# Codons whose amino acid has no alternative codon; CAI() leaves these out of
# its geometric mean.
_NON_SYNONYMOUS_CODONS_11 = {
    codon
    for codon in _SYNONYMOUS_CODONS_11
    if len(_SYNONYMOUS_CODONS_11[codon]) == 1
}


def _require_genetic_code_11(genetic_code):
    """Guard that rejects every genetic code except 11.

    Args:
        genetic_code: Identifier of the requested genetic code.

    Raises:
        NotImplementedError: If ``genetic_code`` does not compare equal to 11.
    """
    if genetic_code != 11:
        message = "This bundled CAI fallback currently supports only genetic code 11."
        raise NotImplementedError(message)


def _geometric_mean(values):
    """Return the geometric mean of a sized collection of numbers.

    The mean is computed in log space: ``exp(mean(log(v)))``.

    Args:
        values: A sized collection (e.g. a list) of non-negative numbers.

    Returns:
        ``nan`` when ``values`` is empty, ``0.0`` when any value is zero,
        otherwise the geometric mean as a float.

    Raises:
        ValueError: If any value is negative (raised by ``math.log``).
    """
    if not values:
        return float("nan")

    log_total = 0
    saw_zero = False
    for value in values:
        if value == 0:
            # log(0) is undefined and math.log raises on it, but a product
            # that contains a zero factor is zero, so the mean is zero too.
            saw_zero = True
        else:
            # Still evaluated when a zero was seen so that negative inputs
            # keep raising ValueError regardless of their position.
            log_total += math.log(value)

    if saw_zero:
        return 0.0
    return math.exp(log_total / len(values))


def RSCU(sequences, genetic_code=11):
    """Compute relative synonymous codon usage (RSCU) for reference sequences.

    Each sequence is split into consecutive triplets (upper-cased) and the
    triplets of all sequences are counted together. A codon of the forward
    table that never occurs is given a pseudo-count of 0.5, which also keeps
    the division below from hitting zero.

    Args:
        sequences: A list or tuple of sequences. Every sequence must be
            non-empty and have a length divisible by three.
        genetic_code: Genetic code identifier; only 11 is supported.

    Returns:
        A dict mapping every codon of the forward table to its count divided
        by the mean count of its synonymous codon group.

    Raises:
        NotImplementedError: If ``genetic_code`` is not 11.
        ValueError: If ``sequences`` is not a list or tuple, or if any
            sequence is empty or not divisible by three.
    """
    _require_genetic_code_11(genetic_code)

    if not isinstance(sequences, (list, tuple)):
        raise ValueError(
            "Be sure to pass a list of sequences, not a single sequence. "
            "To find the RSCU of a single sequence, pass it as a one element list."
        )

    # Validate every sequence up front so nothing is counted on bad input.
    for seq in sequences:
        if not seq:
            raise ValueError("Input sequence cannot be empty")
        if len(seq) % 3:
            raise ValueError("Input sequence not divisible by three")

    # Tally triplets across all sequences.
    counts = Counter()
    for seq in sequences:
        counts.update(
            seq[start : start + 3].upper() for start in range(0, len(seq), 3)
        )

    # Unobserved codons receive a pseudo-count of 0.5.
    for codon in _FORWARD_TABLE_11:
        if not counts[codon]:
            counts[codon] = 0.5

    rscu_by_codon = {}
    for codon in _FORWARD_TABLE_11:
        synonyms = _SYNONYMOUS_CODONS_11[codon]
        group_total = sum(counts[synonym] for synonym in synonyms)
        # Denominator is the mean count over the synonymous group.
        rscu_by_codon[codon] = counts[codon] / ((len(synonyms) ** -1) * group_total)
    return rscu_by_codon


def relative_adaptiveness(sequences=None, RSCUs=None, genetic_code=11):
    """Compute per-codon relative adaptiveness weights.

    Exactly one of ``sequences`` and ``RSCUs`` must be supplied (truthy).
    When ``sequences`` is given, its RSCU values are computed first.

    Args:
        sequences: Reference sequences, in the form accepted by ``RSCU``.
        RSCUs: Mapping of codon -> RSCU value. It must hold a value for every
            codon synonymous with one of its keys.
        genetic_code: Genetic code identifier; only 11 is supported.

    Returns:
        A dict mapping each codon in the RSCU mapping to its RSCU divided by
        the largest RSCU within its synonymous codon group.

    Raises:
        NotImplementedError: If ``genetic_code`` is not 11.
        TypeError: If both or neither of ``sequences`` and ``RSCUs`` are given.
    """
    _require_genetic_code_11(genetic_code)

    # Both truthy or both falsy means the caller did not pick exactly one.
    if bool(sequences) == bool(RSCUs):
        raise TypeError("Must provide either reference sequences or RSCU dictionary")

    if sequences:
        RSCUs = RSCU(sequences, genetic_code=genetic_code)

    weights = {}
    for codon in RSCUs:
        group_best = max(
            RSCUs[synonym] for synonym in _SYNONYMOUS_CODONS_11[codon]
        )
        weights[codon] = RSCUs[codon] / group_best
    return weights


def CAI(sequence, weights=None, RSCUs=None, reference=None, genetic_code=11):
    """Compute the codon adaptation index (CAI) of a coding sequence.

    Exactly one of ``weights``, ``RSCUs`` and ``reference`` must be supplied
    (truthy). The sequence is upper-cased and split into triplets; stop
    codons and codons that are the sole codon for their amino acid are
    skipped, and the geometric mean of the remaining codons' weights is
    returned.

    Args:
        sequence: Non-empty sequence whose length is divisible by three.
        weights: Mapping of codon -> relative adaptiveness weight.
        RSCUs: Mapping of codon -> RSCU value, converted to weights.
        reference: Reference sequences, converted to weights.
        genetic_code: Genetic code identifier; only 11 is supported.

    Returns:
        The CAI as a float; ``nan`` when no codon of the sequence is scored.

    Raises:
        NotImplementedError: If ``genetic_code`` is not 11.
        TypeError: If not exactly one weight source is provided.
        ValueError: If ``sequence`` is empty or not divisible by three.
        KeyError: If a scored codon has no entry in the weights.
    """
    _require_genetic_code_11(genetic_code)

    sources_given = [bool(reference), bool(RSCUs), bool(weights)]
    if sources_given.count(True) != 1:
        raise TypeError(
            "Must provide either reference sequences, or RSCU dictionary, or weights"
        )
    if not sequence:
        raise ValueError("Sequence cannot be empty")
    if len(sequence) % 3:
        raise ValueError("Input sequence not divisible by three")

    normalized = sequence.upper()
    triplets = [
        normalized[start : start + 3] for start in range(0, len(normalized), 3)
    ]

    # Derive weights unless the caller handed them in directly.
    if reference:
        weights = relative_adaptiveness(sequences=reference, genetic_code=genetic_code)
    elif RSCUs:
        weights = relative_adaptiveness(RSCUs=RSCUs, genetic_code=genetic_code)

    scored_weights = []
    for codon in triplets:
        # Stop codons and single-codon amino acids are not scored.
        if codon in _STOP_CODONS_11 or codon in _NON_SYNONYMOUS_CODONS_11:
            continue
        if codon not in weights:
            raise KeyError(
                "Bad weights dictionary passed: missing weight for codon "
                + str(codon)
                + "."
            )
        scored_weights.append(weights[codon])

    cai_value = _geometric_mean(scored_weights)
    return float(cai_value)