File size: 4,869 Bytes
99f834c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
"""
Codon Optimization — optimize CDS codon usage for target organism.

Demo-level implementation that replaces rare codons with frequent ones
based on the organism's codon usage table.
"""
from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

from core.analysis.cai import CODON_TABLES, calculate_cai


# Standard genetic code: DNA codon -> one-letter amino acid ("*" marks a stop).
# Entry order is significant: AA_TO_CODONS below inherits it, and code that
# picks the "first best" synonymous codon breaks ties by this order.
CODON_TABLE = {
    # --- second base T ---
    "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
    "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
    "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
    "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
    # --- second base C ---
    "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
    "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
    "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
    "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
    # --- second base A ---
    "TAT": "Y", "TAC": "Y", "TAA": "*", "TAG": "*",
    "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
    "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
    "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
    # --- second base G ---
    "TGT": "C", "TGC": "C", "TGA": "*", "TGG": "W",
    "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
    "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
    "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
}

# Inverse lookup: amino acid -> its synonymous codons, listed in the order
# they appear in CODON_TABLE.
AA_TO_CODONS: Dict[str, List[str]] = {}
for codon, aa in CODON_TABLE.items():
    if aa in AA_TO_CODONS:
        AA_TO_CODONS[aa].append(codon)
    else:
        AA_TO_CODONS[aa] = [codon]


@dataclass
class OptimizationResult:
    """Outcome of a single codon-optimization run.

    Attributes
    ----------
    original_cds : str
        The input coding sequence.
    optimized_cds : str
        The sequence after codon substitution.
    original_cai : float
        CAI reported for the input sequence.
    optimized_cai : float
        CAI reported for the optimized sequence.
    organism : str
        Target organism the optimization was run for.
    codons_changed : int
        Number of codons that were substituted.
    total_codons : int
        Number of codons that were examined.
    changes : List[str]
        One human-readable entry per substitution; empty by default.
    """

    # Sequences before and after optimization.
    original_cds: str
    optimized_cds: str

    # Codon Adaptation Index before and after optimization.
    original_cai: float
    optimized_cai: float

    # Run metadata and summary counters.
    organism: str
    codons_changed: int
    total_codons: int

    # A fresh list per instance, so results never share a change log.
    changes: List[str] = field(default_factory=list)


# Codons whose weight in the usage table is at or above this are left untouched.
_GOOD_CODON_WEIGHT = 0.8
# Weight assumed for a sequence codon that the usage table does not list.
_UNLISTED_CODON_WEIGHT = 0.5

# Normalized organism name -> key of the codon usage table to use.
_ORGANISM_TABLE_KEYS = {
    "human": "human",
    "mouse": "human",  # similar codon bias
    "ecoli": "ecoli",
    "cho": "human",    # similar to human
    "yeast": "human",  # fallback
    "zebrafish": "human",
}


def _safe_cai(seq: str, table_key: str) -> float:
    """Return ``calculate_cai(seq, table_key)``, or 0.0 if that call raises."""
    try:
        return calculate_cai(seq, table_key)
    except Exception:
        # Deliberate best-effort: a CAI failure must not abort the optimization.
        return 0.0


def _pick_replacement(aa: str, table: Dict[str, float], strategy: str) -> Optional[str]:
    """Return the preferred synonymous codon for *aa*, or None if it has none.

    ``table`` is used only through ``table.get(codon, 0.0)``; it is assumed
    to map codons to numeric weights. Ties keep AA_TO_CODONS order.
    """
    candidates = AA_TO_CODONS.get(aa, [])
    if not candidates:
        return None

    def weight_of(candidate: str) -> float:
        return table.get(candidate, 0.0)

    if strategy == "balance":
        # Runner-up by weight; falls back to the only codon when there is one.
        ranked = sorted(candidates, key=weight_of, reverse=True)
        return ranked[min(1, len(ranked) - 1)]

    # "match_host", "harmonize" and any other value: highest-weight codon.
    return max(candidates, key=weight_of)


def optimize_codons(
    cds: str,
    organism: str = "human",
    min_cai_target: float = 0.8,
    strategy: str = "match_host",
) -> OptimizationResult:
    """
    Optimize codon usage of a CDS for the target organism.

    The sequence is stripped of whitespace, upper-cased and converted from
    RNA to DNA letters (U -> T), then read in frame from the first base.
    Each sense codon whose table weight is below 0.8 is replaced by the
    codon chosen by ``strategy`` when that codon has a strictly higher
    weight. Stop codons and unrecognized triplets are never modified.

    Parameters
    ----------
    cds : str
        Coding DNA (or RNA) sequence.
    organism : str
        Target organism name; case, spaces and dots are ignored.
        Names without a mapping fall back to the human table.
    min_cai_target : float
        Target minimum CAI. Accepted for API compatibility; this
        implementation does not consult it.
    strategy : str
        "match_host" — replace rare codons with the most frequent one.
        "balance" — use the second most frequent codon to avoid
        depending on the single most common one.
        "harmonize" — currently behaves like "match_host", as does any
        unrecognized value.

    Returns
    -------
    OptimizationResult
        ``original_cds`` is the input exactly as given. ``optimized_cds``
        contains whole codons only: one or two trailing bases that do not
        form a codon are dropped. Either CAI value is 0.0 when its
        calculation fails. Each ``changes`` entry reads
        ``"Pos N: OLD -> NEW (AA, old_weight -> new_weight)"`` with a
        1-based codon position.
    """
    # Drop whitespace first so line breaks or spaces cannot shift the frame.
    seq = "".join(cds.split()).upper().replace("U", "T")
    organism_key = organism.lower().replace(" ", "").replace(".", "")

    table_key = _ORGANISM_TABLE_KEYS.get(organism_key, "human")
    table = CODON_TABLES.get(table_key, CODON_TABLES["human"])

    original_cai = _safe_cai(seq, table_key)

    # Whole codons only; a trailing partial codon is ignored.
    codons = [seq[i:i + 3] for i in range(0, len(seq) - len(seq) % 3, 3)]
    optimized = list(codons)
    changes: List[str] = []

    # The preferred codon depends only on the amino acid, so compute it once.
    preferred: Dict[str, Optional[str]] = {}

    for i, codon in enumerate(codons):
        aa = CODON_TABLE.get(codon)
        if aa is None or aa == "*":
            continue  # skip unknown and stop codons

        w = table.get(codon, _UNLISTED_CODON_WEIGHT)
        if w >= _GOOD_CODON_WEIGHT:
            continue  # already a good codon

        if aa not in preferred:
            preferred[aa] = _pick_replacement(aa, table, strategy)
        best = preferred[aa]
        if best is None:
            continue

        best_w = table.get(best, 0.0)
        # Substitute only when it is a real, strictly better change.
        if best != codon and best_w > w:
            optimized[i] = best
            changes.append(
                f"Pos {i + 1}: {codon} -> {best} ({aa}, {w:.2f} -> {best_w:.2f})"
            )

    optimized_seq = "".join(optimized)
    optimized_cai = _safe_cai(optimized_seq, table_key)

    return OptimizationResult(
        original_cds=cds,
        optimized_cds=optimized_seq,
        original_cai=original_cai,
        optimized_cai=optimized_cai,
        organism=organism,
        codons_changed=len(changes),
        total_codons=len(codons),
        changes=changes,
    )