"""
Codon Optimization — optimize CDS codon usage for a target organism.

Demo-level implementation that replaces rare codons with more frequent ones,
based on the target organism's codon usage table.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
from core.analysis.cai import CODON_TABLES, calculate_cai
# Standard genetic code over the DNA alphabet, stored compactly.
#
# Codons are enumerated with the SECOND base as the outermost index, then the
# first base, then the third (each running over T, C, A, G).  The enumeration
# order is significant: it fixes the insertion order of CODON_TABLE and hence
# the per-amino-acid codon order in AA_TO_CODONS, which downstream code relies
# on when breaking ties between equally weighted codons.
_BASES = "TCAG"
_AMINO_ACIDS = (
    "FFLLLLLLIIIMVVVV"  # second base T: TTx, CTx, ATx, GTx
    "SSSSPPPPTTTTAAAA"  # second base C: TCx, CCx, ACx, GCx
    "YY**HHQQNNKKDDEE"  # second base A: TAx, CAx, AAx, GAx
    "CC*WRRRRSSRRGGGG"  # second base G: TGx, CGx, AGx, GGx
)

# Codon -> one-letter amino acid ("*" marks a stop codon).
CODON_TABLE = dict(
    zip(
        (
            first + second + third
            for second in _BASES
            for first in _BASES
            for third in _BASES
        ),
        _AMINO_ACIDS,
    )
)

# Reverse lookup: amino acid -> synonymous codons, in CODON_TABLE order.
AA_TO_CODONS: Dict[str, List[str]] = {}
for codon, aa in CODON_TABLE.items():
    if aa not in AA_TO_CODONS:
        AA_TO_CODONS[aa] = []
    AA_TO_CODONS[aa].append(codon)
@dataclass
class OptimizationResult:
    """Outcome of a single codon-optimization run.

    Attributes
    ----------
    original_cds : str
        The input coding sequence.
    optimized_cds : str
        The coding sequence after codon replacement.
    original_cai : float
        CAI reported for the input sequence.
    optimized_cai : float
        CAI reported for the optimized sequence.
    organism : str
        Target organism the optimization was run for.
    codons_changed : int
        Number of codons that were replaced.
    total_codons : int
        Number of codons considered.
    changes : List[str]
        Human-readable description of each replacement; empty by default.
    """

    # Sequences before and after optimization.
    original_cds: str
    optimized_cds: str

    # Codon Adaptation Index before and after optimization.
    original_cai: float
    optimized_cai: float

    # Target organism for the run.
    organism: str

    # Replacement statistics.
    codons_changed: int
    total_codons: int

    # One entry per replaced codon; a fresh list is created per instance.
    changes: List[str] = field(default_factory=list)
def _safe_cai(seq: str, table_key: str) -> float:
    """Return ``calculate_cai(seq, table_key)``, or 0.0 if that call raises."""
    import logging

    try:
        return calculate_cai(seq, table_key)
    except Exception:
        # Best-effort: the CAI is only reported, so a failure must not abort
        # the optimization -- but leave a trace instead of hiding it.
        logging.getLogger(__name__).warning(
            "CAI calculation failed for table %r; reporting 0.0",
            table_key,
            exc_info=True,
        )
        return 0.0


def optimize_codons(
    cds: str,
    organism: str = "human",
    min_cai_target: float = 0.8,
    strategy: str = "match_host",
) -> OptimizationResult:
    """
    Optimize codon usage of a CDS for the target organism.

    Every codon whose weight in the organism's table is below 0.8 is
    replaced by a synonymous codon with a strictly higher weight, chosen
    according to ``strategy``.  Stop codons and unrecognised triplets
    (e.g. containing ``N``) are left untouched.

    Parameters
    ----------
    cds : str
        Coding DNA (or RNA) sequence.  Whitespace is ignored, the sequence
        is upper-cased and ``U`` is read as ``T``.
    organism : str
        Target organism name; case, spaces and dots are ignored
        (``"E. coli"`` -> ``"ecoli"``).  Unknown names fall back to the
        ``"human"`` table.
    min_cai_target : float
        Target minimum CAI.  NOTE: currently not used by the
        implementation; kept for interface compatibility.
    strategy : str
        "match_host" — replace with the highest-weight synonymous codon.
        "balance" — replace with the second-highest-weight synonymous
        codon (the only one, if the amino acid has a single codon).
        Any other value, including "harmonize", currently behaves like
        "match_host".

    Returns
    -------
    OptimizationResult
        ``original_cds`` is ``cds`` exactly as passed in.  ``optimized_cds``
        is the normalised sequence with replacements applied; an incomplete
        trailing codon (1-2 bases) is carried over unchanged.
        ``total_codons`` counts complete codons only.  Either CAI value is
        0.0 if ``calculate_cai`` raised.
    """
    # Normalise: drop whitespace/newlines (e.g. FASTA-wrapped input) so the
    # reading frame is not shifted, then upper-case and map RNA to DNA.
    seq = "".join(cds.split()).upper().replace("U", "T")
    organism_key = organism.lower().replace(" ", "").replace(".", "")

    # Map organism names to table keys
    org_map = {
        "human": "human",
        "mouse": "human",  # similar codon bias
        "ecoli": "ecoli",
        "cho": "human",  # similar to human
        "yeast": "human",  # fallback
        "zebrafish": "human",
    }
    table_key = org_map.get(organism_key, "human")
    # presumably maps codon -> relative weight in [0, 1]; verify against
    # core.analysis.cai
    table = CODON_TABLES.get(table_key, CODON_TABLES["human"])

    original_cai = _safe_cai(seq, table_key)

    # Split into complete codons; keep any incomplete tail so that the
    # output is not silently truncated.
    coding_len = len(seq) - len(seq) % 3
    codons = [seq[i:i + 3] for i in range(0, coding_len, 3)]
    trailing = seq[coding_len:]

    optimized = list(codons)
    changes: List[str] = []

    for i, codon in enumerate(codons):
        aa = CODON_TABLE.get(codon)
        if aa is None or aa == "*":
            continue  # skip unknown and stop codons

        # A codon missing from the table is treated as mediocre (0.5) ...
        w = table.get(codon, 0.5)
        if w >= 0.8:
            continue  # already a good codon

        # ... whereas a missing alternative is never preferred (0.0).
        alternatives = [(c, table.get(c, 0.0)) for c in AA_TO_CODONS[aa]]

        if strategy == "balance":
            # Pick a moderately frequent codon (avoid the very top)
            ranked = sorted(alternatives, key=lambda x: x[1], reverse=True)
            best = ranked[min(1, len(ranked) - 1)]
        else:
            # "match_host", and for now also "harmonize" / anything else:
            # pick the most frequent codon.
            best = max(alternatives, key=lambda x: x[1])

        # Only swap when it is a real improvement over the current codon.
        if best[0] != codon and best[1] > w:
            optimized[i] = best[0]
            changes.append(
                f"Pos {i + 1}: {codon} → {best[0]} ({aa}, {w:.2f} → {best[1]:.2f})"
            )

    optimized_seq = "".join(optimized) + trailing

    optimized_cai = _safe_cai(optimized_seq, table_key)

    return OptimizationResult(
        original_cds=cds,
        optimized_cds=optimized_seq,
        original_cai=original_cai,
        optimized_cai=optimized_cai,
        organism=organism,
        codons_changed=len(changes),
        total_codons=len(codons),
        changes=changes,
    )
|