mrna-design-studio / core /sequence_tools /codon_optimizer.py
offtargeteffect's picture
Deploy mRNA Design Studio (Docker SDK)
99f834c verified
Raw
History Blame Contribute Delete
4.87 kB
"""
Codon Optimization — optimize CDS codon usage for target organism.
Demo-level implementation that replaces rare codons with frequent ones
based on the organism's codon usage table.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
from core.analysis.cai import CODON_TABLES, calculate_cai
# Genetic code: DNA codon -> one-letter amino acid ("*" marks a stop codon).
CODON_TABLE = {
    "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
    "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
    "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
    "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
    "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
    "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
    "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
    "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
    "TAT": "Y", "TAC": "Y", "TAA": "*", "TAG": "*",
    "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
    "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
    "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
    "TGT": "C", "TGC": "C", "TGA": "*", "TGG": "W",
    "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
    "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
    "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
}

# Reverse lookup: amino acid -> list of its codons, in CODON_TABLE order.
AA_TO_CODONS: Dict[str, List[str]] = {}
for codon, aa in CODON_TABLE.items():
    if aa not in AA_TO_CODONS:
        AA_TO_CODONS[aa] = []
    AA_TO_CODONS[aa].append(codon)
@dataclass
class OptimizationResult:
    """Outcome of a single codon-optimization run.

    Attributes
    ----------
    original_cds : str
        The input sequence exactly as the caller supplied it.
    optimized_cds : str
        The sequence after codon replacement.
    original_cai : float
        CAI computed for the input sequence.
    optimized_cai : float
        CAI computed for the optimized sequence.
    organism : str
        The organism name the caller requested.
    codons_changed : int
        Number of codons that were replaced.
    total_codons : int
        Number of codons examined.
    changes : List[str]
        One human-readable entry per replaced codon; defaults to a fresh
        empty list for each instance.
    """

    original_cds: str
    optimized_cds: str
    original_cai: float
    optimized_cai: float
    organism: str
    codons_changed: int
    total_codons: int
    changes: List[str] = field(default_factory=list)
def optimize_codons(
    cds: str,
    organism: str = "human",
    min_cai_target: float = 0.8,
    strategy: str = "match_host",
) -> OptimizationResult:
    """
    Optimize codon usage of a CDS for the target organism.

    Every complete codon whose weight in the organism's usage table is below
    0.8 is replaced by a synonymous codon with a strictly higher weight, when
    one exists. Stop codons and unrecognized triplets are left untouched.

    Parameters
    ----------
    cds : str
        Coding sequence. It is upper-cased and ``U`` is converted to ``T``
        before processing. Trailing bases that do not form a complete codon
        are not included in the optimized sequence.
    organism : str
        Target organism key. Case, spaces and dots are ignored, so
        ``"E. coli"`` matches ``"ecoli"``. Unrecognized organisms fall back
        to the human table.
    min_cai_target : float
        Target minimum CAI. Accepted for interface compatibility; the
        current implementation does not read it.
    strategy : str
        "match_host" — replace rare with the highest-weighted synonym.
        "harmonize"  — currently handled exactly like "match_host".
        "balance"    — use the second-highest-weighted synonym (when there
                       is one) to avoid leaning on the single top codon.
        Any other value is treated like "match_host".

    Returns
    -------
    OptimizationResult
        Original and optimized sequences, their CAI values (0.0 when the
        CAI calculation raises), and a log of every replacement.
    """
    seq = cds.upper().replace("U", "T")
    organism_key = organism.lower().replace(" ", "").replace(".", "")

    # Map organism names to the usage table used for them.
    org_map = {
        "human": "human",
        "mouse": "human",  # similar codon bias
        "ecoli": "ecoli",
        "cho": "human",  # similar to human
        "yeast": "human",  # fallback
        "zebrafish": "human",
    }
    table_key = org_map.get(organism_key, "human")
    table = CODON_TABLES.get(table_key, CODON_TABLES["human"])

    # CAI is best-effort: a failure in the scorer must not abort optimization.
    try:
        original_cai = calculate_cai(seq, table_key)
    except Exception:
        original_cai = 0.0

    # Split into complete codons only; a trailing partial codon is ignored.
    usable_length = len(seq) - len(seq) % 3
    codons = [seq[i:i + 3] for i in range(0, usable_length, 3)]
    optimized = list(codons)
    changes: List[str] = []

    for i, codon in enumerate(codons):
        aa = CODON_TABLE.get(codon)
        if aa is None or aa == "*":
            continue  # skip unknown and stop codons

        # Codons missing from the usage table get a neutral weight of 0.5.
        w = table.get(codon, 0.5)
        if w >= 0.8:
            continue  # already a good codon

        # Synonymous codons for this amino acid. Because aa is never "*"
        # here, the list cannot contain a stop codon.
        alternatives = [(c, table.get(c, 0.0)) for c in AA_TO_CODONS.get(aa, [])]
        if not alternatives:
            continue

        if strategy == "balance":
            # Pick a moderately frequent codon (avoid the very top).
            ranked = sorted(alternatives, key=lambda x: x[1], reverse=True)
            best = ranked[min(1, len(ranked) - 1)]
        else:
            # "match_host", "harmonize" and unknown strategies all pick the
            # highest-weighted codon.
            best = max(alternatives, key=lambda x: x[1])

        # Only swap when the replacement is a real improvement.
        if best[0] != codon and best[1] > w:
            optimized[i] = best[0]
            changes.append(
                f"Pos {i + 1}: {codon}->{best[0]} ({aa}, {w:.2f}->{best[1]:.2f})"
            )

    optimized_seq = "".join(optimized)

    try:
        optimized_cai = calculate_cai(optimized_seq, table_key)
    except Exception:
        optimized_cai = 0.0

    return OptimizationResult(
        original_cds=cds,
        optimized_cds=optimized_seq,
        original_cai=original_cai,
        optimized_cai=optimized_cai,
        organism=organism,
        codons_changed=len(changes),
        total_codons=len(codons),
        changes=changes,
    )