codon-optimizer / indices.py
joeyisgoed's picture
Fix GC content: add organism-specific targets and stronger penalties
34c14c8 verified
"""
Optimization indices based on GenScript patent WO2020024917A1.
Implements Harmony Index, Codon Context Index, and Outlier Index.
Includes mRNA secondary structure prediction using seqfold.
"""
import math
import re
from collections import Counter
from typing import Dict, List, Tuple, Optional
from codon_tables import CODON_TO_AA, AA_TO_CODONS, get_codon_table
# Try to import seqfold for mRNA structure prediction
try:
from seqfold import dg, fold, dot_bracket
SEQFOLD_AVAILABLE = True
except ImportError:
SEQFOLD_AVAILABLE = False
def cosine_distance(v1: List[float], v2: List[float]) -> float:
"""Calculate cosine distance between two vectors."""
dot_product = sum(a * b for a, b in zip(v1, v2))
norm1 = math.sqrt(sum(a * a for a in v1))
norm2 = math.sqrt(sum(b * b for b in v2))
if norm1 == 0 or norm2 == 0:
return 1.0
similarity = dot_product / (norm1 * norm2)
return 1.0 - similarity
def euclidean_distance(v1: List[float], v2: List[float]) -> float:
"""Calculate normalized Euclidean distance between two vectors."""
if len(v1) != len(v2):
raise ValueError("Vectors must have same length")
squared_diff = sum((a - b) ** 2 for a, b in zip(v1, v2))
max_dist = math.sqrt(len(v1)) # Max possible distance when all elements differ by 1
return math.sqrt(squared_diff) / max_dist if max_dist > 0 else 0
def sequence_to_codons(dna_sequence: str) -> List[str]:
"""Convert DNA sequence to list of codons."""
dna_sequence = dna_sequence.upper().replace(" ", "").replace("\n", "")
codons = []
for i in range(0, len(dna_sequence) - 2, 3):
codons.append(dna_sequence[i:i+3])
return codons
def codons_to_protein(codons: List[str]) -> str:
"""Convert codon list to protein sequence."""
protein = ""
for codon in codons:
aa = CODON_TO_AA.get(codon, 'X')
if aa != '*':
protein += aa
return protein
def protein_to_codons_random(protein: str, codon_table: dict) -> List[str]:
"""Convert protein to codons using weighted random selection based on usage table."""
import random
codons = []
for aa in protein.upper():
if aa not in AA_TO_CODONS:
continue
synonymous = AA_TO_CODONS[aa]
# Weight by codon usage
weights = [codon_table.get(c, 0.1) for c in synonymous]
total = sum(weights)
weights = [w / total for w in weights]
codon = random.choices(synonymous, weights=weights, k=1)[0]
codons.append(codon)
return codons
class HarmonyIndex:
"""
Harmony Index (H) measures how well the codon usage in the candidate sequence
matches the codon usage distribution of highly-expressed genes in the host.
H = 1 - D(F_hs, F_ts)
Where:
- F_hs = codon frequency vector from highly-expressed genes (reference)
- F_ts = codon frequency vector from target sequence
- D = distance function (cosine or euclidean)
"""
def __init__(self, organism: str):
self.codon_table = get_codon_table(organism)
self.reference_freqs = self._build_reference_vector()
def _build_reference_vector(self) -> List[float]:
"""Build reference frequency vector from codon table (59 codons, excluding stops and single-codon AAs)."""
# All codons except stop codons
codons = [c for c in self.codon_table.keys() if CODON_TO_AA.get(c) != '*']
return [self.codon_table.get(c, 0) for c in sorted(codons)]
def _build_sequence_vector(self, codons: List[str]) -> List[float]:
"""Build frequency vector from a codon sequence."""
# Count codons
codon_counts = Counter(codons)
total = len(codons)
if total == 0:
return [0.0] * 61 # 64 - 3 stop codons
# Build vector matching reference order (all codons except stops)
all_codons = [c for c in self.codon_table.keys() if CODON_TO_AA.get(c) != '*']
# Calculate frequencies per amino acid group
freqs = []
for codon in sorted(all_codons):
aa = CODON_TO_AA[codon]
synonymous = AA_TO_CODONS[aa]
aa_total = sum(codon_counts.get(c, 0) for c in synonymous)
if aa_total > 0:
freq = codon_counts.get(codon, 0) / aa_total
else:
freq = 0.0
freqs.append(freq)
return freqs
def calculate(self, codons: List[str]) -> float:
"""
Calculate Harmony Index for a codon sequence.
Returns value between 0 and 1, where 1 is perfect harmony.
"""
target_freqs = self._build_sequence_vector(codons)
distance = cosine_distance(self.reference_freqs, target_freqs)
return 1.0 - distance
class CodonContextIndex:
"""
Codon Context Index (CC) evaluates how well codon pairs in the candidate
sequence match the codon pair preferences of highly-expressed genes.
CC = 1 - D(F_hcc, F_tcc)
Where:
- F_hcc = codon-pair frequency vector from reference
- F_tcc = codon-pair frequency vector from target sequence
"""
def __init__(self, organism: str):
self.organism = organism
self.codon_table = get_codon_table(organism)
# For simplicity, we use a basic model based on individual codon preferences
# A full implementation would use experimental codon pair frequency data
def _calculate_pair_score(self, codon1: str, codon2: str) -> float:
"""Calculate preference score for a codon pair based on individual codon usage."""
freq1 = self.codon_table.get(codon1, 0.1)
freq2 = self.codon_table.get(codon2, 0.1)
# Geometric mean of frequencies as a simple pair score
return math.sqrt(freq1 * freq2)
def calculate(self, codons: List[str]) -> float:
"""
Calculate Codon Context Index.
Returns value between 0 and 1, where 1 is optimal context.
"""
if len(codons) < 2:
return 1.0
total_score = 0.0
max_score = 0.0
for i in range(len(codons) - 1):
pair_score = self._calculate_pair_score(codons[i], codons[i + 1])
total_score += pair_score
max_score += 1.0 # Maximum possible score per pair
if max_score == 0:
return 1.0
return total_score / max_score
# =============================================================================
# mRNA Secondary Structure Analysis
# =============================================================================
class mRNAStructureAnalyzer:
"""
Analyzes mRNA secondary structure using seqfold.
Key metrics:
- 5' end folding energy (critical for translation initiation)
- Full sequence minimum free energy (MFE)
- Hairpin detection in sliding windows
"""
def __init__(self, temperature: float = 37.0):
"""
Initialize analyzer.
Args:
temperature: Temperature in Celsius for folding calculations
"""
self.temperature = temperature
self.available = SEQFOLD_AVAILABLE
def dna_to_rna(self, dna_seq: str) -> str:
"""Convert DNA sequence to RNA (T -> U)."""
return dna_seq.upper().replace('T', 'U')
def calculate_mfe(self, sequence: str, is_rna: bool = False, max_length: int = 200) -> float:
"""
Calculate minimum free energy (MFE) for a sequence.
Args:
sequence: DNA or RNA sequence
is_rna: If True, sequence is already RNA
max_length: Maximum sequence length to analyze (longer sequences are truncated)
Returns:
MFE in kcal/mol (more negative = more stable structure)
"""
if not self.available:
return 0.0
if not is_rna:
sequence = self.dna_to_rna(sequence)
if len(sequence) < 4:
return 0.0
# Truncate long sequences for performance (RNA folding is O(n³))
if len(sequence) > max_length:
sequence = sequence[:max_length]
try:
return dg(sequence, temp=self.temperature)
except Exception:
return 0.0
def calculate_5prime_mfe(self, sequence: str, window_size: int = 50) -> float:
"""
Calculate MFE for the 5' region (critical for translation initiation).
Stable structures at the 5' end can reduce translation by up to 250-fold.
Args:
sequence: DNA sequence
window_size: Number of nucleotides from 5' end to analyze
Returns:
MFE of 5' region in kcal/mol
"""
if len(sequence) < window_size:
return self.calculate_mfe(sequence)
return self.calculate_mfe(sequence[:window_size])
def find_hairpins(self, sequence: str, window_size: int = 60,
step: int = 30, threshold: float = -15.0) -> List[dict]:
"""
Find potential hairpin structures using sliding window analysis.
Args:
sequence: DNA sequence
window_size: Size of sliding window
step: Step size for window
threshold: MFE threshold for flagging (kcal/mol)
Returns:
List of hairpin regions with position and energy
"""
if not self.available:
return []
hairpins = []
rna_seq = self.dna_to_rna(sequence)
for i in range(0, len(rna_seq) - window_size + 1, step):
window = rna_seq[i:i + window_size]
try:
energy = dg(window, temp=self.temperature)
if energy < threshold:
hairpins.append({
'start': i,
'end': i + window_size,
'energy': energy,
'sequence': sequence[i:i + window_size]
})
except Exception:
continue
return hairpins
def get_structure_string(self, sequence: str) -> str:
"""
Get dot-bracket notation of predicted structure.
Args:
sequence: DNA sequence
Returns:
Dot-bracket string (e.g., "(((...)))")
"""
if not self.available:
return ""
rna_seq = self.dna_to_rna(sequence)
try:
structs = fold(rna_seq)
return dot_bracket(structs)
except Exception:
return ""
def analyze(self, sequence: str, fast_mode: bool = True) -> dict:
"""
Perform mRNA structure analysis.
Args:
sequence: DNA sequence
fast_mode: If True, only analyze critical regions (faster)
Returns:
Dictionary with structure metrics
"""
if not self.available:
return {
'available': False,
'full_mfe': 0.0,
'five_prime_mfe': 0.0,
'hairpin_count': 0,
'hairpins': [],
'structure_penalty': 0.0,
}
# Always calculate 5' MFE (most important for translation)
five_prime_mfe = self.calculate_5prime_mfe(sequence)
# For full MFE, only analyze first 200 nt in fast mode
if fast_mode:
full_mfe = self.calculate_mfe(sequence[:min(200, len(sequence))])
# Skip hairpin detection in fast mode for long sequences
if len(sequence) > 300:
hairpins = []
else:
hairpins = self.find_hairpins(sequence, window_size=40, step=40, threshold=-12.0)
else:
full_mfe = self.calculate_mfe(sequence, max_length=500)
hairpins = self.find_hairpins(sequence)
# Calculate structure penalty for optimization
# Penalize stable 5' structures (< -30 kcal/mol is problematic)
five_prime_penalty = 0.0
if five_prime_mfe < -30:
five_prime_penalty = (-30 - five_prime_mfe) / 10 # Scaled penalty
# Penalize multiple hairpins
hairpin_penalty = len(hairpins) * 0.1
# Total structure penalty
structure_penalty = five_prime_penalty + hairpin_penalty
return {
'available': True,
'full_mfe': full_mfe,
'five_prime_mfe': five_prime_mfe,
'hairpin_count': len(hairpins),
'hairpins': hairpins,
'structure_penalty': structure_penalty,
}
class OutlierIndex:
"""
Outlier Index (OI) quantifies negative sequence features that should be minimized.
OI = sum(w_i * f_i(x)) for various adverse features
Features include:
- GC content deviation from optimal range (organism-specific)
- Repetitive sequences
- Rare codon clusters
- mRNA secondary structure (optional - expensive to compute)
- Adverse motifs (restriction sites, splice sites, etc.)
"""
# Weights for different features - GC penalties significantly increased
WEIGHTS = {
'gc_content': 8.0, # Increased from 2.0 - global GC penalty
'gc_local': 4.0, # Increased from 1.5 - local window GC penalty
'gc_extreme': 10.0, # NEW - exponential penalty for extreme GC
'rare_codons': 2.0,
'repeats': 1.0,
'homopolymers': 1.5,
'motifs': 2.0,
'mrna_structure': 2.5,
}
# Organism-specific optimal GC content ranges
# Based on typical GC content of highly-expressed genes
GC_TARGETS = {
"Escherichia coli K12": (0.45, 0.55), # E. coli prefers moderate GC
"Homo sapiens": (0.45, 0.60), # Human - moderate to slightly high
"CHO (Chinese Hamster Ovary)": (0.45, 0.60),
"Saccharomyces cerevisiae": (0.35, 0.45), # Yeast prefers lower GC
"Mus musculus": (0.45, 0.60), # Mouse similar to human
"Insect (Spodoptera frugiperda)": (0.40, 0.55),
"Pichia pastoris": (0.38, 0.48),
"Bacillus subtilis": (0.40, 0.50),
"Drosophila melanogaster": (0.45, 0.58),
"Caenorhabditis elegans": (0.35, 0.45),
}
DEFAULT_GC_TARGET = (0.40, 0.55) # Default if organism not found
# Adverse motifs to avoid
ADVERSE_MOTIFS = [
# Shine-Dalgarno sequences (prokaryotic RBS)
'AGGAGG', 'GGAGG', 'AGGAG', 'GAGG',
# Common restriction sites
'GAATTC', # EcoRI
'GGATCC', # BamHI
'AAGCTT', # HindIII
'CTGCAG', # PstI
'GCGGCCGC', # NotI
'CATATG', # NdeI
# Splice site consensus
'GGTAAG', 'GGTGAG',
# Poly-A signals
'AATAAA', 'ATTAAA',
# AU-rich elements
'ATTTA',
]
def __init__(self, organism: str, excluded_sites: List[str] = None,
include_mrna_structure: bool = False):
"""
Initialize OutlierIndex.
Args:
organism: Target host organism
excluded_sites: Restriction sites to avoid
include_mrna_structure: If True, include mRNA folding in calculations
(expensive - set False for optimization iterations)
"""
self.organism = organism
self.codon_table = get_codon_table(organism)
self.rare_threshold = 0.10 # Codons with frequency < 10% considered rare
self.excluded_sites = excluded_sites or []
self.include_mrna_structure = include_mrna_structure
self.mrna_analyzer = mRNAStructureAnalyzer() if include_mrna_structure else None
# Get organism-specific GC target range
self.gc_target = self.GC_TARGETS.get(organism, self.DEFAULT_GC_TARGET)
self.gc_min, self.gc_max = self.gc_target
def _gc_content_penalty(self, sequence: str) -> float:
"""Calculate penalty for GC content outside organism-specific optimal range."""
if len(sequence) == 0:
return 0.0
gc_count = sequence.count('G') + sequence.count('C')
gc_percent = gc_count / len(sequence)
# Use organism-specific optimal range
if self.gc_min <= gc_percent <= self.gc_max:
return 0.0
elif gc_percent < self.gc_min:
return (self.gc_min - gc_percent) * 3 # Increased multiplier
else:
return (gc_percent - self.gc_max) * 3 # Increased multiplier
def _gc_extreme_penalty(self, sequence: str) -> float:
"""
Calculate exponential penalty for extreme GC content.
Heavily penalizes GC outside 30-70% range.
"""
if len(sequence) == 0:
return 0.0
gc_count = sequence.count('G') + sequence.count('C')
gc_percent = gc_count / len(sequence)
# No penalty within acceptable range (organism target ± 15%)
soft_min = max(0.30, self.gc_min - 0.10)
soft_max = min(0.70, self.gc_max + 0.10)
if soft_min <= gc_percent <= soft_max:
return 0.0
# Exponential penalty outside soft range
if gc_percent < soft_min:
deviation = soft_min - gc_percent
else:
deviation = gc_percent - soft_max
# Exponential: penalty grows rapidly with deviation
return (deviation * 10) ** 2
def _gc_local_penalty(self, sequence: str, window_size: int = 60) -> float:
"""Calculate penalty for local GC content outside acceptable range."""
if len(sequence) < window_size:
return 0.0
# Local range is organism target ± 10%
local_min = max(0.25, self.gc_min - 0.10)
local_max = min(0.75, self.gc_max + 0.10)
penalties = []
for i in range(0, len(sequence) - window_size + 1, 10):
window = sequence[i:i + window_size]
gc_count = window.count('G') + window.count('C')
gc_percent = gc_count / len(window)
if gc_percent < local_min:
penalties.append((local_min - gc_percent) * 2)
elif gc_percent > local_max:
penalties.append((gc_percent - local_max) * 2)
return sum(penalties) / max(len(penalties), 1) if penalties else 0.0
def _rare_codon_penalty(self, codons: List[str]) -> float:
"""Calculate penalty for rare codon usage."""
if len(codons) == 0:
return 0.0
rare_count = 0
for codon in codons:
freq = self.codon_table.get(codon, 0)
if freq < self.rare_threshold:
rare_count += 1
return rare_count / len(codons)
def _repeat_penalty(self, sequence: str) -> float:
"""Calculate penalty for direct repeats (>= 8bp)."""
penalty = 0.0
# Check for repeats
for length in range(8, min(20, len(sequence) // 2)):
for i in range(len(sequence) - 2 * length + 1):
pattern = sequence[i:i + length]
if pattern in sequence[i + length:]:
penalty += 0.1
return min(penalty, 1.0)
def _homopolymer_penalty(self, sequence: str) -> float:
"""Calculate penalty for homopolymer runs (>= 5 same nucleotides)."""
penalty = 0.0
for base in 'ATGC':
runs = re.findall(f'{base}{{5,}}', sequence)
for run in runs:
penalty += (len(run) - 4) * 0.1
return min(penalty, 1.0)
def _motif_penalty(self, sequence: str) -> float:
"""Calculate penalty for adverse motifs."""
penalty = 0.0
all_motifs = self.ADVERSE_MOTIFS + self.excluded_sites
for motif in all_motifs:
count = sequence.count(motif)
penalty += count * 0.2
return min(penalty, 1.0)
def _mrna_structure_penalty(self, sequence: str) -> float:
"""Calculate penalty for problematic mRNA secondary structures."""
if not self.include_mrna_structure or self.mrna_analyzer is None:
return 0.0
if not self.mrna_analyzer.available:
return 0.0
analysis = self.mrna_analyzer.analyze(sequence)
return analysis['structure_penalty']
def calculate(self, codons: List[str]) -> float:
"""
Calculate Outlier Index for a codon sequence.
Returns value >= 0, where 0 is optimal (no adverse features).
Lower is better.
"""
sequence = ''.join(codons)
penalties = {
'gc_content': self._gc_content_penalty(sequence),
'gc_extreme': self._gc_extreme_penalty(sequence),
'gc_local': self._gc_local_penalty(sequence),
'rare_codons': self._rare_codon_penalty(codons),
'repeats': self._repeat_penalty(sequence),
'homopolymers': self._homopolymer_penalty(sequence),
'motifs': self._motif_penalty(sequence),
'mrna_structure': self._mrna_structure_penalty(sequence),
}
total = sum(self.WEIGHTS[k] * v for k, v in penalties.items())
return total
def calculate_detailed(self, codons: List[str]) -> dict:
"""
Calculate Outlier Index with detailed breakdown.
Returns dictionary with all penalty components and mRNA structure analysis.
"""
sequence = ''.join(codons)
penalties = {
'gc_content': self._gc_content_penalty(sequence),
'gc_extreme': self._gc_extreme_penalty(sequence),
'gc_local': self._gc_local_penalty(sequence),
'rare_codons': self._rare_codon_penalty(codons),
'repeats': self._repeat_penalty(sequence),
'homopolymers': self._homopolymer_penalty(sequence),
'motifs': self._motif_penalty(sequence),
'mrna_structure': self._mrna_structure_penalty(sequence),
}
total = sum(self.WEIGHTS[k] * v for k, v in penalties.items())
# Get detailed mRNA analysis
mrna_analysis = self.mrna_analyzer.analyze(sequence)
return {
'total': total,
'penalties': penalties,
'weights': self.WEIGHTS,
'mrna_structure': mrna_analysis,
}
def calculate_cai(codons: List[str], codon_table: dict) -> float:
"""
Calculate Codon Adaptation Index (CAI).
CAI is the geometric mean of relative adaptiveness values.
"""
if len(codons) == 0:
return 0.0
# Calculate relative adaptiveness for each codon
w_values = []
for codon in codons:
aa = CODON_TO_AA.get(codon)
if aa is None or aa == '*':
continue
# Get max frequency among synonymous codons
synonymous = AA_TO_CODONS[aa]
max_freq = max(codon_table.get(c, 0.01) for c in synonymous)
# Relative adaptiveness
freq = codon_table.get(codon, 0.01)
w = freq / max_freq if max_freq > 0 else 0.01
w_values.append(max(w, 0.01)) # Avoid log(0)
if len(w_values) == 0:
return 0.0
# Geometric mean
log_sum = sum(math.log(w) for w in w_values)
cai = math.exp(log_sum / len(w_values))
return cai
def calculate_gc_content(sequence: str) -> float:
"""Calculate GC content as percentage."""
if len(sequence) == 0:
return 0.0
gc_count = sequence.count('G') + sequence.count('C')
return (gc_count / len(sequence)) * 100