""" Optimization indices based on GenScript patent WO2020024917A1. Implements Harmony Index, Codon Context Index, and Outlier Index. Includes mRNA secondary structure prediction using seqfold. """ import math import re from collections import Counter from typing import Dict, List, Tuple, Optional from codon_tables import CODON_TO_AA, AA_TO_CODONS, get_codon_table # Try to import seqfold for mRNA structure prediction try: from seqfold import dg, fold, dot_bracket SEQFOLD_AVAILABLE = True except ImportError: SEQFOLD_AVAILABLE = False def cosine_distance(v1: List[float], v2: List[float]) -> float: """Calculate cosine distance between two vectors.""" dot_product = sum(a * b for a, b in zip(v1, v2)) norm1 = math.sqrt(sum(a * a for a in v1)) norm2 = math.sqrt(sum(b * b for b in v2)) if norm1 == 0 or norm2 == 0: return 1.0 similarity = dot_product / (norm1 * norm2) return 1.0 - similarity def euclidean_distance(v1: List[float], v2: List[float]) -> float: """Calculate normalized Euclidean distance between two vectors.""" if len(v1) != len(v2): raise ValueError("Vectors must have same length") squared_diff = sum((a - b) ** 2 for a, b in zip(v1, v2)) max_dist = math.sqrt(len(v1)) # Max possible distance when all elements differ by 1 return math.sqrt(squared_diff) / max_dist if max_dist > 0 else 0 def sequence_to_codons(dna_sequence: str) -> List[str]: """Convert DNA sequence to list of codons.""" dna_sequence = dna_sequence.upper().replace(" ", "").replace("\n", "") codons = [] for i in range(0, len(dna_sequence) - 2, 3): codons.append(dna_sequence[i:i+3]) return codons def codons_to_protein(codons: List[str]) -> str: """Convert codon list to protein sequence.""" protein = "" for codon in codons: aa = CODON_TO_AA.get(codon, 'X') if aa != '*': protein += aa return protein def protein_to_codons_random(protein: str, codon_table: dict) -> List[str]: """Convert protein to codons using weighted random selection based on usage table.""" import random codons = [] for aa in protein.upper(): if aa not in AA_TO_CODONS: continue synonymous = AA_TO_CODONS[aa] # Weight by codon usage weights = [codon_table.get(c, 0.1) for c in synonymous] total = sum(weights) weights = [w / total for w in weights] codon = random.choices(synonymous, weights=weights, k=1)[0] codons.append(codon) return codons class HarmonyIndex: """ Harmony Index (H) measures how well the codon usage in the candidate sequence matches the codon usage distribution of highly-expressed genes in the host. H = 1 - D(F_hs, F_ts) Where: - F_hs = codon frequency vector from highly-expressed genes (reference) - F_ts = codon frequency vector from target sequence - D = distance function (cosine or euclidean) """ def __init__(self, organism: str): self.codon_table = get_codon_table(organism) self.reference_freqs = self._build_reference_vector() def _build_reference_vector(self) -> List[float]: """Build reference frequency vector from codon table (59 codons, excluding stops and single-codon AAs).""" # All codons except stop codons codons = [c for c in self.codon_table.keys() if CODON_TO_AA.get(c) != '*'] return [self.codon_table.get(c, 0) for c in sorted(codons)] def _build_sequence_vector(self, codons: List[str]) -> List[float]: """Build frequency vector from a codon sequence.""" # Count codons codon_counts = Counter(codons) total = len(codons) if total == 0: return [0.0] * 61 # 64 - 3 stop codons # Build vector matching reference order (all codons except stops) all_codons = [c for c in self.codon_table.keys() if CODON_TO_AA.get(c) != '*'] # Calculate frequencies per amino acid group freqs = [] for codon in sorted(all_codons): aa = CODON_TO_AA[codon] synonymous = AA_TO_CODONS[aa] aa_total = sum(codon_counts.get(c, 0) for c in synonymous) if aa_total > 0: freq = codon_counts.get(codon, 0) / aa_total else: freq = 0.0 freqs.append(freq) return freqs def calculate(self, codons: List[str]) -> float: """ Calculate Harmony Index for a codon sequence. Returns value between 0 and 1, where 1 is perfect harmony. """ target_freqs = self._build_sequence_vector(codons) distance = cosine_distance(self.reference_freqs, target_freqs) return 1.0 - distance class CodonContextIndex: """ Codon Context Index (CC) evaluates how well codon pairs in the candidate sequence match the codon pair preferences of highly-expressed genes. CC = 1 - D(F_hcc, F_tcc) Where: - F_hcc = codon-pair frequency vector from reference - F_tcc = codon-pair frequency vector from target sequence """ def __init__(self, organism: str): self.organism = organism self.codon_table = get_codon_table(organism) # For simplicity, we use a basic model based on individual codon preferences # A full implementation would use experimental codon pair frequency data def _calculate_pair_score(self, codon1: str, codon2: str) -> float: """Calculate preference score for a codon pair based on individual codon usage.""" freq1 = self.codon_table.get(codon1, 0.1) freq2 = self.codon_table.get(codon2, 0.1) # Geometric mean of frequencies as a simple pair score return math.sqrt(freq1 * freq2) def calculate(self, codons: List[str]) -> float: """ Calculate Codon Context Index. Returns value between 0 and 1, where 1 is optimal context. """ if len(codons) < 2: return 1.0 total_score = 0.0 max_score = 0.0 for i in range(len(codons) - 1): pair_score = self._calculate_pair_score(codons[i], codons[i + 1]) total_score += pair_score max_score += 1.0 # Maximum possible score per pair if max_score == 0: return 1.0 return total_score / max_score # ============================================================================= # mRNA Secondary Structure Analysis # ============================================================================= class mRNAStructureAnalyzer: """ Analyzes mRNA secondary structure using seqfold. Key metrics: - 5' end folding energy (critical for translation initiation) - Full sequence minimum free energy (MFE) - Hairpin detection in sliding windows """ def __init__(self, temperature: float = 37.0): """ Initialize analyzer. Args: temperature: Temperature in Celsius for folding calculations """ self.temperature = temperature self.available = SEQFOLD_AVAILABLE def dna_to_rna(self, dna_seq: str) -> str: """Convert DNA sequence to RNA (T -> U).""" return dna_seq.upper().replace('T', 'U') def calculate_mfe(self, sequence: str, is_rna: bool = False, max_length: int = 200) -> float: """ Calculate minimum free energy (MFE) for a sequence. Args: sequence: DNA or RNA sequence is_rna: If True, sequence is already RNA max_length: Maximum sequence length to analyze (longer sequences are truncated) Returns: MFE in kcal/mol (more negative = more stable structure) """ if not self.available: return 0.0 if not is_rna: sequence = self.dna_to_rna(sequence) if len(sequence) < 4: return 0.0 # Truncate long sequences for performance (RNA folding is O(n³)) if len(sequence) > max_length: sequence = sequence[:max_length] try: return dg(sequence, temp=self.temperature) except Exception: return 0.0 def calculate_5prime_mfe(self, sequence: str, window_size: int = 50) -> float: """ Calculate MFE for the 5' region (critical for translation initiation). Stable structures at the 5' end can reduce translation by up to 250-fold. Args: sequence: DNA sequence window_size: Number of nucleotides from 5' end to analyze Returns: MFE of 5' region in kcal/mol """ if len(sequence) < window_size: return self.calculate_mfe(sequence) return self.calculate_mfe(sequence[:window_size]) def find_hairpins(self, sequence: str, window_size: int = 60, step: int = 30, threshold: float = -15.0) -> List[dict]: """ Find potential hairpin structures using sliding window analysis. Args: sequence: DNA sequence window_size: Size of sliding window step: Step size for window threshold: MFE threshold for flagging (kcal/mol) Returns: List of hairpin regions with position and energy """ if not self.available: return [] hairpins = [] rna_seq = self.dna_to_rna(sequence) for i in range(0, len(rna_seq) - window_size + 1, step): window = rna_seq[i:i + window_size] try: energy = dg(window, temp=self.temperature) if energy < threshold: hairpins.append({ 'start': i, 'end': i + window_size, 'energy': energy, 'sequence': sequence[i:i + window_size] }) except Exception: continue return hairpins def get_structure_string(self, sequence: str) -> str: """ Get dot-bracket notation of predicted structure. Args: sequence: DNA sequence Returns: Dot-bracket string (e.g., "(((...)))") """ if not self.available: return "" rna_seq = self.dna_to_rna(sequence) try: structs = fold(rna_seq) return dot_bracket(structs) except Exception: return "" def analyze(self, sequence: str, fast_mode: bool = True) -> dict: """ Perform mRNA structure analysis. Args: sequence: DNA sequence fast_mode: If True, only analyze critical regions (faster) Returns: Dictionary with structure metrics """ if not self.available: return { 'available': False, 'full_mfe': 0.0, 'five_prime_mfe': 0.0, 'hairpin_count': 0, 'hairpins': [], 'structure_penalty': 0.0, } # Always calculate 5' MFE (most important for translation) five_prime_mfe = self.calculate_5prime_mfe(sequence) # For full MFE, only analyze first 200 nt in fast mode if fast_mode: full_mfe = self.calculate_mfe(sequence[:min(200, len(sequence))]) # Skip hairpin detection in fast mode for long sequences if len(sequence) > 300: hairpins = [] else: hairpins = self.find_hairpins(sequence, window_size=40, step=40, threshold=-12.0) else: full_mfe = self.calculate_mfe(sequence, max_length=500) hairpins = self.find_hairpins(sequence) # Calculate structure penalty for optimization # Penalize stable 5' structures (< -30 kcal/mol is problematic) five_prime_penalty = 0.0 if five_prime_mfe < -30: five_prime_penalty = (-30 - five_prime_mfe) / 10 # Scaled penalty # Penalize multiple hairpins hairpin_penalty = len(hairpins) * 0.1 # Total structure penalty structure_penalty = five_prime_penalty + hairpin_penalty return { 'available': True, 'full_mfe': full_mfe, 'five_prime_mfe': five_prime_mfe, 'hairpin_count': len(hairpins), 'hairpins': hairpins, 'structure_penalty': structure_penalty, } class OutlierIndex: """ Outlier Index (OI) quantifies negative sequence features that should be minimized. OI = sum(w_i * f_i(x)) for various adverse features Features include: - GC content deviation from optimal range (organism-specific) - Repetitive sequences - Rare codon clusters - mRNA secondary structure (optional - expensive to compute) - Adverse motifs (restriction sites, splice sites, etc.) """ # Weights for different features - GC penalties significantly increased WEIGHTS = { 'gc_content': 8.0, # Increased from 2.0 - global GC penalty 'gc_local': 4.0, # Increased from 1.5 - local window GC penalty 'gc_extreme': 10.0, # NEW - exponential penalty for extreme GC 'rare_codons': 2.0, 'repeats': 1.0, 'homopolymers': 1.5, 'motifs': 2.0, 'mrna_structure': 2.5, } # Organism-specific optimal GC content ranges # Based on typical GC content of highly-expressed genes GC_TARGETS = { "Escherichia coli K12": (0.45, 0.55), # E. coli prefers moderate GC "Homo sapiens": (0.45, 0.60), # Human - moderate to slightly high "CHO (Chinese Hamster Ovary)": (0.45, 0.60), "Saccharomyces cerevisiae": (0.35, 0.45), # Yeast prefers lower GC "Mus musculus": (0.45, 0.60), # Mouse similar to human "Insect (Spodoptera frugiperda)": (0.40, 0.55), "Pichia pastoris": (0.38, 0.48), "Bacillus subtilis": (0.40, 0.50), "Drosophila melanogaster": (0.45, 0.58), "Caenorhabditis elegans": (0.35, 0.45), } DEFAULT_GC_TARGET = (0.40, 0.55) # Default if organism not found # Adverse motifs to avoid ADVERSE_MOTIFS = [ # Shine-Dalgarno sequences (prokaryotic RBS) 'AGGAGG', 'GGAGG', 'AGGAG', 'GAGG', # Common restriction sites 'GAATTC', # EcoRI 'GGATCC', # BamHI 'AAGCTT', # HindIII 'CTGCAG', # PstI 'GCGGCCGC', # NotI 'CATATG', # NdeI # Splice site consensus 'GGTAAG', 'GGTGAG', # Poly-A signals 'AATAAA', 'ATTAAA', # AU-rich elements 'ATTTA', ] def __init__(self, organism: str, excluded_sites: List[str] = None, include_mrna_structure: bool = False): """ Initialize OutlierIndex. Args: organism: Target host organism excluded_sites: Restriction sites to avoid include_mrna_structure: If True, include mRNA folding in calculations (expensive - set False for optimization iterations) """ self.organism = organism self.codon_table = get_codon_table(organism) self.rare_threshold = 0.10 # Codons with frequency < 10% considered rare self.excluded_sites = excluded_sites or [] self.include_mrna_structure = include_mrna_structure self.mrna_analyzer = mRNAStructureAnalyzer() if include_mrna_structure else None # Get organism-specific GC target range self.gc_target = self.GC_TARGETS.get(organism, self.DEFAULT_GC_TARGET) self.gc_min, self.gc_max = self.gc_target def _gc_content_penalty(self, sequence: str) -> float: """Calculate penalty for GC content outside organism-specific optimal range.""" if len(sequence) == 0: return 0.0 gc_count = sequence.count('G') + sequence.count('C') gc_percent = gc_count / len(sequence) # Use organism-specific optimal range if self.gc_min <= gc_percent <= self.gc_max: return 0.0 elif gc_percent < self.gc_min: return (self.gc_min - gc_percent) * 3 # Increased multiplier else: return (gc_percent - self.gc_max) * 3 # Increased multiplier def _gc_extreme_penalty(self, sequence: str) -> float: """ Calculate exponential penalty for extreme GC content. Heavily penalizes GC outside 30-70% range. """ if len(sequence) == 0: return 0.0 gc_count = sequence.count('G') + sequence.count('C') gc_percent = gc_count / len(sequence) # No penalty within acceptable range (organism target ± 15%) soft_min = max(0.30, self.gc_min - 0.10) soft_max = min(0.70, self.gc_max + 0.10) if soft_min <= gc_percent <= soft_max: return 0.0 # Exponential penalty outside soft range if gc_percent < soft_min: deviation = soft_min - gc_percent else: deviation = gc_percent - soft_max # Exponential: penalty grows rapidly with deviation return (deviation * 10) ** 2 def _gc_local_penalty(self, sequence: str, window_size: int = 60) -> float: """Calculate penalty for local GC content outside acceptable range.""" if len(sequence) < window_size: return 0.0 # Local range is organism target ± 10% local_min = max(0.25, self.gc_min - 0.10) local_max = min(0.75, self.gc_max + 0.10) penalties = [] for i in range(0, len(sequence) - window_size + 1, 10): window = sequence[i:i + window_size] gc_count = window.count('G') + window.count('C') gc_percent = gc_count / len(window) if gc_percent < local_min: penalties.append((local_min - gc_percent) * 2) elif gc_percent > local_max: penalties.append((gc_percent - local_max) * 2) return sum(penalties) / max(len(penalties), 1) if penalties else 0.0 def _rare_codon_penalty(self, codons: List[str]) -> float: """Calculate penalty for rare codon usage.""" if len(codons) == 0: return 0.0 rare_count = 0 for codon in codons: freq = self.codon_table.get(codon, 0) if freq < self.rare_threshold: rare_count += 1 return rare_count / len(codons) def _repeat_penalty(self, sequence: str) -> float: """Calculate penalty for direct repeats (>= 8bp).""" penalty = 0.0 # Check for repeats for length in range(8, min(20, len(sequence) // 2)): for i in range(len(sequence) - 2 * length + 1): pattern = sequence[i:i + length] if pattern in sequence[i + length:]: penalty += 0.1 return min(penalty, 1.0) def _homopolymer_penalty(self, sequence: str) -> float: """Calculate penalty for homopolymer runs (>= 5 same nucleotides).""" penalty = 0.0 for base in 'ATGC': runs = re.findall(f'{base}{{5,}}', sequence) for run in runs: penalty += (len(run) - 4) * 0.1 return min(penalty, 1.0) def _motif_penalty(self, sequence: str) -> float: """Calculate penalty for adverse motifs.""" penalty = 0.0 all_motifs = self.ADVERSE_MOTIFS + self.excluded_sites for motif in all_motifs: count = sequence.count(motif) penalty += count * 0.2 return min(penalty, 1.0) def _mrna_structure_penalty(self, sequence: str) -> float: """Calculate penalty for problematic mRNA secondary structures.""" if not self.include_mrna_structure or self.mrna_analyzer is None: return 0.0 if not self.mrna_analyzer.available: return 0.0 analysis = self.mrna_analyzer.analyze(sequence) return analysis['structure_penalty'] def calculate(self, codons: List[str]) -> float: """ Calculate Outlier Index for a codon sequence. Returns value >= 0, where 0 is optimal (no adverse features). Lower is better. """ sequence = ''.join(codons) penalties = { 'gc_content': self._gc_content_penalty(sequence), 'gc_extreme': self._gc_extreme_penalty(sequence), 'gc_local': self._gc_local_penalty(sequence), 'rare_codons': self._rare_codon_penalty(codons), 'repeats': self._repeat_penalty(sequence), 'homopolymers': self._homopolymer_penalty(sequence), 'motifs': self._motif_penalty(sequence), 'mrna_structure': self._mrna_structure_penalty(sequence), } total = sum(self.WEIGHTS[k] * v for k, v in penalties.items()) return total def calculate_detailed(self, codons: List[str]) -> dict: """ Calculate Outlier Index with detailed breakdown. Returns dictionary with all penalty components and mRNA structure analysis. """ sequence = ''.join(codons) penalties = { 'gc_content': self._gc_content_penalty(sequence), 'gc_extreme': self._gc_extreme_penalty(sequence), 'gc_local': self._gc_local_penalty(sequence), 'rare_codons': self._rare_codon_penalty(codons), 'repeats': self._repeat_penalty(sequence), 'homopolymers': self._homopolymer_penalty(sequence), 'motifs': self._motif_penalty(sequence), 'mrna_structure': self._mrna_structure_penalty(sequence), } total = sum(self.WEIGHTS[k] * v for k, v in penalties.items()) # Get detailed mRNA analysis mrna_analysis = self.mrna_analyzer.analyze(sequence) return { 'total': total, 'penalties': penalties, 'weights': self.WEIGHTS, 'mrna_structure': mrna_analysis, } def calculate_cai(codons: List[str], codon_table: dict) -> float: """ Calculate Codon Adaptation Index (CAI). CAI is the geometric mean of relative adaptiveness values. """ if len(codons) == 0: return 0.0 # Calculate relative adaptiveness for each codon w_values = [] for codon in codons: aa = CODON_TO_AA.get(codon) if aa is None or aa == '*': continue # Get max frequency among synonymous codons synonymous = AA_TO_CODONS[aa] max_freq = max(codon_table.get(c, 0.01) for c in synonymous) # Relative adaptiveness freq = codon_table.get(codon, 0.01) w = freq / max_freq if max_freq > 0 else 0.01 w_values.append(max(w, 0.01)) # Avoid log(0) if len(w_values) == 0: return 0.0 # Geometric mean log_sum = sum(math.log(w) for w in w_values) cai = math.exp(log_sum / len(w_values)) return cai def calculate_gc_content(sequence: str) -> float: """Calculate GC content as percentage.""" if len(sequence) == 0: return 0.0 gc_count = sequence.count('G') + sequence.count('C') return (gc_count / len(sequence)) * 100