Spaces:
Sleeping
Sleeping
| """ | |
| Optimization indices based on GenScript patent WO2020024917A1. | |
| Implements Harmony Index, Codon Context Index, and Outlier Index. | |
| Includes mRNA secondary structure prediction using seqfold. | |
| """ | |
| import math | |
| import re | |
| from collections import Counter | |
| from typing import Dict, List, Tuple, Optional | |
| from codon_tables import CODON_TO_AA, AA_TO_CODONS, get_codon_table | |
| # Try to import seqfold for mRNA structure prediction | |
| try: | |
| from seqfold import dg, fold, dot_bracket | |
| SEQFOLD_AVAILABLE = True | |
| except ImportError: | |
| SEQFOLD_AVAILABLE = False | |
| def cosine_distance(v1: List[float], v2: List[float]) -> float: | |
| """Calculate cosine distance between two vectors.""" | |
| dot_product = sum(a * b for a, b in zip(v1, v2)) | |
| norm1 = math.sqrt(sum(a * a for a in v1)) | |
| norm2 = math.sqrt(sum(b * b for b in v2)) | |
| if norm1 == 0 or norm2 == 0: | |
| return 1.0 | |
| similarity = dot_product / (norm1 * norm2) | |
| return 1.0 - similarity | |
| def euclidean_distance(v1: List[float], v2: List[float]) -> float: | |
| """Calculate normalized Euclidean distance between two vectors.""" | |
| if len(v1) != len(v2): | |
| raise ValueError("Vectors must have same length") | |
| squared_diff = sum((a - b) ** 2 for a, b in zip(v1, v2)) | |
| max_dist = math.sqrt(len(v1)) # Max possible distance when all elements differ by 1 | |
| return math.sqrt(squared_diff) / max_dist if max_dist > 0 else 0 | |
| def sequence_to_codons(dna_sequence: str) -> List[str]: | |
| """Convert DNA sequence to list of codons.""" | |
| dna_sequence = dna_sequence.upper().replace(" ", "").replace("\n", "") | |
| codons = [] | |
| for i in range(0, len(dna_sequence) - 2, 3): | |
| codons.append(dna_sequence[i:i+3]) | |
| return codons | |
| def codons_to_protein(codons: List[str]) -> str: | |
| """Convert codon list to protein sequence.""" | |
| protein = "" | |
| for codon in codons: | |
| aa = CODON_TO_AA.get(codon, 'X') | |
| if aa != '*': | |
| protein += aa | |
| return protein | |
| def protein_to_codons_random(protein: str, codon_table: dict) -> List[str]: | |
| """Convert protein to codons using weighted random selection based on usage table.""" | |
| import random | |
| codons = [] | |
| for aa in protein.upper(): | |
| if aa not in AA_TO_CODONS: | |
| continue | |
| synonymous = AA_TO_CODONS[aa] | |
| # Weight by codon usage | |
| weights = [codon_table.get(c, 0.1) for c in synonymous] | |
| total = sum(weights) | |
| weights = [w / total for w in weights] | |
| codon = random.choices(synonymous, weights=weights, k=1)[0] | |
| codons.append(codon) | |
| return codons | |
| class HarmonyIndex: | |
| """ | |
| Harmony Index (H) measures how well the codon usage in the candidate sequence | |
| matches the codon usage distribution of highly-expressed genes in the host. | |
| H = 1 - D(F_hs, F_ts) | |
| Where: | |
| - F_hs = codon frequency vector from highly-expressed genes (reference) | |
| - F_ts = codon frequency vector from target sequence | |
| - D = distance function (cosine or euclidean) | |
| """ | |
| def __init__(self, organism: str): | |
| self.codon_table = get_codon_table(organism) | |
| self.reference_freqs = self._build_reference_vector() | |
| def _build_reference_vector(self) -> List[float]: | |
| """Build reference frequency vector from codon table (59 codons, excluding stops and single-codon AAs).""" | |
| # All codons except stop codons | |
| codons = [c for c in self.codon_table.keys() if CODON_TO_AA.get(c) != '*'] | |
| return [self.codon_table.get(c, 0) for c in sorted(codons)] | |
| def _build_sequence_vector(self, codons: List[str]) -> List[float]: | |
| """Build frequency vector from a codon sequence.""" | |
| # Count codons | |
| codon_counts = Counter(codons) | |
| total = len(codons) | |
| if total == 0: | |
| return [0.0] * 61 # 64 - 3 stop codons | |
| # Build vector matching reference order (all codons except stops) | |
| all_codons = [c for c in self.codon_table.keys() if CODON_TO_AA.get(c) != '*'] | |
| # Calculate frequencies per amino acid group | |
| freqs = [] | |
| for codon in sorted(all_codons): | |
| aa = CODON_TO_AA[codon] | |
| synonymous = AA_TO_CODONS[aa] | |
| aa_total = sum(codon_counts.get(c, 0) for c in synonymous) | |
| if aa_total > 0: | |
| freq = codon_counts.get(codon, 0) / aa_total | |
| else: | |
| freq = 0.0 | |
| freqs.append(freq) | |
| return freqs | |
| def calculate(self, codons: List[str]) -> float: | |
| """ | |
| Calculate Harmony Index for a codon sequence. | |
| Returns value between 0 and 1, where 1 is perfect harmony. | |
| """ | |
| target_freqs = self._build_sequence_vector(codons) | |
| distance = cosine_distance(self.reference_freqs, target_freqs) | |
| return 1.0 - distance | |
| class CodonContextIndex: | |
| """ | |
| Codon Context Index (CC) evaluates how well codon pairs in the candidate | |
| sequence match the codon pair preferences of highly-expressed genes. | |
| CC = 1 - D(F_hcc, F_tcc) | |
| Where: | |
| - F_hcc = codon-pair frequency vector from reference | |
| - F_tcc = codon-pair frequency vector from target sequence | |
| """ | |
| def __init__(self, organism: str): | |
| self.organism = organism | |
| self.codon_table = get_codon_table(organism) | |
| # For simplicity, we use a basic model based on individual codon preferences | |
| # A full implementation would use experimental codon pair frequency data | |
| def _calculate_pair_score(self, codon1: str, codon2: str) -> float: | |
| """Calculate preference score for a codon pair based on individual codon usage.""" | |
| freq1 = self.codon_table.get(codon1, 0.1) | |
| freq2 = self.codon_table.get(codon2, 0.1) | |
| # Geometric mean of frequencies as a simple pair score | |
| return math.sqrt(freq1 * freq2) | |
| def calculate(self, codons: List[str]) -> float: | |
| """ | |
| Calculate Codon Context Index. | |
| Returns value between 0 and 1, where 1 is optimal context. | |
| """ | |
| if len(codons) < 2: | |
| return 1.0 | |
| total_score = 0.0 | |
| max_score = 0.0 | |
| for i in range(len(codons) - 1): | |
| pair_score = self._calculate_pair_score(codons[i], codons[i + 1]) | |
| total_score += pair_score | |
| max_score += 1.0 # Maximum possible score per pair | |
| if max_score == 0: | |
| return 1.0 | |
| return total_score / max_score | |
| # ============================================================================= | |
| # mRNA Secondary Structure Analysis | |
| # ============================================================================= | |
| class mRNAStructureAnalyzer: | |
| """ | |
| Analyzes mRNA secondary structure using seqfold. | |
| Key metrics: | |
| - 5' end folding energy (critical for translation initiation) | |
| - Full sequence minimum free energy (MFE) | |
| - Hairpin detection in sliding windows | |
| """ | |
| def __init__(self, temperature: float = 37.0): | |
| """ | |
| Initialize analyzer. | |
| Args: | |
| temperature: Temperature in Celsius for folding calculations | |
| """ | |
| self.temperature = temperature | |
| self.available = SEQFOLD_AVAILABLE | |
| def dna_to_rna(self, dna_seq: str) -> str: | |
| """Convert DNA sequence to RNA (T -> U).""" | |
| return dna_seq.upper().replace('T', 'U') | |
| def calculate_mfe(self, sequence: str, is_rna: bool = False, max_length: int = 200) -> float: | |
| """ | |
| Calculate minimum free energy (MFE) for a sequence. | |
| Args: | |
| sequence: DNA or RNA sequence | |
| is_rna: If True, sequence is already RNA | |
| max_length: Maximum sequence length to analyze (longer sequences are truncated) | |
| Returns: | |
| MFE in kcal/mol (more negative = more stable structure) | |
| """ | |
| if not self.available: | |
| return 0.0 | |
| if not is_rna: | |
| sequence = self.dna_to_rna(sequence) | |
| if len(sequence) < 4: | |
| return 0.0 | |
| # Truncate long sequences for performance (RNA folding is O(n³)) | |
| if len(sequence) > max_length: | |
| sequence = sequence[:max_length] | |
| try: | |
| return dg(sequence, temp=self.temperature) | |
| except Exception: | |
| return 0.0 | |
| def calculate_5prime_mfe(self, sequence: str, window_size: int = 50) -> float: | |
| """ | |
| Calculate MFE for the 5' region (critical for translation initiation). | |
| Stable structures at the 5' end can reduce translation by up to 250-fold. | |
| Args: | |
| sequence: DNA sequence | |
| window_size: Number of nucleotides from 5' end to analyze | |
| Returns: | |
| MFE of 5' region in kcal/mol | |
| """ | |
| if len(sequence) < window_size: | |
| return self.calculate_mfe(sequence) | |
| return self.calculate_mfe(sequence[:window_size]) | |
| def find_hairpins(self, sequence: str, window_size: int = 60, | |
| step: int = 30, threshold: float = -15.0) -> List[dict]: | |
| """ | |
| Find potential hairpin structures using sliding window analysis. | |
| Args: | |
| sequence: DNA sequence | |
| window_size: Size of sliding window | |
| step: Step size for window | |
| threshold: MFE threshold for flagging (kcal/mol) | |
| Returns: | |
| List of hairpin regions with position and energy | |
| """ | |
| if not self.available: | |
| return [] | |
| hairpins = [] | |
| rna_seq = self.dna_to_rna(sequence) | |
| for i in range(0, len(rna_seq) - window_size + 1, step): | |
| window = rna_seq[i:i + window_size] | |
| try: | |
| energy = dg(window, temp=self.temperature) | |
| if energy < threshold: | |
| hairpins.append({ | |
| 'start': i, | |
| 'end': i + window_size, | |
| 'energy': energy, | |
| 'sequence': sequence[i:i + window_size] | |
| }) | |
| except Exception: | |
| continue | |
| return hairpins | |
| def get_structure_string(self, sequence: str) -> str: | |
| """ | |
| Get dot-bracket notation of predicted structure. | |
| Args: | |
| sequence: DNA sequence | |
| Returns: | |
| Dot-bracket string (e.g., "(((...)))") | |
| """ | |
| if not self.available: | |
| return "" | |
| rna_seq = self.dna_to_rna(sequence) | |
| try: | |
| structs = fold(rna_seq) | |
| return dot_bracket(structs) | |
| except Exception: | |
| return "" | |
| def analyze(self, sequence: str, fast_mode: bool = True) -> dict: | |
| """ | |
| Perform mRNA structure analysis. | |
| Args: | |
| sequence: DNA sequence | |
| fast_mode: If True, only analyze critical regions (faster) | |
| Returns: | |
| Dictionary with structure metrics | |
| """ | |
| if not self.available: | |
| return { | |
| 'available': False, | |
| 'full_mfe': 0.0, | |
| 'five_prime_mfe': 0.0, | |
| 'hairpin_count': 0, | |
| 'hairpins': [], | |
| 'structure_penalty': 0.0, | |
| } | |
| # Always calculate 5' MFE (most important for translation) | |
| five_prime_mfe = self.calculate_5prime_mfe(sequence) | |
| # For full MFE, only analyze first 200 nt in fast mode | |
| if fast_mode: | |
| full_mfe = self.calculate_mfe(sequence[:min(200, len(sequence))]) | |
| # Skip hairpin detection in fast mode for long sequences | |
| if len(sequence) > 300: | |
| hairpins = [] | |
| else: | |
| hairpins = self.find_hairpins(sequence, window_size=40, step=40, threshold=-12.0) | |
| else: | |
| full_mfe = self.calculate_mfe(sequence, max_length=500) | |
| hairpins = self.find_hairpins(sequence) | |
| # Calculate structure penalty for optimization | |
| # Penalize stable 5' structures (< -30 kcal/mol is problematic) | |
| five_prime_penalty = 0.0 | |
| if five_prime_mfe < -30: | |
| five_prime_penalty = (-30 - five_prime_mfe) / 10 # Scaled penalty | |
| # Penalize multiple hairpins | |
| hairpin_penalty = len(hairpins) * 0.1 | |
| # Total structure penalty | |
| structure_penalty = five_prime_penalty + hairpin_penalty | |
| return { | |
| 'available': True, | |
| 'full_mfe': full_mfe, | |
| 'five_prime_mfe': five_prime_mfe, | |
| 'hairpin_count': len(hairpins), | |
| 'hairpins': hairpins, | |
| 'structure_penalty': structure_penalty, | |
| } | |
| class OutlierIndex: | |
| """ | |
| Outlier Index (OI) quantifies negative sequence features that should be minimized. | |
| OI = sum(w_i * f_i(x)) for various adverse features | |
| Features include: | |
| - GC content deviation from optimal range (organism-specific) | |
| - Repetitive sequences | |
| - Rare codon clusters | |
| - mRNA secondary structure (optional - expensive to compute) | |
| - Adverse motifs (restriction sites, splice sites, etc.) | |
| """ | |
| # Weights for different features - GC penalties significantly increased | |
| WEIGHTS = { | |
| 'gc_content': 8.0, # Increased from 2.0 - global GC penalty | |
| 'gc_local': 4.0, # Increased from 1.5 - local window GC penalty | |
| 'gc_extreme': 10.0, # NEW - exponential penalty for extreme GC | |
| 'rare_codons': 2.0, | |
| 'repeats': 1.0, | |
| 'homopolymers': 1.5, | |
| 'motifs': 2.0, | |
| 'mrna_structure': 2.5, | |
| } | |
| # Organism-specific optimal GC content ranges | |
| # Based on typical GC content of highly-expressed genes | |
| GC_TARGETS = { | |
| "Escherichia coli K12": (0.45, 0.55), # E. coli prefers moderate GC | |
| "Homo sapiens": (0.45, 0.60), # Human - moderate to slightly high | |
| "CHO (Chinese Hamster Ovary)": (0.45, 0.60), | |
| "Saccharomyces cerevisiae": (0.35, 0.45), # Yeast prefers lower GC | |
| "Mus musculus": (0.45, 0.60), # Mouse similar to human | |
| "Insect (Spodoptera frugiperda)": (0.40, 0.55), | |
| "Pichia pastoris": (0.38, 0.48), | |
| "Bacillus subtilis": (0.40, 0.50), | |
| "Drosophila melanogaster": (0.45, 0.58), | |
| "Caenorhabditis elegans": (0.35, 0.45), | |
| } | |
| DEFAULT_GC_TARGET = (0.40, 0.55) # Default if organism not found | |
| # Adverse motifs to avoid | |
| ADVERSE_MOTIFS = [ | |
| # Shine-Dalgarno sequences (prokaryotic RBS) | |
| 'AGGAGG', 'GGAGG', 'AGGAG', 'GAGG', | |
| # Common restriction sites | |
| 'GAATTC', # EcoRI | |
| 'GGATCC', # BamHI | |
| 'AAGCTT', # HindIII | |
| 'CTGCAG', # PstI | |
| 'GCGGCCGC', # NotI | |
| 'CATATG', # NdeI | |
| # Splice site consensus | |
| 'GGTAAG', 'GGTGAG', | |
| # Poly-A signals | |
| 'AATAAA', 'ATTAAA', | |
| # AU-rich elements | |
| 'ATTTA', | |
| ] | |
| def __init__(self, organism: str, excluded_sites: List[str] = None, | |
| include_mrna_structure: bool = False): | |
| """ | |
| Initialize OutlierIndex. | |
| Args: | |
| organism: Target host organism | |
| excluded_sites: Restriction sites to avoid | |
| include_mrna_structure: If True, include mRNA folding in calculations | |
| (expensive - set False for optimization iterations) | |
| """ | |
| self.organism = organism | |
| self.codon_table = get_codon_table(organism) | |
| self.rare_threshold = 0.10 # Codons with frequency < 10% considered rare | |
| self.excluded_sites = excluded_sites or [] | |
| self.include_mrna_structure = include_mrna_structure | |
| self.mrna_analyzer = mRNAStructureAnalyzer() if include_mrna_structure else None | |
| # Get organism-specific GC target range | |
| self.gc_target = self.GC_TARGETS.get(organism, self.DEFAULT_GC_TARGET) | |
| self.gc_min, self.gc_max = self.gc_target | |
| def _gc_content_penalty(self, sequence: str) -> float: | |
| """Calculate penalty for GC content outside organism-specific optimal range.""" | |
| if len(sequence) == 0: | |
| return 0.0 | |
| gc_count = sequence.count('G') + sequence.count('C') | |
| gc_percent = gc_count / len(sequence) | |
| # Use organism-specific optimal range | |
| if self.gc_min <= gc_percent <= self.gc_max: | |
| return 0.0 | |
| elif gc_percent < self.gc_min: | |
| return (self.gc_min - gc_percent) * 3 # Increased multiplier | |
| else: | |
| return (gc_percent - self.gc_max) * 3 # Increased multiplier | |
| def _gc_extreme_penalty(self, sequence: str) -> float: | |
| """ | |
| Calculate exponential penalty for extreme GC content. | |
| Heavily penalizes GC outside 30-70% range. | |
| """ | |
| if len(sequence) == 0: | |
| return 0.0 | |
| gc_count = sequence.count('G') + sequence.count('C') | |
| gc_percent = gc_count / len(sequence) | |
| # No penalty within acceptable range (organism target ± 15%) | |
| soft_min = max(0.30, self.gc_min - 0.10) | |
| soft_max = min(0.70, self.gc_max + 0.10) | |
| if soft_min <= gc_percent <= soft_max: | |
| return 0.0 | |
| # Exponential penalty outside soft range | |
| if gc_percent < soft_min: | |
| deviation = soft_min - gc_percent | |
| else: | |
| deviation = gc_percent - soft_max | |
| # Exponential: penalty grows rapidly with deviation | |
| return (deviation * 10) ** 2 | |
| def _gc_local_penalty(self, sequence: str, window_size: int = 60) -> float: | |
| """Calculate penalty for local GC content outside acceptable range.""" | |
| if len(sequence) < window_size: | |
| return 0.0 | |
| # Local range is organism target ± 10% | |
| local_min = max(0.25, self.gc_min - 0.10) | |
| local_max = min(0.75, self.gc_max + 0.10) | |
| penalties = [] | |
| for i in range(0, len(sequence) - window_size + 1, 10): | |
| window = sequence[i:i + window_size] | |
| gc_count = window.count('G') + window.count('C') | |
| gc_percent = gc_count / len(window) | |
| if gc_percent < local_min: | |
| penalties.append((local_min - gc_percent) * 2) | |
| elif gc_percent > local_max: | |
| penalties.append((gc_percent - local_max) * 2) | |
| return sum(penalties) / max(len(penalties), 1) if penalties else 0.0 | |
| def _rare_codon_penalty(self, codons: List[str]) -> float: | |
| """Calculate penalty for rare codon usage.""" | |
| if len(codons) == 0: | |
| return 0.0 | |
| rare_count = 0 | |
| for codon in codons: | |
| freq = self.codon_table.get(codon, 0) | |
| if freq < self.rare_threshold: | |
| rare_count += 1 | |
| return rare_count / len(codons) | |
| def _repeat_penalty(self, sequence: str) -> float: | |
| """Calculate penalty for direct repeats (>= 8bp).""" | |
| penalty = 0.0 | |
| # Check for repeats | |
| for length in range(8, min(20, len(sequence) // 2)): | |
| for i in range(len(sequence) - 2 * length + 1): | |
| pattern = sequence[i:i + length] | |
| if pattern in sequence[i + length:]: | |
| penalty += 0.1 | |
| return min(penalty, 1.0) | |
| def _homopolymer_penalty(self, sequence: str) -> float: | |
| """Calculate penalty for homopolymer runs (>= 5 same nucleotides).""" | |
| penalty = 0.0 | |
| for base in 'ATGC': | |
| runs = re.findall(f'{base}{{5,}}', sequence) | |
| for run in runs: | |
| penalty += (len(run) - 4) * 0.1 | |
| return min(penalty, 1.0) | |
| def _motif_penalty(self, sequence: str) -> float: | |
| """Calculate penalty for adverse motifs.""" | |
| penalty = 0.0 | |
| all_motifs = self.ADVERSE_MOTIFS + self.excluded_sites | |
| for motif in all_motifs: | |
| count = sequence.count(motif) | |
| penalty += count * 0.2 | |
| return min(penalty, 1.0) | |
| def _mrna_structure_penalty(self, sequence: str) -> float: | |
| """Calculate penalty for problematic mRNA secondary structures.""" | |
| if not self.include_mrna_structure or self.mrna_analyzer is None: | |
| return 0.0 | |
| if not self.mrna_analyzer.available: | |
| return 0.0 | |
| analysis = self.mrna_analyzer.analyze(sequence) | |
| return analysis['structure_penalty'] | |
| def calculate(self, codons: List[str]) -> float: | |
| """ | |
| Calculate Outlier Index for a codon sequence. | |
| Returns value >= 0, where 0 is optimal (no adverse features). | |
| Lower is better. | |
| """ | |
| sequence = ''.join(codons) | |
| penalties = { | |
| 'gc_content': self._gc_content_penalty(sequence), | |
| 'gc_extreme': self._gc_extreme_penalty(sequence), | |
| 'gc_local': self._gc_local_penalty(sequence), | |
| 'rare_codons': self._rare_codon_penalty(codons), | |
| 'repeats': self._repeat_penalty(sequence), | |
| 'homopolymers': self._homopolymer_penalty(sequence), | |
| 'motifs': self._motif_penalty(sequence), | |
| 'mrna_structure': self._mrna_structure_penalty(sequence), | |
| } | |
| total = sum(self.WEIGHTS[k] * v for k, v in penalties.items()) | |
| return total | |
| def calculate_detailed(self, codons: List[str]) -> dict: | |
| """ | |
| Calculate Outlier Index with detailed breakdown. | |
| Returns dictionary with all penalty components and mRNA structure analysis. | |
| """ | |
| sequence = ''.join(codons) | |
| penalties = { | |
| 'gc_content': self._gc_content_penalty(sequence), | |
| 'gc_extreme': self._gc_extreme_penalty(sequence), | |
| 'gc_local': self._gc_local_penalty(sequence), | |
| 'rare_codons': self._rare_codon_penalty(codons), | |
| 'repeats': self._repeat_penalty(sequence), | |
| 'homopolymers': self._homopolymer_penalty(sequence), | |
| 'motifs': self._motif_penalty(sequence), | |
| 'mrna_structure': self._mrna_structure_penalty(sequence), | |
| } | |
| total = sum(self.WEIGHTS[k] * v for k, v in penalties.items()) | |
| # Get detailed mRNA analysis | |
| mrna_analysis = self.mrna_analyzer.analyze(sequence) | |
| return { | |
| 'total': total, | |
| 'penalties': penalties, | |
| 'weights': self.WEIGHTS, | |
| 'mrna_structure': mrna_analysis, | |
| } | |
| def calculate_cai(codons: List[str], codon_table: dict) -> float: | |
| """ | |
| Calculate Codon Adaptation Index (CAI). | |
| CAI is the geometric mean of relative adaptiveness values. | |
| """ | |
| if len(codons) == 0: | |
| return 0.0 | |
| # Calculate relative adaptiveness for each codon | |
| w_values = [] | |
| for codon in codons: | |
| aa = CODON_TO_AA.get(codon) | |
| if aa is None or aa == '*': | |
| continue | |
| # Get max frequency among synonymous codons | |
| synonymous = AA_TO_CODONS[aa] | |
| max_freq = max(codon_table.get(c, 0.01) for c in synonymous) | |
| # Relative adaptiveness | |
| freq = codon_table.get(codon, 0.01) | |
| w = freq / max_freq if max_freq > 0 else 0.01 | |
| w_values.append(max(w, 0.01)) # Avoid log(0) | |
| if len(w_values) == 0: | |
| return 0.0 | |
| # Geometric mean | |
| log_sum = sum(math.log(w) for w in w_values) | |
| cai = math.exp(log_sum / len(w_values)) | |
| return cai | |
| def calculate_gc_content(sequence: str) -> float: | |
| """Calculate GC content as percentage.""" | |
| if len(sequence) == 0: | |
| return 0.0 | |
| gc_count = sequence.count('G') + sequence.count('C') | |
| return (gc_count / len(sequence)) * 100 | |