""" NSGA-III Multi-objective Codon Optimization Engine Based on GenScript patent WO2020024917A1. Uses pymoo for the NSGA-III algorithm implementation. """ import numpy as np import random from typing import List, Tuple, Optional from pymoo.core.problem import Problem from pymoo.algorithms.moo.nsga3 import NSGA3 from pymoo.util.ref_dirs import get_reference_directions from pymoo.optimize import minimize from pymoo.operators.crossover.sbx import SBX from pymoo.operators.mutation.pm import PM from pymoo.operators.sampling.rnd import FloatRandomSampling from codon_tables import CODON_TO_AA, AA_TO_CODONS, get_codon_table, get_organism_list from indices import ( HarmonyIndex, CodonContextIndex, OutlierIndex, calculate_cai, calculate_gc_content, sequence_to_codons, codons_to_protein, protein_to_codons_random ) class CodonOptimizationProblem(Problem): """ Multi-objective optimization problem for codon optimization. Objectives: 1. Maximize Harmony Index (minimize negative) 2. Maximize Codon Context Index (minimize negative) 3. Minimize Outlier Index Decision variables: Real values [0, 1) for each codon position, mapped to synonymous codon choices. """ def __init__(self, protein_sequence: str, organism: str, excluded_sites: List[str] = None): self.protein = protein_sequence.upper() self.organism = organism self.codon_table = get_codon_table(organism) self.excluded_sites = excluded_sites or [] # Build codon choices for each position self.codon_choices = [] for aa in self.protein: if aa in AA_TO_CODONS: self.codon_choices.append(AA_TO_CODONS[aa]) else: # Unknown amino acid - use most common self.codon_choices.append(['NNN']) n_vars = len(self.protein) # Initialize index calculators # Note: mRNA structure is disabled during optimization for performance # It will be calculated for the final result only self.harmony_idx = HarmonyIndex(organism) self.context_idx = CodonContextIndex(organism) self.outlier_idx = OutlierIndex(organism, excluded_sites, include_mrna_structure=False) super().__init__( n_var=n_vars, n_obj=3, xl=np.zeros(n_vars), xu=np.ones(n_vars), ) def decode_solution(self, x: np.ndarray) -> List[str]: """Convert real-valued solution to codon sequence.""" codons = [] for i, val in enumerate(x): choices = self.codon_choices[i] # Map [0, 1) to codon index idx = int(val * len(choices)) idx = min(idx, len(choices) - 1) # Ensure valid index codons.append(choices[idx]) return codons def _evaluate(self, x: np.ndarray, out: dict, *args, **kwargs): """Evaluate fitness for population.""" f = np.zeros((x.shape[0], 3)) for i in range(x.shape[0]): codons = self.decode_solution(x[i]) # Calculate objectives (minimize all, so negate maximization objectives) harmony = self.harmony_idx.calculate(codons) context = self.context_idx.calculate(codons) outlier = self.outlier_idx.calculate(codons) # Objectives: minimize -harmony, minimize -context, minimize outlier f[i, 0] = -harmony f[i, 1] = -context f[i, 2] = outlier out["F"] = f class CodonOptimizer: """ Main codon optimization class using NSGA-III algorithm. """ def __init__(self, organism: str = "Escherichia coli K12", excluded_sites: List[str] = None): self.organism = organism self.excluded_sites = excluded_sites or [] self.codon_table = get_codon_table(organism) def _validate_protein(self, sequence: str) -> str: """Validate and clean protein sequence.""" valid_aa = set('ACDEFGHIKLMNPQRSTVWY') cleaned = ''.join(c for c in sequence.upper() if c in valid_aa or c == '*') # Remove stop codons from internal positions if cleaned.endswith('*'): cleaned = cleaned[:-1] cleaned = cleaned.replace('*', '') return cleaned def _validate_dna(self, sequence: str) -> str: """Validate and clean DNA sequence.""" valid_bases = set('ATGC') cleaned = ''.join(c for c in sequence.upper() if c in valid_bases) return cleaned def optimize(self, sequence: str, is_protein: bool = True, pop_size: int = 100, n_gen: int = 100, verbose: bool = False) -> dict: """ Optimize a protein or DNA sequence. Args: sequence: Input protein or DNA sequence is_protein: True if input is protein, False if DNA pop_size: Population size for genetic algorithm n_gen: Number of generations verbose: Print progress Returns: Dictionary with optimized sequence and metrics """ # Parse input if is_protein: protein = self._validate_protein(sequence) else: dna = self._validate_dna(sequence) codons = sequence_to_codons(dna) protein = codons_to_protein(codons) if len(protein) == 0: raise ValueError("No valid amino acids found in sequence") if verbose: print(f"Optimizing {len(protein)} amino acids for {self.organism}") # Create optimization problem problem = CodonOptimizationProblem( protein, self.organism, self.excluded_sites ) # Configure NSGA-III ref_dirs = get_reference_directions("das-dennis", 3, n_partitions=12) algorithm = NSGA3( pop_size=pop_size, ref_dirs=ref_dirs, sampling=FloatRandomSampling(), crossover=SBX(prob=0.9, eta=15), mutation=PM(eta=20), eliminate_duplicates=True ) # Run optimization result = minimize( problem, algorithm, ('n_gen', n_gen), seed=42, verbose=verbose ) # Get best solution (best harmony index) best_idx = np.argmin(result.F[:, 0]) # Best harmony (most negative = highest) best_x = result.X[best_idx] best_codons = problem.decode_solution(best_x) best_dna = ''.join(best_codons) # Calculate final metrics harmony = problem.harmony_idx.calculate(best_codons) context = problem.context_idx.calculate(best_codons) outlier = problem.outlier_idx.calculate(best_codons) cai = calculate_cai(best_codons, self.codon_table) gc = calculate_gc_content(best_dna) # Get Pareto front solutions pareto_solutions = [] for i in range(len(result.X)): codons = problem.decode_solution(result.X[i]) pareto_solutions.append({ 'dna': ''.join(codons), 'harmony': -result.F[i, 0], 'context': -result.F[i, 1], 'outlier': result.F[i, 2], }) return { 'protein': protein, 'optimized_dna': best_dna, 'codons': best_codons, 'metrics': { 'harmony_index': harmony, 'context_index': context, 'outlier_index': outlier, 'cai': cai, 'gc_content': gc, 'length_bp': len(best_dna), 'length_aa': len(protein), }, 'pareto_front': pareto_solutions[:5], # Top 5 solutions 'organism': self.organism, } def quick_optimize(sequence: str, organism: str = "Escherichia coli K12", is_protein: bool = True, excluded_sites: List[str] = None, quality: str = "standard") -> dict: """ Quick optimization function with preset configurations. Args: sequence: Input sequence (protein or DNA) organism: Target expression host is_protein: True if protein sequence, False if DNA excluded_sites: Restriction sites to avoid quality: "fast", "standard", or "thorough" Returns: Optimization results dictionary """ # Quality presets - reduced for web app performance presets = { "fast": {"pop_size": 30, "n_gen": 20}, "standard": {"pop_size": 50, "n_gen": 40}, "thorough": {"pop_size": 80, "n_gen": 60}, } params = presets.get(quality, presets["standard"]) optimizer = CodonOptimizer(organism, excluded_sites) return optimizer.optimize( sequence, is_protein, pop_size=params["pop_size"], n_gen=params["n_gen"], verbose=False ) # Simple fallback optimizer for environments without pymoo class SimpleOptimizer: """ Simpler optimization using weighted random selection and hill climbing. Fallback when pymoo is not available. """ def __init__(self, organism: str = "Escherichia coli K12", excluded_sites: List[str] = None): self.organism = organism self.excluded_sites = excluded_sites or [] self.codon_table = get_codon_table(organism) def _validate_protein(self, sequence: str) -> str: valid_aa = set('ACDEFGHIKLMNPQRSTVWY') cleaned = ''.join(c for c in sequence.upper() if c in valid_aa) return cleaned def _validate_dna(self, sequence: str) -> str: valid_bases = set('ATGC') return ''.join(c for c in sequence.upper() if c in valid_bases) def _select_best_codon(self, aa: str) -> str: """Select the most preferred codon for an amino acid.""" if aa not in AA_TO_CODONS: return 'NNN' synonymous = AA_TO_CODONS[aa] best_codon = max(synonymous, key=lambda c: self.codon_table.get(c, 0)) return best_codon def _check_excluded_sites(self, dna: str) -> List[str]: """Check for excluded restriction sites.""" found = [] for site in self.excluded_sites: if site.upper() in dna: found.append(site) return found def optimize(self, sequence: str, is_protein: bool = True, iterations: int = 1000) -> dict: """ Optimize using greedy selection with local refinement. """ if is_protein: protein = self._validate_protein(sequence) else: dna = self._validate_dna(sequence) codons = sequence_to_codons(dna) protein = codons_to_protein(codons) if len(protein) == 0: raise ValueError("No valid amino acids found") # Initial solution: best codon for each position best_codons = [self._select_best_codon(aa) for aa in protein] # Initialize indices harmony_idx = HarmonyIndex(self.organism) context_idx = CodonContextIndex(self.organism) outlier_idx = OutlierIndex(self.organism, self.excluded_sites) def score(codons): h = harmony_idx.calculate(codons) c = context_idx.calculate(codons) o = outlier_idx.calculate(codons) return h + c - o # Higher is better best_score = score(best_codons) # Hill climbing with random restarts for _ in range(iterations): # Try a random mutation pos = random.randint(0, len(protein) - 1) aa = protein[pos] if aa not in AA_TO_CODONS: continue synonymous = AA_TO_CODONS[aa] if len(synonymous) <= 1: continue # Try alternative codon current_codon = best_codons[pos] alternatives = [c for c in synonymous if c != current_codon] new_codon = random.choice(alternatives) # Test new solution test_codons = best_codons.copy() test_codons[pos] = new_codon new_score = score(test_codons) # Check for excluded sites test_dna = ''.join(test_codons) has_excluded = any(site.upper() in test_dna for site in self.excluded_sites) if new_score > best_score and not has_excluded: best_codons = test_codons best_score = new_score # Calculate final metrics best_dna = ''.join(best_codons) harmony = harmony_idx.calculate(best_codons) context = context_idx.calculate(best_codons) outlier = outlier_idx.calculate(best_codons) cai = calculate_cai(best_codons, self.codon_table) gc = calculate_gc_content(best_dna) return { 'protein': protein, 'optimized_dna': best_dna, 'codons': best_codons, 'metrics': { 'harmony_index': harmony, 'context_index': context, 'outlier_index': outlier, 'cai': cai, 'gc_content': gc, 'length_bp': len(best_dna), 'length_aa': len(protein), }, 'organism': self.organism, } def optimize_sequence(sequence: str, organism: str = "Escherichia coli K12", is_protein: bool = True, excluded_sites: List[str] = None, use_nsga3: bool = True, quality: str = "standard") -> dict: """ Main entry point for codon optimization. Args: sequence: Input protein or DNA sequence organism: Target host organism is_protein: True if protein, False if DNA excluded_sites: Restriction sites to exclude use_nsga3: Use NSGA-III (requires pymoo) or simple optimizer quality: "fast", "standard", or "thorough" Returns: Optimization results """ if use_nsga3: try: return quick_optimize(sequence, organism, is_protein, excluded_sites, quality) except ImportError: print("pymoo not available, falling back to simple optimizer") use_nsga3 = False if not use_nsga3: iterations = {"fast": 1000, "standard": 3000, "thorough": 5000}.get(quality, 3000) optimizer = SimpleOptimizer(organism, excluded_sites) return optimizer.optimize(sequence, is_protein, iterations)