Spaces:
Sleeping
Sleeping
| """ | |
| NSGA-III Multi-objective Codon Optimization Engine | |
| Based on GenScript patent WO2020024917A1. | |
| Uses pymoo for the NSGA-III algorithm implementation. | |
| """ | |
| import numpy as np | |
| import random | |
| from typing import List, Tuple, Optional | |
| from pymoo.core.problem import Problem | |
| from pymoo.algorithms.moo.nsga3 import NSGA3 | |
| from pymoo.util.ref_dirs import get_reference_directions | |
| from pymoo.optimize import minimize | |
| from pymoo.operators.crossover.sbx import SBX | |
| from pymoo.operators.mutation.pm import PM | |
| from pymoo.operators.sampling.rnd import FloatRandomSampling | |
| from codon_tables import CODON_TO_AA, AA_TO_CODONS, get_codon_table, get_organism_list | |
| from indices import ( | |
| HarmonyIndex, CodonContextIndex, OutlierIndex, | |
| calculate_cai, calculate_gc_content, sequence_to_codons, | |
| codons_to_protein, protein_to_codons_random | |
| ) | |
class CodonOptimizationProblem(Problem):
    """
    pymoo problem definition for multi-objective codon optimization.

    Objectives (all expressed as minimization for pymoo):
        1. -Harmony Index        (i.e. maximize harmony)
        2. -Codon Context Index  (i.e. maximize context)
        3. Outlier Index         (minimize)

    Decision variables: one real value in [0, 1] per residue, mapped onto
    the list of candidate codons for that residue by ``decode_solution``.
    """

    def __init__(self, protein_sequence: str, organism: str,
                 excluded_sites: Optional[List[str]] = None):
        """
        Args:
            protein_sequence: Amino-acid sequence (upper-cased internally).
            organism: Organism name forwarded to the codon table and the
                index calculators.
            excluded_sites: Sequences to avoid; ``None`` means no sites.
        """
        self.protein = protein_sequence.upper()
        self.organism = organism
        self.codon_table = get_codon_table(organism)
        self.excluded_sites = excluded_sites or []

        # Candidate codons per position. Residues missing from
        # AA_TO_CODONS get a single 'NNN' placeholder so that the position
        # still exists in the decoded sequence.
        self.codon_choices = [
            AA_TO_CODONS[aa] if aa in AA_TO_CODONS else ['NNN']
            for aa in self.protein
        ]
        n_vars = len(self.protein)

        # Index calculators. mRNA structure scoring is switched off here for
        # performance during the optimization loop.
        # NOTE(review): the visible caller reuses this same outlier
        # calculator for its final metrics, so mRNA structure is not scored
        # there either - confirm whether that is intended.
        self.harmony_idx = HarmonyIndex(organism)
        self.context_idx = CodonContextIndex(organism)
        # Pass the normalised list (never None), matching how the other
        # optimizer in this file constructs OutlierIndex.
        self.outlier_idx = OutlierIndex(
            organism, self.excluded_sites, include_mrna_structure=False
        )

        super().__init__(
            n_var=n_vars,
            n_obj=3,
            xl=np.zeros(n_vars),
            xu=np.ones(n_vars),
        )

    def decode_solution(self, x: np.ndarray) -> List[str]:
        """
        Convert one real-valued decision vector into a list of codons.

        Args:
            x: One value per residue, expected within [0, 1].

        Returns:
            The chosen codon for each position, in order.
        """
        codons = []
        for i, val in enumerate(x):
            choices = self.codon_choices[i]
            # Scale to an index; the upper bound 1.0 is inclusive, so clamp
            # to the last valid choice.
            idx = min(int(val * len(choices)), len(choices) - 1)
            codons.append(choices[idx])
        return codons

    def _evaluate(self, x: np.ndarray, out: dict, *args, **kwargs):
        """
        Evaluate every individual of a population.

        Args:
            x: 2-D array with one decision vector per row.
            out: pymoo output dict; ``out["F"]`` receives an array of shape
                (rows of x, 3) holding the three objective values.
        """
        f = np.zeros((x.shape[0], 3))
        for i in range(x.shape[0]):
            codons = self.decode_solution(x[i])
            # pymoo minimizes, so the two "higher is better" indices are
            # negated; the outlier index is minimized directly.
            f[i, 0] = -self.harmony_idx.calculate(codons)
            f[i, 1] = -self.context_idx.calculate(codons)
            f[i, 2] = self.outlier_idx.calculate(codons)
        out["F"] = f
class CodonOptimizer:
    """
    Main codon optimization class using the NSGA-III algorithm.
    """

    def __init__(self, organism: str = "Escherichia coli K12",
                 excluded_sites: Optional[List[str]] = None):
        """
        Args:
            organism: Target organism name, used to look up the codon table.
            excluded_sites: Sequences to avoid; ``None`` means no sites.
        """
        self.organism = organism
        self.excluded_sites = excluded_sites or []
        self.codon_table = get_codon_table(organism)

    @staticmethod
    def _n_partitions_for(pop_size: int, max_partitions: int = 12) -> int:
        """
        Pick the Das-Dennis partition count for a three-objective problem.

        With 3 objectives, ``p`` partitions produce ``(p + 1)(p + 2) / 2``
        reference directions. pymoo warns when the population is smaller
        than the number of reference directions, so this returns the largest
        ``p`` in ``[1, max_partitions]`` whose direction count still fits in
        ``pop_size``. The cap of 12 keeps the previous fixed setting for
        populations of 91 or more.
        """
        partitions = 1
        while (partitions < max_partitions
               and (partitions + 2) * (partitions + 3) // 2 <= pop_size):
            partitions += 1
        return partitions

    def _validate_protein(self, sequence: str) -> str:
        """
        Upper-case the input and keep only the 20 standard amino-acid
        letters. Stop markers ('*'), trailing or internal, are dropped along
        with every other character.
        """
        valid_aa = set('ACDEFGHIKLMNPQRSTVWY')
        return ''.join(c for c in sequence.upper() if c in valid_aa)

    def _validate_dna(self, sequence: str) -> str:
        """Upper-case the input and keep only the bases A, T, G and C."""
        valid_bases = set('ATGC')
        return ''.join(c for c in sequence.upper() if c in valid_bases)

    def optimize(self, sequence: str, is_protein: bool = True,
                 pop_size: int = 100, n_gen: int = 100,
                 verbose: bool = False) -> dict:
        """
        Optimize a protein or DNA sequence.

        Args:
            sequence: Input protein or DNA sequence
            is_protein: True if input is protein, False if DNA
            pop_size: Population size for genetic algorithm
            n_gen: Number of generations
            verbose: Print progress

        Returns:
            Dictionary with the protein, the optimized DNA and codon list
            for the solution with the best harmony index, its metrics, up
            to five Pareto-front solutions ordered by decreasing harmony
            (the first one is the reported best), and the organism.

        Raises:
            ValueError: If no amino acids remain after cleaning/translation.
        """
        # Parse input
        if is_protein:
            protein = self._validate_protein(sequence)
        else:
            dna = self._validate_dna(sequence)
            protein = codons_to_protein(sequence_to_codons(dna))

        if len(protein) == 0:
            raise ValueError("No valid amino acids found in sequence")

        if verbose:
            print(f"Optimizing {len(protein)} amino acids for {self.organism}")

        problem = CodonOptimizationProblem(
            protein, self.organism, self.excluded_sites
        )

        # Configure NSGA-III. The number of reference directions is sized
        # to the population so small presets do not end up with more
        # directions than individuals.
        ref_dirs = get_reference_directions(
            "das-dennis", 3, n_partitions=self._n_partitions_for(pop_size)
        )
        algorithm = NSGA3(
            pop_size=pop_size,
            ref_dirs=ref_dirs,
            sampling=FloatRandomSampling(),
            crossover=SBX(prob=0.9, eta=15),
            mutation=PM(eta=20),
            eliminate_duplicates=True
        )

        # Fixed seed keeps runs reproducible.
        result = minimize(
            problem,
            algorithm,
            ('n_gen', n_gen),
            seed=42,
            verbose=verbose
        )

        # Defensive: make sure a single-solution front is still 2-D.
        front_x = np.atleast_2d(result.X)
        front_f = np.atleast_2d(result.F)

        # Column 0 holds -harmony, so ascending order means best harmony
        # first. The stable sort keeps the first occurrence on ties, which
        # is the same solution np.argmin would select.
        ranked = np.argsort(front_f[:, 0], kind="stable")

        best_codons = problem.decode_solution(front_x[ranked[0]])
        best_dna = ''.join(best_codons)

        # Final metrics for the selected solution
        harmony = problem.harmony_idx.calculate(best_codons)
        context = problem.context_idx.calculate(best_codons)
        outlier = problem.outlier_idx.calculate(best_codons)
        cai = calculate_cai(best_codons, self.codon_table)
        gc = calculate_gc_content(best_dna)

        # Only the five best-by-harmony solutions are decoded and returned.
        pareto_solutions = []
        for i in ranked[:5]:
            codons = problem.decode_solution(front_x[i])
            pareto_solutions.append({
                'dna': ''.join(codons),
                'harmony': float(-front_f[i, 0]),
                'context': float(-front_f[i, 1]),
                'outlier': float(front_f[i, 2]),
            })

        return {
            'protein': protein,
            'optimized_dna': best_dna,
            'codons': best_codons,
            'metrics': {
                'harmony_index': harmony,
                'context_index': context,
                'outlier_index': outlier,
                'cai': cai,
                'gc_content': gc,
                'length_bp': len(best_dna),
                'length_aa': len(protein),
            },
            'pareto_front': pareto_solutions,
            'organism': self.organism,
        }
def quick_optimize(sequence: str, organism: str = "Escherichia coli K12",
                   is_protein: bool = True, excluded_sites: List[str] = None,
                   quality: str = "standard") -> dict:
    """
    Run CodonOptimizer with one of the preset effort levels.

    Args:
        sequence: Input sequence (protein or DNA)
        organism: Target expression host
        is_protein: True if protein sequence, False if DNA
        excluded_sites: Restriction sites to avoid
        quality: "fast", "standard", or "thorough"; any other value is
            treated as "standard"

    Returns:
        Optimization results dictionary
    """
    # Deliberately small settings to keep the web app responsive.
    presets = {
        "fast": {"pop_size": 30, "n_gen": 20},
        "standard": {"pop_size": 50, "n_gen": 40},
        "thorough": {"pop_size": 80, "n_gen": 60},
    }
    if quality in presets:
        settings = presets[quality]
    else:
        settings = presets["standard"]

    engine = CodonOptimizer(organism, excluded_sites)
    return engine.optimize(sequence, is_protein, verbose=False, **settings)
| # Simple fallback optimizer for environments without pymoo | |
class SimpleOptimizer:
    """
    Simpler optimization: greedy initial codon choice followed by
    single-codon hill climbing. Intended as the fallback when the NSGA-III
    path is not used.
    """

    def __init__(self, organism: str = "Escherichia coli K12",
                 excluded_sites: Optional[List[str]] = None):
        """
        Args:
            organism: Target organism name, used to look up the codon table.
            excluded_sites: Sequences to avoid; ``None`` means no sites.
        """
        self.organism = organism
        self.excluded_sites = excluded_sites or []
        self.codon_table = get_codon_table(organism)

    def _validate_protein(self, sequence: str) -> str:
        """Upper-case the input and keep only the 20 standard amino acids."""
        valid_aa = set('ACDEFGHIKLMNPQRSTVWY')
        return ''.join(c for c in sequence.upper() if c in valid_aa)

    def _validate_dna(self, sequence: str) -> str:
        """Upper-case the input and keep only the bases A, T, G and C."""
        valid_bases = set('ATGC')
        return ''.join(c for c in sequence.upper() if c in valid_bases)

    def _select_best_codon(self, aa: str) -> str:
        """
        Return the synonymous codon with the highest value in the codon
        table (codons missing from the table count as 0), or 'NNN' when the
        amino acid has no entry in AA_TO_CODONS.
        """
        if aa not in AA_TO_CODONS:
            return 'NNN'
        return max(AA_TO_CODONS[aa], key=lambda c: self.codon_table.get(c, 0))

    def _check_excluded_sites(self, dna: str) -> List[str]:
        """
        Return the excluded sites that occur in ``dna``.

        Sites are upper-cased before matching. Empty strings are skipped
        because an empty site would match every sequence.
        """
        return [
            site for site in self.excluded_sites
            if site and site.upper() in dna
        ]

    def optimize(self, sequence: str, is_protein: bool = True,
                 iterations: int = 1000) -> dict:
        """
        Optimize using greedy selection with local refinement.

        Args:
            sequence: Input protein or DNA sequence
            is_protein: True if input is protein, False if DNA
            iterations: Number of random single-codon mutations to try

        Returns:
            Dictionary with the protein, optimized DNA, codon list, metrics
            and organism.

        Raises:
            ValueError: If no amino acids remain after cleaning/translation.
        """
        if is_protein:
            protein = self._validate_protein(sequence)
        else:
            dna = self._validate_dna(sequence)
            protein = codons_to_protein(sequence_to_codons(dna))

        if len(protein) == 0:
            raise ValueError("No valid amino acids found")

        # Initial solution: best codon for each position
        best_codons = [self._select_best_codon(aa) for aa in protein]

        harmony_idx = HarmonyIndex(self.organism)
        context_idx = CodonContextIndex(self.organism)
        outlier_idx = OutlierIndex(self.organism, self.excluded_sites)

        def score(codons):
            h = harmony_idx.calculate(codons)
            c = context_idx.calculate(codons)
            o = outlier_idx.calculate(codons)
            return h + c - o  # Higher is better

        best_score = score(best_codons)
        # Sites already present in the greedy start. A move is allowed as
        # long as it introduces no site outside this set, so the search is
        # not frozen when the start itself contains an excluded site.
        present_sites = set(self._check_excluded_sites(''.join(best_codons)))

        # Hill climbing: try random single-codon swaps, keep improvements.
        for _ in range(iterations):
            pos = random.randint(0, len(protein) - 1)
            aa = protein[pos]
            if aa not in AA_TO_CODONS:
                continue
            synonymous = AA_TO_CODONS[aa]
            if len(synonymous) <= 1:
                continue

            current_codon = best_codons[pos]
            alternatives = [c for c in synonymous if c != current_codon]
            if not alternatives:
                continue
            new_codon = random.choice(alternatives)

            test_codons = best_codons.copy()
            test_codons[pos] = new_codon

            # Cheap site check first; skip scoring for rejected candidates.
            test_sites = set(self._check_excluded_sites(''.join(test_codons)))
            if not test_sites <= present_sites:
                continue

            new_score = score(test_codons)
            if new_score > best_score:
                best_codons = test_codons
                best_score = new_score
                present_sites = test_sites

        # Calculate final metrics
        best_dna = ''.join(best_codons)
        harmony = harmony_idx.calculate(best_codons)
        context = context_idx.calculate(best_codons)
        outlier = outlier_idx.calculate(best_codons)
        cai = calculate_cai(best_codons, self.codon_table)
        gc = calculate_gc_content(best_dna)

        return {
            'protein': protein,
            'optimized_dna': best_dna,
            'codons': best_codons,
            'metrics': {
                'harmony_index': harmony,
                'context_index': context,
                'outlier_index': outlier,
                'cai': cai,
                'gc_content': gc,
                'length_bp': len(best_dna),
                'length_aa': len(protein),
            },
            'organism': self.organism,
        }
def optimize_sequence(sequence: str, organism: str = "Escherichia coli K12",
                      is_protein: bool = True, excluded_sites: List[str] = None,
                      use_nsga3: bool = True, quality: str = "standard") -> dict:
    """
    Top-level entry point: run NSGA-III when requested, otherwise (or on
    ImportError) run the simple hill-climbing optimizer.

    Args:
        sequence: Input protein or DNA sequence
        organism: Target host organism
        is_protein: True if protein, False if DNA
        excluded_sites: Restriction sites to exclude
        use_nsga3: Use NSGA-III (requires pymoo) or simple optimizer
        quality: "fast", "standard", or "thorough"; unknown values use the
            "standard" settings

    Returns:
        Optimization results

    NOTE(review): this file imports pymoo at module level, so a missing
    pymoo would fail when the module is imported rather than here; the
    fallback only triggers for an ImportError raised while optimizing.
    """
    if use_nsga3:
        try:
            return quick_optimize(sequence, organism, is_protein, excluded_sites, quality)
        except ImportError:
            print("pymoo not available, falling back to simple optimizer")

    # Reached when NSGA-III was not requested or could not be imported.
    iteration_budget = {"fast": 1000, "standard": 3000, "thorough": 5000}
    iterations = iteration_budget.get(quality, 3000)
    fallback = SimpleOptimizer(organism, excluded_sites)
    return fallback.optimize(sequence, is_protein, iterations)