codon-optimizer / optimizer.py
joeyisgoed's picture
Upload optimizer.py with huggingface_hub
5942687 verified
"""
NSGA-III Multi-objective Codon Optimization Engine
Based on GenScript patent WO2020024917A1.
Uses pymoo for the NSGA-III algorithm implementation.
"""
import numpy as np
import random
from typing import List, Tuple, Optional
from pymoo.core.problem import Problem
from pymoo.algorithms.moo.nsga3 import NSGA3
from pymoo.util.ref_dirs import get_reference_directions
from pymoo.optimize import minimize
from pymoo.operators.crossover.sbx import SBX
from pymoo.operators.mutation.pm import PM
from pymoo.operators.sampling.rnd import FloatRandomSampling
from codon_tables import CODON_TO_AA, AA_TO_CODONS, get_codon_table, get_organism_list
from indices import (
HarmonyIndex, CodonContextIndex, OutlierIndex,
calculate_cai, calculate_gc_content, sequence_to_codons,
codons_to_protein, protein_to_codons_random
)
class CodonOptimizationProblem(Problem):
    """
    Multi-objective codon-optimization problem for pymoo.

    Objectives (all expressed as minimization):
        1. Negated Harmony Index       (i.e. maximize harmony)
        2. Negated Codon Context Index (i.e. maximize context)
        3. Outlier Index

    Decision variables: one real value in [0, 1] per amino-acid position.
    Each value is mapped onto the list of synonymous codons available at
    that position (see ``decode_solution``).
    """

    def __init__(self, protein_sequence: str, organism: str,
                 excluded_sites: Optional[List[str]] = None):
        """
        Args:
            protein_sequence: Amino-acid sequence (upper-cased internally).
            organism: Organism name forwarded to ``get_codon_table`` and to
                the index calculators.
            excluded_sites: Sequences to avoid; ``None`` is treated as an
                empty list.
        """
        self.protein = protein_sequence.upper()
        self.organism = organism
        self.codon_table = get_codon_table(organism)
        self.excluded_sites = excluded_sites or []

        # Synonymous codon choices for every position. Residues missing from
        # AA_TO_CODONS get the single placeholder codon 'NNN'.
        self.codon_choices = [
            AA_TO_CODONS[aa] if aa in AA_TO_CODONS else ['NNN']
            for aa in self.protein
        ]
        n_vars = len(self.protein)

        # Index calculators. mRNA structure scoring is switched off here
        # (include_mrna_structure=False) for performance.
        # NOTE(review): an earlier comment said the structure term would be
        # computed for the final result only; nothing in this class does
        # that - confirm whether the caller is expected to.
        self.harmony_idx = HarmonyIndex(organism)
        self.context_idx = CodonContextIndex(organism)
        # Pass the normalised list, never the raw (possibly None) argument,
        # so OutlierIndex always receives a list.
        self.outlier_idx = OutlierIndex(
            organism, self.excluded_sites, include_mrna_structure=False
        )

        super().__init__(
            n_var=n_vars,
            n_obj=3,
            xl=np.zeros(n_vars),
            xu=np.ones(n_vars),
        )

    def decode_solution(self, x: np.ndarray) -> List[str]:
        """
        Convert one real-valued decision vector into a codon list.

        Args:
            x: 1-D vector with one value per amino-acid position.

        Returns:
            The codon chosen at each position, in order.
        """
        codons = []
        for i, val in enumerate(x):
            choices = self.codon_choices[i]
            last = len(choices) - 1
            # Scale to an index into the choices. The upper clamp handles
            # val == 1.0 (the bounds are inclusive); the lower clamp guards
            # against a value that drifts marginally below zero.
            idx = max(0, min(int(val * len(choices)), last))
            codons.append(choices[idx])
        return codons

    def _evaluate(self, x: np.ndarray, out: dict, *args, **kwargs):
        """
        Evaluate the three objectives for every row of ``x``.

        Writes an array of shape ``(x.shape[0], 3)`` to ``out["F"]``.
        """
        f = np.zeros((x.shape[0], 3))
        for i in range(x.shape[0]):
            codons = self.decode_solution(x[i])
            # Everything is minimized, so the two "higher is better"
            # indices are negated.
            f[i, 0] = -self.harmony_idx.calculate(codons)
            f[i, 1] = -self.context_idx.calculate(codons)
            f[i, 2] = self.outlier_idx.calculate(codons)
        out["F"] = f
class CodonOptimizer:
    """
    Main codon optimization class using the NSGA-III algorithm.
    """

    # The 20 standard amino-acid one-letter codes accepted as input.
    _VALID_AMINO_ACIDS = frozenset('ACDEFGHIKLMNPQRSTVWY')
    # Bases kept by _validate_dna (after U has been rewritten to T).
    _VALID_BASES = frozenset('ATGC')
    # Upper limit on Das-Dennis partitions (91 directions for 3 objectives).
    _MAX_PARTITIONS = 12
    # Number of Pareto-front entries included in the result dictionary.
    _PARETO_REPORT_SIZE = 5

    def __init__(self, organism: str = "Escherichia coli K12",
                 excluded_sites: Optional[List[str]] = None):
        """
        Args:
            organism: Target expression host, forwarded to
                ``get_codon_table``.
            excluded_sites: Sequences to avoid; ``None`` means none.
        """
        self.organism = organism
        self.excluded_sites = excluded_sites or []
        self.codon_table = get_codon_table(organism)

    def _validate_protein(self, sequence: str) -> str:
        """
        Return ``sequence`` upper-cased and reduced to the 20 standard
        amino-acid letters.

        Stop markers (``*``), whitespace and any other character are
        dropped, wherever they occur.
        """
        return ''.join(
            c for c in sequence.upper() if c in self._VALID_AMINO_ACIDS
        )

    def _validate_dna(self, sequence: str) -> str:
        """
        Return ``sequence`` upper-cased and reduced to A/T/G/C.

        ``U`` is rewritten to ``T`` first so RNA input keeps its reading
        frame; simply discarding U would shift every downstream codon.
        """
        normalised = sequence.upper().replace('U', 'T')
        return ''.join(c for c in normalised if c in self._VALID_BASES)

    @staticmethod
    def _n_partitions_for(pop_size: int, max_partitions: int = 12) -> int:
        """
        Largest Das-Dennis partition count (at most ``max_partitions``,
        at least 1) whose number of 3-objective reference directions does
        not exceed ``pop_size``.

        For 3 objectives, ``p`` partitions give ``(p + 1) * (p + 2) / 2``
        directions. NSGA-III expects pop_size >= number of directions.
        """
        p = max_partitions
        while p > 1 and (p + 1) * (p + 2) // 2 > pop_size:
            p -= 1
        return p

    def optimize(self, sequence: str, is_protein: bool = True,
                 pop_size: int = 100, n_gen: int = 100,
                 verbose: bool = False) -> dict:
        """
        Optimize a protein or DNA sequence.

        Args:
            sequence: Input protein or DNA sequence.
            is_protein: True if input is protein, False if DNA.
            pop_size: Population size for the genetic algorithm.
            n_gen: Number of generations.
            verbose: Print progress.

        Returns:
            Dictionary with keys ``protein``, ``optimized_dna``, ``codons``,
            ``metrics``, ``pareto_front`` (up to five distinct solutions,
            best harmony first) and ``organism``.

        Raises:
            ValueError: If no valid amino acids remain after cleaning.
        """
        # Parse input. The translated DNA goes through the same protein
        # clean-up as direct protein input, so stop markers are handled
        # identically on both paths.
        if is_protein:
            protein = self._validate_protein(sequence)
        else:
            dna = self._validate_dna(sequence)
            codons = sequence_to_codons(dna)
            protein = self._validate_protein(codons_to_protein(codons))

        if len(protein) == 0:
            raise ValueError("No valid amino acids found in sequence")

        if verbose:
            print(f"Optimizing {len(protein)} amino acids for {self.organism}")

        problem = CodonOptimizationProblem(
            protein, self.organism, self.excluded_sites
        )

        # Size the reference-direction set to the population: a fixed 12
        # partitions yields 91 directions, more than the smaller population
        # sizes callers pass in.
        n_partitions = self._n_partitions_for(pop_size, self._MAX_PARTITIONS)
        ref_dirs = get_reference_directions(
            "das-dennis", 3, n_partitions=n_partitions
        )
        algorithm = NSGA3(
            pop_size=pop_size,
            ref_dirs=ref_dirs,
            sampling=FloatRandomSampling(),
            crossover=SBX(prob=0.9, eta=15),
            mutation=PM(eta=20),
            eliminate_duplicates=True
        )

        result = minimize(
            problem,
            algorithm,
            ('n_gen', n_gen),
            seed=42,
            verbose=verbose
        )

        # atleast_2d keeps the indexing below valid even if only a single
        # solution comes back as a 1-D array.
        front_x = np.atleast_2d(result.X)
        front_f = np.atleast_2d(result.F)

        # Rank by the first objective (-harmony): most negative = best.
        # A stable sort keeps the first of any tied solutions in front,
        # matching what argmin would pick.
        ranking = np.argsort(front_f[:, 0], kind="stable")
        best_codons = problem.decode_solution(front_x[ranking[0]])
        best_dna = ''.join(best_codons)

        # Final metrics for the selected solution.
        harmony = problem.harmony_idx.calculate(best_codons)
        context = problem.context_idx.calculate(best_codons)
        outlier = problem.outlier_idx.calculate(best_codons)
        cai = calculate_cai(best_codons, self.codon_table)
        gc = calculate_gc_content(best_dna)

        # Report the best few distinct sequences. Different decision
        # vectors can decode to the same DNA, so duplicates are skipped.
        pareto_solutions = []
        seen_dna = set()
        for i in ranking:
            candidate_dna = ''.join(problem.decode_solution(front_x[i]))
            if candidate_dna in seen_dna:
                continue
            seen_dna.add(candidate_dna)
            pareto_solutions.append({
                'dna': candidate_dna,
                'harmony': float(-front_f[i, 0]),
                'context': float(-front_f[i, 1]),
                'outlier': float(front_f[i, 2]),
            })
            if len(pareto_solutions) >= self._PARETO_REPORT_SIZE:
                break

        return {
            'protein': protein,
            'optimized_dna': best_dna,
            'codons': best_codons,
            'metrics': {
                'harmony_index': harmony,
                'context_index': context,
                'outlier_index': outlier,
                'cai': cai,
                'gc_content': gc,
                'length_bp': len(best_dna),
                'length_aa': len(protein),
            },
            'pareto_front': pareto_solutions,
            'organism': self.organism,
        }
def quick_optimize(sequence: str, organism: str = "Escherichia coli K12",
                   is_protein: bool = True, excluded_sites: List[str] = None,
                   quality: str = "standard") -> dict:
    """
    Run ``CodonOptimizer.optimize`` with a preset search budget.

    Args:
        sequence: Input sequence (protein or DNA).
        organism: Target expression host.
        is_protein: True if ``sequence`` is protein, False if DNA.
        excluded_sites: Restriction sites to avoid.
        quality: "fast", "standard", or "thorough"; any other value is
            treated as "standard".

    Returns:
        The result dictionary produced by ``CodonOptimizer.optimize``.
    """
    # (population size, generations) per quality level - kept small for
    # web app performance.
    budgets = {
        "fast": (30, 20),
        "standard": (50, 40),
        "thorough": (80, 60),
    }
    population, generations = budgets.get(quality, budgets["standard"])

    engine = CodonOptimizer(organism, excluded_sites)
    return engine.optimize(
        sequence,
        is_protein,
        pop_size=population,
        n_gen=generations,
        verbose=False,
    )
# Simple fallback optimizer for environments without pymoo
class SimpleOptimizer:
    """
    Simpler optimization: greedy best-codon start followed by random
    single-codon hill climbing.
    Fallback when pymoo is not available.
    """

    # The 20 standard amino-acid one-letter codes accepted as input.
    _VALID_AMINO_ACIDS = frozenset('ACDEFGHIKLMNPQRSTVWY')
    # Bases kept by _validate_dna (after U has been rewritten to T).
    _VALID_BASES = frozenset('ATGC')

    def __init__(self, organism: str = "Escherichia coli K12",
                 excluded_sites: Optional[List[str]] = None):
        """
        Args:
            organism: Target expression host, forwarded to
                ``get_codon_table`` and the index calculators.
            excluded_sites: Sequences to avoid; ``None`` means none.
        """
        self.organism = organism
        self.excluded_sites = excluded_sites or []
        self.codon_table = get_codon_table(organism)

    def _validate_protein(self, sequence: str) -> str:
        """Upper-case ``sequence`` and keep only the 20 standard residues."""
        return ''.join(
            c for c in sequence.upper() if c in self._VALID_AMINO_ACIDS
        )

    def _validate_dna(self, sequence: str) -> str:
        """
        Upper-case ``sequence`` and keep only A/T/G/C.

        ``U`` is rewritten to ``T`` first so RNA input keeps its reading
        frame instead of losing a base at every U.
        """
        normalised = sequence.upper().replace('U', 'T')
        return ''.join(c for c in normalised if c in self._VALID_BASES)

    def _select_best_codon(self, aa: str) -> str:
        """
        Return the synonymous codon for ``aa`` with the highest value in
        the organism's codon table, or 'NNN' if ``aa`` has no entry in
        AA_TO_CODONS. Codons missing from the table count as 0.
        """
        if aa not in AA_TO_CODONS:
            return 'NNN'
        return max(AA_TO_CODONS[aa], key=lambda c: self.codon_table.get(c, 0))

    def _check_excluded_sites(self, dna: str) -> List[str]:
        """
        Return the excluded sites that occur in ``dna``.

        Sites are upper-cased before matching; ``dna`` is matched as given.
        Empty site strings are ignored - an empty string is a substring of
        every sequence and would otherwise flag everything.
        """
        return [
            site for site in self.excluded_sites
            if site and site.upper() in dna
        ]

    def optimize(self, sequence: str, is_protein: bool = True,
                 iterations: int = 1000) -> dict:
        """
        Optimize using greedy selection with local refinement.

        Args:
            sequence: Input protein or DNA sequence.
            is_protein: True if input is protein, False if DNA.
            iterations: Number of random single-codon moves to try.

        Returns:
            Dictionary with keys ``protein``, ``optimized_dna``, ``codons``,
            ``metrics`` and ``organism``.

        Raises:
            ValueError: If no valid amino acids remain after cleaning.
        """
        # The translated DNA goes through the same clean-up as direct
        # protein input, so both paths see only standard residues.
        if is_protein:
            protein = self._validate_protein(sequence)
        else:
            dna = self._validate_dna(sequence)
            codons = sequence_to_codons(dna)
            protein = self._validate_protein(codons_to_protein(codons))

        if len(protein) == 0:
            raise ValueError("No valid amino acids found")

        # Initial solution: best codon for each position.
        best_codons = [self._select_best_codon(aa) for aa in protein]

        harmony_idx = HarmonyIndex(self.organism)
        context_idx = CodonContextIndex(self.organism)
        outlier_idx = OutlierIndex(self.organism, self.excluded_sites)

        def score(codons):
            h = harmony_idx.calculate(codons)
            c = context_idx.calculate(codons)
            o = outlier_idx.calculate(codons)
            return h + c - o  # Higher is better

        best_score = score(best_codons)
        # Number of distinct excluded sites in the current solution. The
        # greedy start may already contain some.
        best_violations = len(self._check_excluded_sites(''.join(best_codons)))

        # Hill climbing: one random synonymous substitution per iteration.
        for _ in range(iterations):
            pos = random.randint(0, len(protein) - 1)
            aa = protein[pos]
            if aa not in AA_TO_CODONS:
                continue
            synonymous = AA_TO_CODONS[aa]
            if len(synonymous) <= 1:
                continue

            current_codon = best_codons[pos]
            alternatives = [c for c in synonymous if c != current_codon]
            test_codons = best_codons.copy()
            test_codons[pos] = random.choice(alternatives)

            # Cheap check first: never accept a move that adds excluded
            # sites, and skip scoring it altogether.
            violations = len(self._check_excluded_sites(''.join(test_codons)))
            if violations > best_violations:
                continue

            new_score = score(test_codons)
            # Lexicographic acceptance: fewer excluded sites always wins;
            # at an equal count the score must improve. Demanding a
            # site-free candidate outright would freeze the search whenever
            # the starting sequence contains an excluded site.
            if violations < best_violations or new_score > best_score:
                best_codons = test_codons
                best_score = new_score
                best_violations = violations

        # Calculate final metrics
        best_dna = ''.join(best_codons)
        harmony = harmony_idx.calculate(best_codons)
        context = context_idx.calculate(best_codons)
        outlier = outlier_idx.calculate(best_codons)
        cai = calculate_cai(best_codons, self.codon_table)
        gc = calculate_gc_content(best_dna)

        return {
            'protein': protein,
            'optimized_dna': best_dna,
            'codons': best_codons,
            'metrics': {
                'harmony_index': harmony,
                'context_index': context,
                'outlier_index': outlier,
                'cai': cai,
                'gc_content': gc,
                'length_bp': len(best_dna),
                'length_aa': len(protein),
            },
            'organism': self.organism,
        }
def optimize_sequence(sequence: str, organism: str = "Escherichia coli K12",
                      is_protein: bool = True, excluded_sites: List[str] = None,
                      use_nsga3: bool = True, quality: str = "standard") -> dict:
    """
    Main entry point for codon optimization.

    Tries the NSGA-III path first when requested and falls back to
    ``SimpleOptimizer`` if that path raises ``ImportError``.

    Args:
        sequence: Input protein or DNA sequence.
        organism: Target host organism.
        is_protein: True if protein, False if DNA.
        excluded_sites: Restriction sites to exclude.
        use_nsga3: Use NSGA-III (requires pymoo) or the simple optimizer.
        quality: "fast", "standard", or "thorough"; any other value uses
            the "standard" budget.

    Returns:
        Optimization results dictionary.
    """
    if use_nsga3:
        # NOTE(review): pymoo is imported at the top of this module, so a
        # missing pymoo would presumably fail at import time rather than
        # here - confirm whether this fallback can actually trigger.
        try:
            return quick_optimize(
                sequence, organism, is_protein, excluded_sites, quality
            )
        except ImportError:
            print("pymoo not available, falling back to simple optimizer")

    # Reached when NSGA-III was not requested or could not be imported.
    iteration_budget = {"fast": 1000, "standard": 3000, "thorough": 5000}
    iterations = iteration_budget.get(quality, 3000)
    fallback = SimpleOptimizer(organism, excluded_sites)
    return fallback.optimize(sequence, is_protein, iterations)