"""
NSGA-III Multi-objective Codon Optimization Engine
Based on GenScript patent WO2020024917A1.
Uses pymoo for the NSGA-III algorithm implementation.
"""
import numpy as np
import random
from typing import List, Tuple, Optional
from pymoo.core.problem import Problem
from pymoo.algorithms.moo.nsga3 import NSGA3
from pymoo.util.ref_dirs import get_reference_directions
from pymoo.optimize import minimize
from pymoo.operators.crossover.sbx import SBX
from pymoo.operators.mutation.pm import PM
from pymoo.operators.sampling.rnd import FloatRandomSampling
from codon_tables import CODON_TO_AA, AA_TO_CODONS, get_codon_table, get_organism_list
from indices import (
HarmonyIndex, CodonContextIndex, OutlierIndex,
calculate_cai, calculate_gc_content, sequence_to_codons,
codons_to_protein, protein_to_codons_random
)
class CodonOptimizationProblem(Problem):
    """
    Multi-objective optimization problem for codon optimization.

    Objectives (all expressed as minimization, so maximized quantities
    are negated):
    1. Maximize Harmony Index (minimize negative)
    2. Maximize Codon Context Index (minimize negative)
    3. Minimize Outlier Index

    Decision variables: one real value in [0, 1] per amino-acid position,
    mapped to an index into that position's list of synonymous codons.
    """

    # Placeholder codon emitted for residues with no known synonymous codons.
    UNKNOWN_CODON = 'NNN'

    def __init__(self, protein_sequence: str, organism: str,
                 excluded_sites: Optional[List[str]] = None):
        """
        Args:
            protein_sequence: Amino-acid string; upper-cased before use.
            organism: Host name handed to the codon table and index classes.
            excluded_sites: Sequences to avoid; None is treated as no sites.
        """
        self.protein = protein_sequence.upper()
        self.organism = organism
        self.codon_table = get_codon_table(organism)
        self.excluded_sites = excluded_sites or []

        # Build codon choices for each position. A residue that is missing
        # from AA_TO_CODONS (or maps to an empty list) gets the placeholder,
        # so decode_solution always has at least one codon to pick from.
        self.codon_choices = []
        for aa in self.protein:
            choices = AA_TO_CODONS[aa] if aa in AA_TO_CODONS else None
            self.codon_choices.append(choices if choices else [self.UNKNOWN_CODON])
        n_vars = len(self.protein)

        # Initialize index calculators.
        # mRNA structure is disabled in the outlier index for performance.
        # NOTE(review): CodonOptimizer.optimize reuses this same calculator
        # for its final metrics, so mRNA structure is not included there
        # either - confirm whether that is intended.
        self.harmony_idx = HarmonyIndex(organism)
        self.context_idx = CodonContextIndex(organism)
        # Pass the normalized list (never None), as SimpleOptimizer does.
        self.outlier_idx = OutlierIndex(
            organism, self.excluded_sites, include_mrna_structure=False
        )

        super().__init__(
            n_var=n_vars,
            n_obj=3,
            xl=np.zeros(n_vars),
            xu=np.ones(n_vars),
        )

    def decode_solution(self, x: np.ndarray) -> List[str]:
        """Convert one real-valued solution vector to a list of codons."""
        codons = []
        for i, val in enumerate(x):
            choices = self.codon_choices[i]
            last = len(choices) - 1
            # Map [0, 1] onto a codon index; val == 1.0 would land one past
            # the end, and a negative value would silently wrap around, so
            # clamp on both sides.
            idx = max(0, min(int(val * len(choices)), last))
            codons.append(choices[idx])
        return codons

    def _evaluate(self, x: np.ndarray, out: dict, *args, **kwargs):
        """Evaluate every row of ``x`` and store the objectives in out["F"]."""
        f = np.zeros((x.shape[0], 3))
        for i in range(x.shape[0]):
            codons = self.decode_solution(x[i])
            # Objectives: minimize -harmony, minimize -context, minimize outlier
            f[i, 0] = -self.harmony_idx.calculate(codons)
            f[i, 1] = -self.context_idx.calculate(codons)
            f[i, 2] = self.outlier_idx.calculate(codons)
        out["F"] = f
class CodonOptimizer:
    """
    Main codon optimization class using NSGA-III algorithm.
    """

    # Upper bound on Das-Dennis partitions (12 partitions -> 91 directions
    # for three objectives).
    _MAX_PARTITIONS = 12
    # Number of Pareto-front entries returned in the result dictionary.
    _PARETO_PREVIEW = 5

    def __init__(self, organism: str = "Escherichia coli K12",
                 excluded_sites: Optional[List[str]] = None):
        """
        Args:
            organism: Target host name used to look up the codon table.
            excluded_sites: Sequences to avoid; None is treated as no sites.
        """
        self.organism = organism
        self.excluded_sites = excluded_sites or []
        self.codon_table = get_codon_table(organism)

    @staticmethod
    def _das_dennis_partitions(pop_size: int, max_partitions: int = 12) -> int:
        """
        Pick the Das-Dennis partition count for three objectives.

        With p partitions the reference set has (p + 1)(p + 2) / 2 points.
        Returns the largest p (at least 1, at most ``max_partitions``) whose
        reference set is no larger than ``pop_size``, so the population is
        not smaller than the number of reference directions.
        """
        partitions = 1
        while (partitions < max_partitions
               and (partitions + 2) * (partitions + 3) // 2 <= pop_size):
            partitions += 1
        return partitions

    def _validate_protein(self, sequence: str) -> str:
        """Upper-case the sequence and keep only the 20 standard residues.

        Stop markers ('*') are dropped wherever they occur, along with any
        other character outside the standard amino-acid alphabet.
        """
        valid_aa = set('ACDEFGHIKLMNPQRSTVWY')
        return ''.join(c for c in sequence.upper() if c in valid_aa)

    def _validate_dna(self, sequence: str) -> str:
        """Upper-case the sequence and keep only A, T, G and C."""
        valid_bases = set('ATGC')
        return ''.join(c for c in sequence.upper() if c in valid_bases)

    def optimize(self, sequence: str, is_protein: bool = True,
                 pop_size: int = 100, n_gen: int = 100,
                 verbose: bool = False) -> dict:
        """
        Optimize a protein or DNA sequence.

        Args:
            sequence: Input protein or DNA sequence
            is_protein: True if input is protein, False if DNA
            pop_size: Population size for genetic algorithm
            n_gen: Number of generations
            verbose: Print progress

        Returns:
            Dictionary with the cleaned protein, the optimized DNA and its
            codons, a metrics dict, up to five Pareto-front entries ordered
            by descending harmony, and the organism name.

        Raises:
            ValueError: If no valid amino acids remain after cleaning.
        """
        # Parse input
        if is_protein:
            protein = self._validate_protein(sequence)
        else:
            dna = self._validate_dna(sequence)
            codons = sequence_to_codons(dna)
            # Clean the translation the same way as direct protein input so
            # stop markers or unknown symbols never reach the optimizer.
            protein = self._validate_protein(codons_to_protein(codons))
        if not protein:
            raise ValueError("No valid amino acids found in sequence")
        if verbose:
            print(f"Optimizing {len(protein)} amino acids for {self.organism}")

        # Create optimization problem
        problem = CodonOptimizationProblem(
            protein, self.organism, self.excluded_sites
        )

        # Configure NSGA-III. The reference set is sized to the population
        # so small populations do not end up with fewer individuals than
        # reference directions; the default pop_size of 100 still uses 12
        # partitions.
        n_partitions = self._das_dennis_partitions(pop_size, self._MAX_PARTITIONS)
        ref_dirs = get_reference_directions(
            "das-dennis", 3, n_partitions=n_partitions
        )
        algorithm = NSGA3(
            pop_size=pop_size,
            ref_dirs=ref_dirs,
            sampling=FloatRandomSampling(),
            crossover=SBX(prob=0.9, eta=15),
            mutation=PM(eta=20),
            eliminate_duplicates=True
        )

        # Run optimization (fixed seed keeps runs reproducible)
        result = minimize(
            problem,
            algorithm,
            ('n_gen', n_gen),
            seed=42,
            verbose=verbose
        )

        # Best solution = best harmony (most negative objective = highest)
        best_idx = np.argmin(result.F[:, 0])
        best_codons = problem.decode_solution(result.X[best_idx])
        best_dna = ''.join(best_codons)

        # Calculate final metrics
        harmony = problem.harmony_idx.calculate(best_codons)
        context = problem.context_idx.calculate(best_codons)
        outlier = problem.outlier_idx.calculate(best_codons)
        cai = calculate_cai(best_codons, self.codon_table)
        gc = calculate_gc_content(best_dna)

        # Pareto preview: rank by harmony (the same criterion used to pick
        # the best solution) and decode only the entries that are returned.
        ranked = np.argsort(result.F[:, 0], kind="stable")[:self._PARETO_PREVIEW]
        pareto_solutions = [
            {
                'dna': ''.join(problem.decode_solution(result.X[i])),
                'harmony': -result.F[i, 0],
                'context': -result.F[i, 1],
                'outlier': result.F[i, 2],
            }
            for i in ranked
        ]

        return {
            'protein': protein,
            'optimized_dna': best_dna,
            'codons': best_codons,
            'metrics': {
                'harmony_index': harmony,
                'context_index': context,
                'outlier_index': outlier,
                'cai': cai,
                'gc_content': gc,
                'length_bp': len(best_dna),
                'length_aa': len(protein),
            },
            'pareto_front': pareto_solutions,  # Top 5 solutions by harmony
            'organism': self.organism,
        }
def quick_optimize(sequence: str, organism: str = "Escherichia coli K12",
                   is_protein: bool = True, excluded_sites: List[str] = None,
                   quality: str = "standard") -> dict:
    """
    Run CodonOptimizer with one of three preset effort levels.

    Args:
        sequence: Input sequence (protein or DNA)
        organism: Target expression host
        is_protein: True if protein sequence, False if DNA
        excluded_sites: Restriction sites to avoid
        quality: "fast", "standard", or "thorough"; any other value
            falls back to the "standard" preset

    Returns:
        Optimization results dictionary from CodonOptimizer.optimize
    """
    # (population size, generations) per level - kept small for web app
    # performance
    effort_levels = {
        "fast": (30, 20),
        "standard": (50, 40),
        "thorough": (80, 60),
    }
    level = quality if quality in effort_levels else "standard"
    population, generations = effort_levels[level]

    engine = CodonOptimizer(organism, excluded_sites)
    return engine.optimize(
        sequence,
        is_protein,
        pop_size=population,
        n_gen=generations,
        verbose=False,
    )
# Simple fallback optimizer for environments without pymoo
class SimpleOptimizer:
    """
    Simpler optimization using greedy codon selection and hill climbing.
    Fallback when pymoo is not available.
    """

    def __init__(self, organism: str = "Escherichia coli K12",
                 excluded_sites: Optional[List[str]] = None):
        """
        Args:
            organism: Target host name used to look up the codon table.
            excluded_sites: Sequences to avoid; None is treated as no sites.
        """
        self.organism = organism
        self.excluded_sites = excluded_sites or []
        self.codon_table = get_codon_table(organism)

    def _validate_protein(self, sequence: str) -> str:
        """Upper-case the sequence and keep only the 20 standard residues."""
        valid_aa = set('ACDEFGHIKLMNPQRSTVWY')
        return ''.join(c for c in sequence.upper() if c in valid_aa)

    def _validate_dna(self, sequence: str) -> str:
        """Upper-case the sequence and keep only A, T, G and C."""
        valid_bases = set('ATGC')
        return ''.join(c for c in sequence.upper() if c in valid_bases)

    def _select_best_codon(self, aa: str) -> str:
        """Select the most preferred codon for an amino acid.

        Returns 'NNN' when the amino acid has no entry in AA_TO_CODONS;
        codons missing from the codon table are scored as 0.
        """
        if aa not in AA_TO_CODONS:
            return 'NNN'
        return max(AA_TO_CODONS[aa], key=lambda c: self.codon_table.get(c, 0))

    def _active_sites(self) -> List[str]:
        """Return the excluded sites upper-cased, skipping empty entries.

        An empty string is a substring of every sequence, so keeping it
        would make every candidate look like it contains an excluded site.
        """
        return [site.upper() for site in self.excluded_sites if site]

    @staticmethod
    def _count_excluded_sites(dna: str, sites: List[str]) -> int:
        """Count non-overlapping occurrences of every site in ``dna``."""
        return sum(dna.count(site) for site in sites)

    def _check_excluded_sites(self, dna: str) -> List[str]:
        """Return the configured sites (as given) that occur in ``dna``.

        Matching is case-insensitive on both sides; empty sites are ignored.
        """
        haystack = dna.upper()
        return [
            site for site in self.excluded_sites
            if site and site.upper() in haystack
        ]

    def optimize(self, sequence: str, is_protein: bool = True,
                 iterations: int = 1000) -> dict:
        """
        Optimize using greedy selection with local refinement.

        Starts from the most preferred codon at every position, then tries
        ``iterations`` random single-codon substitutions. A substitution is
        kept when it lowers the number of excluded-site occurrences, or
        keeps that number unchanged and raises the combined score
        (harmony + context - outlier).

        Args:
            sequence: Input protein or DNA sequence
            is_protein: True if input is protein, False if DNA
            iterations: Number of substitution attempts

        Returns:
            Dictionary with the cleaned protein, optimized DNA, codons,
            metrics and organism name.

        Raises:
            ValueError: If no valid amino acids remain after cleaning.
        """
        if is_protein:
            protein = self._validate_protein(sequence)
        else:
            dna = self._validate_dna(sequence)
            codons = sequence_to_codons(dna)
            # Clean the translation like direct protein input so stop
            # markers or unknown symbols do not turn into 'NNN' codons.
            protein = self._validate_protein(codons_to_protein(codons))
        if not protein:
            raise ValueError("No valid amino acids found")

        # Initial solution: best codon for each position
        best_codons = [self._select_best_codon(aa) for aa in protein]

        # Initialize indices
        harmony_idx = HarmonyIndex(self.organism)
        context_idx = CodonContextIndex(self.organism)
        outlier_idx = OutlierIndex(self.organism, self.excluded_sites)

        def score(codons):
            h = harmony_idx.calculate(codons)
            c = context_idx.calculate(codons)
            o = outlier_idx.calculate(codons)
            return h + c - o  # Higher is better

        sites = self._active_sites()
        best_score = score(best_codons)
        best_hits = self._count_excluded_sites(''.join(best_codons), sites)

        # Hill climbing over random single-codon substitutions
        for _ in range(iterations):
            pos = random.randint(0, len(protein) - 1)
            aa = protein[pos]
            if aa not in AA_TO_CODONS:
                continue
            synonymous = AA_TO_CODONS[aa]
            if len(synonymous) <= 1:
                continue

            # Try alternative codon
            current_codon = best_codons[pos]
            alternatives = [c for c in synonymous if c != current_codon]
            new_codon = random.choice(alternatives)
            test_codons = best_codons.copy()
            test_codons[pos] = new_codon

            # Cheap site check first: a candidate that adds excluded sites
            # is rejected without paying for the score calculation.
            new_hits = self._count_excluded_sites(''.join(test_codons), sites)
            if new_hits > best_hits:
                continue

            new_score = score(test_codons)
            # Fewer excluded sites always wins, so a greedy start that
            # already contains sites can still be repaired step by step.
            # With equal counts the move must improve the score.
            if new_hits < best_hits or new_score > best_score:
                best_codons = test_codons
                best_score = new_score
                best_hits = new_hits

        # Calculate final metrics
        best_dna = ''.join(best_codons)
        harmony = harmony_idx.calculate(best_codons)
        context = context_idx.calculate(best_codons)
        outlier = outlier_idx.calculate(best_codons)
        cai = calculate_cai(best_codons, self.codon_table)
        gc = calculate_gc_content(best_dna)

        return {
            'protein': protein,
            'optimized_dna': best_dna,
            'codons': best_codons,
            'metrics': {
                'harmony_index': harmony,
                'context_index': context,
                'outlier_index': outlier,
                'cai': cai,
                'gc_content': gc,
                'length_bp': len(best_dna),
                'length_aa': len(protein),
            },
            'organism': self.organism,
        }
def optimize_sequence(sequence: str, organism: str = "Escherichia coli K12",
                      is_protein: bool = True, excluded_sites: List[str] = None,
                      use_nsga3: bool = True, quality: str = "standard") -> dict:
    """
    Top-level codon optimization entry point.

    Tries the NSGA-III path via quick_optimize when requested; if that call
    raises ImportError, or when NSGA-III is not requested, runs
    SimpleOptimizer instead.

    Args:
        sequence: Input protein or DNA sequence
        organism: Target host organism
        is_protein: True if protein, False if DNA
        excluded_sites: Restriction sites to exclude
        use_nsga3: Use NSGA-III (requires pymoo) or simple optimizer
        quality: "fast", "standard", or "thorough"; unknown values use the
            "standard" settings

    Returns:
        Optimization results
    """
    if use_nsga3:
        try:
            return quick_optimize(sequence, organism, is_protein, excluded_sites, quality)
        except ImportError:
            print("pymoo not available, falling back to simple optimizer")

    # Reached when NSGA-III was not requested or failed with ImportError.
    iteration_budget = {"fast": 1000, "standard": 3000, "thorough": 5000}
    n_iterations = iteration_budget[quality] if quality in iteration_budget else 3000
    fallback = SimpleOptimizer(organism, excluded_sites)
    return fallback.optimize(sequence, is_protein, n_iterations)
|