Spaces:

joeyisgoed
/

codon-optimizer

Sleeping

App Files Files Community

codon-optimizer / indices.py

joeyisgoed

Fix GC content: add organism-specific targets and stronger penalties

34c14c8 verified 2 months ago

raw

history blame contribute delete

23.3 kB

	"""
	Optimization indices based on GenScript patent WO2020024917A1.
	Implements Harmony Index, Codon Context Index, and Outlier Index.
	Includes mRNA secondary structure prediction using seqfold.
	"""

	import math
	import re
	from collections import Counter
	from typing import Dict, List, Tuple, Optional
	from codon_tables import CODON_TO_AA, AA_TO_CODONS, get_codon_table

	# Try to import seqfold for mRNA structure prediction
	try:
	from seqfold import dg, fold, dot_bracket
	SEQFOLD_AVAILABLE = True
	except ImportError:
	SEQFOLD_AVAILABLE = False


	def cosine_distance(v1: List[float], v2: List[float]) -> float:
	"""Calculate cosine distance between two vectors."""
	dot_product = sum(a * b for a, b in zip(v1, v2))
	norm1 = math.sqrt(sum(a * a for a in v1))
	norm2 = math.sqrt(sum(b * b for b in v2))

	if norm1 == 0 or norm2 == 0:
	return 1.0

	similarity = dot_product / (norm1 * norm2)
	return 1.0 - similarity


	def euclidean_distance(v1: List[float], v2: List[float]) -> float:
	"""Calculate normalized Euclidean distance between two vectors."""
	if len(v1) != len(v2):
	raise ValueError("Vectors must have same length")

	squared_diff = sum((a - b) ** 2 for a, b in zip(v1, v2))
	max_dist = math.sqrt(len(v1)) # Max possible distance when all elements differ by 1

	return math.sqrt(squared_diff) / max_dist if max_dist > 0 else 0


	def sequence_to_codons(dna_sequence: str) -> List[str]:
	"""Convert DNA sequence to list of codons."""
	dna_sequence = dna_sequence.upper().replace(" ", "").replace("\n", "")
	codons = []
	for i in range(0, len(dna_sequence) - 2, 3):
	codons.append(dna_sequence[i:i+3])
	return codons


	def codons_to_protein(codons: List[str]) -> str:
	"""Convert codon list to protein sequence."""
	protein = ""
	for codon in codons:
	aa = CODON_TO_AA.get(codon, 'X')
	if aa != '*':
	protein += aa
	return protein


	def protein_to_codons_random(protein: str, codon_table: dict) -> List[str]:
	"""Convert protein to codons using weighted random selection based on usage table."""
	import random
	codons = []
	for aa in protein.upper():
	if aa not in AA_TO_CODONS:
	continue
	synonymous = AA_TO_CODONS[aa]
	# Weight by codon usage
	weights = [codon_table.get(c, 0.1) for c in synonymous]
	total = sum(weights)
	weights = [w / total for w in weights]
	codon = random.choices(synonymous, weights=weights, k=1)[0]
	codons.append(codon)
	return codons


	class HarmonyIndex:
	"""
	Harmony Index (H) measures how well the codon usage in the candidate sequence
	matches the codon usage distribution of highly-expressed genes in the host.

	H = 1 - D(F_hs, F_ts)

	Where:
	- F_hs = codon frequency vector from highly-expressed genes (reference)
	- F_ts = codon frequency vector from target sequence
	- D = distance function (cosine or euclidean)
	"""

	def __init__(self, organism: str):
	self.codon_table = get_codon_table(organism)
	self.reference_freqs = self._build_reference_vector()

	def _build_reference_vector(self) -> List[float]:
	"""Build reference frequency vector from codon table (59 codons, excluding stops and single-codon AAs)."""
	# All codons except stop codons
	codons = [c for c in self.codon_table.keys() if CODON_TO_AA.get(c) != '*']
	return [self.codon_table.get(c, 0) for c in sorted(codons)]

	def _build_sequence_vector(self, codons: List[str]) -> List[float]:
	"""Build frequency vector from a codon sequence."""
	# Count codons
	codon_counts = Counter(codons)
	total = len(codons)

	if total == 0:
	return [0.0] * 61 # 64 - 3 stop codons

	# Build vector matching reference order (all codons except stops)
	all_codons = [c for c in self.codon_table.keys() if CODON_TO_AA.get(c) != '*']

	# Calculate frequencies per amino acid group
	freqs = []
	for codon in sorted(all_codons):
	aa = CODON_TO_AA[codon]
	synonymous = AA_TO_CODONS[aa]
	aa_total = sum(codon_counts.get(c, 0) for c in synonymous)
	if aa_total > 0:
	freq = codon_counts.get(codon, 0) / aa_total
	else:
	freq = 0.0
	freqs.append(freq)

	return freqs

	def calculate(self, codons: List[str]) -> float:
	"""
	Calculate Harmony Index for a codon sequence.
	Returns value between 0 and 1, where 1 is perfect harmony.
	"""
	target_freqs = self._build_sequence_vector(codons)
	distance = cosine_distance(self.reference_freqs, target_freqs)
	return 1.0 - distance


	class CodonContextIndex:
	"""
	Codon Context Index (CC) evaluates how well codon pairs in the candidate
	sequence match the codon pair preferences of highly-expressed genes.

	CC = 1 - D(F_hcc, F_tcc)

	Where:
	- F_hcc = codon-pair frequency vector from reference
	- F_tcc = codon-pair frequency vector from target sequence
	"""

	def __init__(self, organism: str):
	self.organism = organism
	self.codon_table = get_codon_table(organism)
	# For simplicity, we use a basic model based on individual codon preferences
	# A full implementation would use experimental codon pair frequency data

	def _calculate_pair_score(self, codon1: str, codon2: str) -> float:
	"""Calculate preference score for a codon pair based on individual codon usage."""
	freq1 = self.codon_table.get(codon1, 0.1)
	freq2 = self.codon_table.get(codon2, 0.1)
	# Geometric mean of frequencies as a simple pair score
	return math.sqrt(freq1 * freq2)

	def calculate(self, codons: List[str]) -> float:
	"""
	Calculate Codon Context Index.
	Returns value between 0 and 1, where 1 is optimal context.
	"""
	if len(codons) < 2:
	return 1.0

	total_score = 0.0
	max_score = 0.0

	for i in range(len(codons) - 1):
	pair_score = self._calculate_pair_score(codons[i], codons[i + 1])
	total_score += pair_score
	max_score += 1.0 # Maximum possible score per pair

	if max_score == 0:
	return 1.0

	return total_score / max_score


	# =============================================================================
	# mRNA Secondary Structure Analysis
	# =============================================================================

	class mRNAStructureAnalyzer:
	"""
	Analyzes mRNA secondary structure using seqfold.

	Key metrics:
	- 5' end folding energy (critical for translation initiation)
	- Full sequence minimum free energy (MFE)
	- Hairpin detection in sliding windows
	"""

	def __init__(self, temperature: float = 37.0):
	"""
	Initialize analyzer.

	Args:
	temperature: Temperature in Celsius for folding calculations
	"""
	self.temperature = temperature
	self.available = SEQFOLD_AVAILABLE

	def dna_to_rna(self, dna_seq: str) -> str:
	"""Convert DNA sequence to RNA (T -> U)."""
	return dna_seq.upper().replace('T', 'U')

	def calculate_mfe(self, sequence: str, is_rna: bool = False, max_length: int = 200) -> float:
	"""
	Calculate minimum free energy (MFE) for a sequence.

	Args:
	sequence: DNA or RNA sequence
	is_rna: If True, sequence is already RNA
	max_length: Maximum sequence length to analyze (longer sequences are truncated)

	Returns:
	MFE in kcal/mol (more negative = more stable structure)
	"""
	if not self.available:
	return 0.0

	if not is_rna:
	sequence = self.dna_to_rna(sequence)

	if len(sequence) < 4:
	return 0.0

	# Truncate long sequences for performance (RNA folding is O(n³))
	if len(sequence) > max_length:
	sequence = sequence[:max_length]

	try:
	return dg(sequence, temp=self.temperature)
	except Exception:
	return 0.0

	def calculate_5prime_mfe(self, sequence: str, window_size: int = 50) -> float:
	"""
	Calculate MFE for the 5' region (critical for translation initiation).

	Stable structures at the 5' end can reduce translation by up to 250-fold.

	Args:
	sequence: DNA sequence
	window_size: Number of nucleotides from 5' end to analyze

	Returns:
	MFE of 5' region in kcal/mol
	"""
	if len(sequence) < window_size:
	return self.calculate_mfe(sequence)

	return self.calculate_mfe(sequence[:window_size])

	def find_hairpins(self, sequence: str, window_size: int = 60,
	step: int = 30, threshold: float = -15.0) -> List[dict]:
	"""
	Find potential hairpin structures using sliding window analysis.

	Args:
	sequence: DNA sequence
	window_size: Size of sliding window
	step: Step size for window
	threshold: MFE threshold for flagging (kcal/mol)

	Returns:
	List of hairpin regions with position and energy
	"""
	if not self.available:
	return []

	hairpins = []
	rna_seq = self.dna_to_rna(sequence)

	for i in range(0, len(rna_seq) - window_size + 1, step):
	window = rna_seq[i:i + window_size]
	try:
	energy = dg(window, temp=self.temperature)
	if energy < threshold:
	hairpins.append({
	'start': i,
	'end': i + window_size,
	'energy': energy,
	'sequence': sequence[i:i + window_size]
	})
	except Exception:
	continue

	return hairpins

	def get_structure_string(self, sequence: str) -> str:
	"""
	Get dot-bracket notation of predicted structure.

	Args:
	sequence: DNA sequence

	Returns:
	Dot-bracket string (e.g., "(((...)))")
	"""
	if not self.available:
	return ""

	rna_seq = self.dna_to_rna(sequence)

	try:
	structs = fold(rna_seq)
	return dot_bracket(structs)
	except Exception:
	return ""

	def analyze(self, sequence: str, fast_mode: bool = True) -> dict:
	"""
	Perform mRNA structure analysis.

	Args:
	sequence: DNA sequence
	fast_mode: If True, only analyze critical regions (faster)

	Returns:
	Dictionary with structure metrics
	"""
	if not self.available:
	return {
	'available': False,
	'full_mfe': 0.0,
	'five_prime_mfe': 0.0,
	'hairpin_count': 0,
	'hairpins': [],
	'structure_penalty': 0.0,
	}

	# Always calculate 5' MFE (most important for translation)
	five_prime_mfe = self.calculate_5prime_mfe(sequence)

	# For full MFE, only analyze first 200 nt in fast mode
	if fast_mode:
	full_mfe = self.calculate_mfe(sequence[:min(200, len(sequence))])
	# Skip hairpin detection in fast mode for long sequences
	if len(sequence) > 300:
	hairpins = []
	else:
	hairpins = self.find_hairpins(sequence, window_size=40, step=40, threshold=-12.0)
	else:
	full_mfe = self.calculate_mfe(sequence, max_length=500)
	hairpins = self.find_hairpins(sequence)

	# Calculate structure penalty for optimization
	# Penalize stable 5' structures (< -30 kcal/mol is problematic)
	five_prime_penalty = 0.0
	if five_prime_mfe < -30:
	five_prime_penalty = (-30 - five_prime_mfe) / 10 # Scaled penalty

	# Penalize multiple hairpins
	hairpin_penalty = len(hairpins) * 0.1

	# Total structure penalty
	structure_penalty = five_prime_penalty + hairpin_penalty

	return {
	'available': True,
	'full_mfe': full_mfe,
	'five_prime_mfe': five_prime_mfe,
	'hairpin_count': len(hairpins),
	'hairpins': hairpins,
	'structure_penalty': structure_penalty,
	}


	class OutlierIndex:
	"""
	Outlier Index (OI) quantifies negative sequence features that should be minimized.

	OI = sum(w_i * f_i(x)) for various adverse features

	Features include:
	- GC content deviation from optimal range (organism-specific)
	- Repetitive sequences
	- Rare codon clusters
	- mRNA secondary structure (optional - expensive to compute)
	- Adverse motifs (restriction sites, splice sites, etc.)
	"""

	# Weights for different features - GC penalties significantly increased
	WEIGHTS = {
	'gc_content': 8.0, # Increased from 2.0 - global GC penalty
	'gc_local': 4.0, # Increased from 1.5 - local window GC penalty
	'gc_extreme': 10.0, # NEW - exponential penalty for extreme GC
	'rare_codons': 2.0,
	'repeats': 1.0,
	'homopolymers': 1.5,
	'motifs': 2.0,
	'mrna_structure': 2.5,
	}

	# Organism-specific optimal GC content ranges
	# Based on typical GC content of highly-expressed genes
	GC_TARGETS = {
	"Escherichia coli K12": (0.45, 0.55), # E. coli prefers moderate GC
	"Homo sapiens": (0.45, 0.60), # Human - moderate to slightly high
	"CHO (Chinese Hamster Ovary)": (0.45, 0.60),
	"Saccharomyces cerevisiae": (0.35, 0.45), # Yeast prefers lower GC
	"Mus musculus": (0.45, 0.60), # Mouse similar to human
	"Insect (Spodoptera frugiperda)": (0.40, 0.55),
	"Pichia pastoris": (0.38, 0.48),
	"Bacillus subtilis": (0.40, 0.50),
	"Drosophila melanogaster": (0.45, 0.58),
	"Caenorhabditis elegans": (0.35, 0.45),
	}
	DEFAULT_GC_TARGET = (0.40, 0.55) # Default if organism not found

	# Adverse motifs to avoid
	ADVERSE_MOTIFS = [
	# Shine-Dalgarno sequences (prokaryotic RBS)
	'AGGAGG', 'GGAGG', 'AGGAG', 'GAGG',
	# Common restriction sites
	'GAATTC', # EcoRI
	'GGATCC', # BamHI
	'AAGCTT', # HindIII
	'CTGCAG', # PstI
	'GCGGCCGC', # NotI
	'CATATG', # NdeI
	# Splice site consensus
	'GGTAAG', 'GGTGAG',
	# Poly-A signals
	'AATAAA', 'ATTAAA',
	# AU-rich elements
	'ATTTA',
	]

	def __init__(self, organism: str, excluded_sites: List[str] = None,
	include_mrna_structure: bool = False):
	"""
	Initialize OutlierIndex.

	Args:
	organism: Target host organism
	excluded_sites: Restriction sites to avoid
	include_mrna_structure: If True, include mRNA folding in calculations
	(expensive - set False for optimization iterations)
	"""
	self.organism = organism
	self.codon_table = get_codon_table(organism)
	self.rare_threshold = 0.10 # Codons with frequency < 10% considered rare
	self.excluded_sites = excluded_sites or []
	self.include_mrna_structure = include_mrna_structure
	self.mrna_analyzer = mRNAStructureAnalyzer() if include_mrna_structure else None

	# Get organism-specific GC target range
	self.gc_target = self.GC_TARGETS.get(organism, self.DEFAULT_GC_TARGET)
	self.gc_min, self.gc_max = self.gc_target

	def _gc_content_penalty(self, sequence: str) -> float:
	"""Calculate penalty for GC content outside organism-specific optimal range."""
	if len(sequence) == 0:
	return 0.0

	gc_count = sequence.count('G') + sequence.count('C')
	gc_percent = gc_count / len(sequence)

	# Use organism-specific optimal range
	if self.gc_min <= gc_percent <= self.gc_max:
	return 0.0
	elif gc_percent < self.gc_min:
	return (self.gc_min - gc_percent) * 3 # Increased multiplier
	else:
	return (gc_percent - self.gc_max) * 3 # Increased multiplier

	def _gc_extreme_penalty(self, sequence: str) -> float:
	"""
	Calculate exponential penalty for extreme GC content.
	Heavily penalizes GC outside 30-70% range.
	"""
	if len(sequence) == 0:
	return 0.0

	gc_count = sequence.count('G') + sequence.count('C')
	gc_percent = gc_count / len(sequence)

	# No penalty within acceptable range (organism target ± 15%)
	soft_min = max(0.30, self.gc_min - 0.10)
	soft_max = min(0.70, self.gc_max + 0.10)

	if soft_min <= gc_percent <= soft_max:
	return 0.0

	# Exponential penalty outside soft range
	if gc_percent < soft_min:
	deviation = soft_min - gc_percent
	else:
	deviation = gc_percent - soft_max

	# Exponential: penalty grows rapidly with deviation
	return (deviation * 10) ** 2

	def _gc_local_penalty(self, sequence: str, window_size: int = 60) -> float:
	"""Calculate penalty for local GC content outside acceptable range."""
	if len(sequence) < window_size:
	return 0.0

	# Local range is organism target ± 10%
	local_min = max(0.25, self.gc_min - 0.10)
	local_max = min(0.75, self.gc_max + 0.10)

	penalties = []
	for i in range(0, len(sequence) - window_size + 1, 10):
	window = sequence[i:i + window_size]
	gc_count = window.count('G') + window.count('C')
	gc_percent = gc_count / len(window)

	if gc_percent < local_min:
	penalties.append((local_min - gc_percent) * 2)
	elif gc_percent > local_max:
	penalties.append((gc_percent - local_max) * 2)

	return sum(penalties) / max(len(penalties), 1) if penalties else 0.0

	def _rare_codon_penalty(self, codons: List[str]) -> float:
	"""Calculate penalty for rare codon usage."""
	if len(codons) == 0:
	return 0.0

	rare_count = 0
	for codon in codons:
	freq = self.codon_table.get(codon, 0)
	if freq < self.rare_threshold:
	rare_count += 1

	return rare_count / len(codons)

	def _repeat_penalty(self, sequence: str) -> float:
	"""Calculate penalty for direct repeats (>= 8bp)."""
	penalty = 0.0
	# Check for repeats
	for length in range(8, min(20, len(sequence) // 2)):
	for i in range(len(sequence) - 2 * length + 1):
	pattern = sequence[i:i + length]
	if pattern in sequence[i + length:]:
	penalty += 0.1

	return min(penalty, 1.0)

	def _homopolymer_penalty(self, sequence: str) -> float:
	"""Calculate penalty for homopolymer runs (>= 5 same nucleotides)."""
	penalty = 0.0
	for base in 'ATGC':
	runs = re.findall(f'{base}{{5,}}', sequence)
	for run in runs:
	penalty += (len(run) - 4) * 0.1

	return min(penalty, 1.0)

	def _motif_penalty(self, sequence: str) -> float:
	"""Calculate penalty for adverse motifs."""
	penalty = 0.0
	all_motifs = self.ADVERSE_MOTIFS + self.excluded_sites

	for motif in all_motifs:
	count = sequence.count(motif)
	penalty += count * 0.2

	return min(penalty, 1.0)

	def _mrna_structure_penalty(self, sequence: str) -> float:
	"""Calculate penalty for problematic mRNA secondary structures."""
	if not self.include_mrna_structure or self.mrna_analyzer is None:
	return 0.0

	if not self.mrna_analyzer.available:
	return 0.0

	analysis = self.mrna_analyzer.analyze(sequence)
	return analysis['structure_penalty']

	def calculate(self, codons: List[str]) -> float:
	"""
	Calculate Outlier Index for a codon sequence.
	Returns value >= 0, where 0 is optimal (no adverse features).
	Lower is better.
	"""
	sequence = ''.join(codons)

	penalties = {
	'gc_content': self._gc_content_penalty(sequence),
	'gc_extreme': self._gc_extreme_penalty(sequence),
	'gc_local': self._gc_local_penalty(sequence),
	'rare_codons': self._rare_codon_penalty(codons),
	'repeats': self._repeat_penalty(sequence),
	'homopolymers': self._homopolymer_penalty(sequence),
	'motifs': self._motif_penalty(sequence),
	'mrna_structure': self._mrna_structure_penalty(sequence),
	}

	total = sum(self.WEIGHTS[k] * v for k, v in penalties.items())
	return total

	def calculate_detailed(self, codons: List[str]) -> dict:
	"""
	Calculate Outlier Index with detailed breakdown.
	Returns dictionary with all penalty components and mRNA structure analysis.
	"""
	sequence = ''.join(codons)

	penalties = {
	'gc_content': self._gc_content_penalty(sequence),
	'gc_extreme': self._gc_extreme_penalty(sequence),
	'gc_local': self._gc_local_penalty(sequence),
	'rare_codons': self._rare_codon_penalty(codons),
	'repeats': self._repeat_penalty(sequence),
	'homopolymers': self._homopolymer_penalty(sequence),
	'motifs': self._motif_penalty(sequence),
	'mrna_structure': self._mrna_structure_penalty(sequence),
	}

	total = sum(self.WEIGHTS[k] * v for k, v in penalties.items())

	# Get detailed mRNA analysis
	mrna_analysis = self.mrna_analyzer.analyze(sequence)

	return {
	'total': total,
	'penalties': penalties,
	'weights': self.WEIGHTS,
	'mrna_structure': mrna_analysis,
	}


	def calculate_cai(codons: List[str], codon_table: dict) -> float:
	"""
	Calculate Codon Adaptation Index (CAI).
	CAI is the geometric mean of relative adaptiveness values.
	"""
	if len(codons) == 0:
	return 0.0

	# Calculate relative adaptiveness for each codon
	w_values = []
	for codon in codons:
	aa = CODON_TO_AA.get(codon)
	if aa is None or aa == '*':
	continue

	# Get max frequency among synonymous codons
	synonymous = AA_TO_CODONS[aa]
	max_freq = max(codon_table.get(c, 0.01) for c in synonymous)

	# Relative adaptiveness
	freq = codon_table.get(codon, 0.01)
	w = freq / max_freq if max_freq > 0 else 0.01
	w_values.append(max(w, 0.01)) # Avoid log(0)

	if len(w_values) == 0:
	return 0.0

	# Geometric mean
	log_sum = sum(math.log(w) for w in w_values)
	cai = math.exp(log_sum / len(w_values))

	return cai


	def calculate_gc_content(sequence: str) -> float:
	"""Calculate GC content as percentage."""
	if len(sequence) == 0:
	return 0.0
	gc_count = sequence.count('G') + sequence.count('C')
	return (gc_count / len(sequence)) * 100