codon-optimizer / app.py
“JoeyRiepsaame”
Add space identifier to UI header
7595b98
"""
Codon Optimizer - Gradio Web Application
Based on GenScript GenSmart algorithm (Patent WO2020024917A1)
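A multi-objective codon optimization tool built around the NSGA-III algorithm
(this UI currently calls the optimizer with use_nsga3=False, i.e. the fast
hill-climbing optimizer).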
"""
import gradio as gr
from typing import Optional
import textwrap
from codon_tables import get_organism_list, CODON_TO_AA, AA_TO_CODONS
from indices import (
sequence_to_codons, codons_to_protein, calculate_cai, calculate_gc_content,
mRNAStructureAnalyzer, SEQFOLD_AVAILABLE
)
from optimizer import optimize_sequence, CodonOptimizer, SimpleOptimizer
# Restriction enzymes organized by type
# Only 6-cutters and above (6bp+ recognition sequences)
# Each table maps enzyme name -> recognition sequence (5'->3').
# Degenerate positions use IUPAC nucleotide codes:
#   N = any base, R = A/G, Y = C/T, M = A/C, K = G/T, W = A/T
# NOTE(review): whether the optimizer expands these degenerate codes when
# searching for sites is not visible in this file - confirm in optimizer.py.
# Type II: Orthodox restriction enzymes that cut within or near recognition sequence
TYPE_II_ENZYMES = {
    # Mostly 6 bp sites; 8-cutters are tagged inline, and some entries are
    # longer because they contain degenerate (IUPAC) positions.
    # Several enzymes share one recognition sequence (e.g. KasI/NarI/SfoI).
    "AatII": "GACGTC",
    "AccI": "GTMKAC",  # Degenerate: GT(A/C)(G/T)AC
    "AclI": "AACGTT",
    "AfeI": "AGCGCT",
    "AflII": "CTTAAG",
    "AgeI": "ACCGGT",
    "AhdI": "GACNNNNNGTC",
    "AleI": "CACNNNNGTG",
    "ApaI": "GGGCCC",
    "ApaLI": "GTGCAC",
    "AscI": "GGCGCGCC",  # 8-cutter
    "AseI": "ATTAAT",
    "AsiSI": "GCGATCGC",  # 8-cutter
    "AvrII": "CCTAGG",
    "BamHI": "GGATCC",
    "BclI": "TGATCA",
    "BglII": "AGATCT",
    "BlpI": "GCTNAGC",
    "BmtI": "GCTAGC",
    "BsiWI": "CGTACG",
    "BspEI": "TCCGGA",
    "BspHI": "TCATGA",
    "BsrGI": "TGTACA",
    "BssHII": "GCGCGC",
    "BstAPI": "GCANNNNNTGC",
    "BstBI": "TTCGAA",
    "BstEII": "GGTNACC",
    "BstXI": "CCANNNNNNTGG",
    "BstZ17I": "GTATAC",
    "Cac8I": "GCNNGC",
    "ClaI": "ATCGAT",
    "DraI": "TTTAAA",
    "DraIII": "CACNNNGTG",
    "EagI": "CGGCCG",
    "EcoRI": "GAATTC",
    "EcoRV": "GATATC",
    "FseI": "GGCCGGCC",  # 8-cutter
    "FspI": "TGCGCA",
    "HincII": "GTYRAC",
    "HindIII": "AAGCTT",
    "HpaI": "GTTAAC",
    "KasI": "GGCGCC",
    "KpnI": "GGTACC",
    "MfeI": "CAATTG",
    "MluI": "ACGCGT",
    "MscI": "TGGCCA",
    "MspA1I": "CMGCKG",
    "NaeI": "GCCGGC",
    "NarI": "GGCGCC",
    "NcoI": "CCATGG",
    "NdeI": "CATATG",
    "NgoMIV": "GCCGGC",
    "NheI": "GCTAGC",
    "NotI": "GCGGCCGC",  # 8-cutter
    "NruI": "TCGCGA",
    "NsiI": "ATGCAT",
    "NspI": "RCATGY",
    "PacI": "TTAATTAA",  # 8-cutter
    "PciI": "ACATGT",
    "PflMI": "CCANNNNNTGG",
    "PmeI": "GTTTAAAC",  # 8-cutter
    "PmlI": "CACGTG",
    "PpuMI": "RGGWCCY",
    "PshAI": "GACNNNNGTC",
    "PsiI": "TTATAA",
    "PspOMI": "GGGCCC",
    "PstI": "CTGCAG",
    "PvuI": "CGATCG",
    "PvuII": "CAGCTG",
    "RsrII": "CGGWCCG",
    "SacI": "GAGCTC",
    "SacII": "CCGCGG",
    "SalI": "GTCGAC",
    "SbfI": "CCTGCAGG",  # 8-cutter
    "ScaI": "AGTACT",
    "SexAI": "ACCWGGT",
    "SfiI": "GGCCNNNNNGGCC",
    "SfoI": "GGCGCC",
    "SgrAI": "CRCCGGYG",  # 8-cutter
    "SmaI": "CCCGGG",
    "SmlI": "CTYRAG",
    "SnaBI": "TACGTA",
    "SpeI": "ACTAGT",
    "SphI": "GCATGC",
    "SrfI": "GCCCGGGC",  # 8-cutter
    "SspI": "AATATT",
    "StuI": "AGGCCT",
    "SwaI": "ATTTAAAT",  # 8-cutter
    "TliI": "CTCGAG",
    "TspMI": "CCCGGG",
    "Tth111I": "GACNNNGTC",
    "XbaI": "TCTAGA",
    "XcmI": "CCANNNNNNNNNTGG",
    "XhoI": "CTCGAG",
    "XmaI": "CCCGGG",
    "ZraI": "GACGTC",
}
# Type IIS: Cut outside recognition sequence (6bp+ only)
# Used in Golden Gate, MoClo, and other scarless cloning methods
# Maps enzyme name -> recognition sequence. Several names deliberately map to
# the same site (isoschizomers and product variants such as BsaI / BsaI-HFv2),
# so the UI can list each enzyme under the name users know.
TYPE_IIS_ENZYMES = {
    "AarI": "CACCTGC",  # 7bp
    "BbsI": "GAAGAC",  # 6bp - Golden Gate alternative
    "BfuAI": "ACCTGC",  # 6bp
    "BpiI": "GAAGAC",  # 6bp - BbsI isoschizomer
    "BsaI": "GGTCTC",  # 6bp - Golden Gate standard
    "BsaI-HFv2": "GGTCTC",  # 6bp - High-fidelity BsaI
    "BseRI": "GAGGAG",  # 6bp
    "BsmBI": "CGTCTC",  # 6bp - MoClo standard
    "BspMI": "ACCTGC",  # 6bp
    "BtgZI": "GCGATG",  # 6bp
    "BtsI": "GCAGTG",  # 6bp
    "BspQI": "GCTCTTC",  # 7bp - SapI isoschizomer
    "Esp3I": "CGTCTC",  # 6bp - BsmBI isoschizomer
    "LguI": "GCTCTTC",  # 7bp - SapI isoschizomer
    "PaqCI": "CACCTGC",  # 7bp - AarI isoschizomer
    "SapI": "GCTCTTC",  # 7bp - Used in SapTrap
}
# Type III: Require two recognition sites in inverse orientation (6bp+ only)
# Maps enzyme name -> recognition sequence; currently a single entry.
TYPE_III_ENZYMES = {
    "EcoP15I": "CAGCAG",  # 6bp
}
# Combine all for backward compatibility
# Flat name -> recognition-sequence lookup across all three tables; this is
# the table get_excluded_sites() consults. On a duplicate enzyme name the
# later table would win (none of the current names collide).
COMMON_RESTRICTION_SITES = {**TYPE_II_ENZYMES, **TYPE_IIS_ENZYMES, **TYPE_III_ENZYMES}
# Create labeled choices for UI
def get_enzyme_choices():
    """Return every enzyme as a "[Type X] Name" label for display.

    Labels are grouped Type II, then Type IIS, then Type III, and are sorted
    alphabetically by enzyme name within each group.
    """
    labeled_tables = (
        ("Type II", TYPE_II_ENZYMES),
        ("Type IIS", TYPE_IIS_ENZYMES),
        ("Type III", TYPE_III_ENZYMES),
    )
    return [
        f"[{label}] {enzyme}"
        for label, table in labeled_tables
        for enzyme in sorted(table)
    ]
def parse_enzyme_name(labeled_name: str) -> str:
    """Extract the enzyme name from a labeled UI choice.

    Args:
        labeled_name: Either a labeled choice such as "[Type II] EcoRI" or a
            bare enzyme name such as "EcoRI".

    Returns:
        Everything after the first "] " separator, or the input unchanged
        when no separator is present.
    """
    # Split on the first separator only, so a name that itself contains "] "
    # is returned whole instead of being truncated.
    _, separator, enzyme = labeled_name.partition("] ")
    return enzyme if separator else labeled_name
def parse_sequence(sequence: str) -> tuple:
    """
    Parse and validate a pasted protein or DNA sequence.

    FASTA header lines (lines starting with ">") are skipped. The remaining
    text is upper-cased and every non-letter character (whitespace, digits,
    punctuation) is dropped.

    Type detection:
    - only A/T/G/C and at least 30 letters -> DNA
    - only A/T/G/C and fewer than 30 letters -> assumed protein (ambiguous)
    - otherwise, only the 20 standard amino-acid letters -> protein
    - anything else -> error listing the offending letters

    Returns: (cleaned_sequence, is_protein, error_message)
        On success error_message is None; on failure the first two items
        are None and error_message describes the problem.
    """
    if not sequence or not sequence.strip():
        return None, None, "Please enter a sequence"

    # Drop FASTA header lines so their text is not merged into the sequence.
    body_lines = [
        line for line in sequence.splitlines()
        if not line.lstrip().startswith(">")
    ]

    # Clean sequence: upper-case, then keep letters only
    cleaned = ''.join(c for c in ''.join(body_lines).upper() if c.isalpha())
    if not cleaned:
        return None, None, "No valid characters found in sequence"

    # Detect if protein or DNA
    dna_chars = set('ATGC')
    protein_chars = set('ACDEFGHIKLMNPQRSTVWY')
    unique_chars = set(cleaned)

    if unique_chars <= dna_chars:
        # A/T/G/C are also valid amino-acid letters, so a short sequence is
        # ambiguous; it is treated as protein, anything longer as DNA.
        if len(cleaned) < 30:
            return cleaned, True, None
        return cleaned, False, None

    # Letters outside ATGC are present, so this can only be a protein.
    invalid = unique_chars - protein_chars
    if invalid:
        # Sorted so the message is stable between runs.
        return None, None, f"Invalid characters found: {', '.join(sorted(invalid))}"
    return cleaned, True, None
def format_sequence(sequence: str, line_length: int = 60) -> str:
    """Wrap a sequence for display.

    Args:
        sequence: The sequence text to wrap.
        line_length: Maximum characters per output line (default 60).

    Returns:
        The wrapped lines joined with newlines; an empty string for empty
        input.
    """
    # textwrap.fill is the newline-joined form of textwrap.wrap.
    return textwrap.fill(sequence, line_length)
def get_excluded_sites(site_names: list) -> list:
    """Convert selected enzyme names to their recognition sequences.

    Args:
        site_names: Enzyme names, bare ("EcoRI") or labeled
            ("[Type II] EcoRI"). None or an empty list yields an empty result.

    Returns:
        Recognition sequences in first-seen order, without duplicates.
        Names not present in COMMON_RESTRICTION_SITES are ignored.
    """
    sites = []
    seen = set()
    for name in site_names or []:
        # Handle labeled names like "[Type II] EcoRI"
        site = COMMON_RESTRICTION_SITES.get(parse_enzyme_name(name))
        # Isoschizomers (e.g. BsaI / BsaI-HFv2) share one sequence; emit it
        # once so the same site is not listed twice.
        if site is None or site in seen:
            continue
        seen.add(site)
        sites.append(site)
    return sites
def optimize_codon(
    input_sequence: str,
    sequence_type: str,
    organism: str,
    type_ii_sites: list,
    type_iis_sites: list,
    type_iii_sites: list,
    optimization_quality: str,
) -> tuple:
    """
    Main optimization function for Gradio interface.

    Args:
        input_sequence: Raw text from the input box.
        sequence_type: "Auto-detect", "Protein" or "DNA". Anything other than
            "Auto-detect" overrides the detection done by parse_sequence.
        organism: Target organism name, passed through to optimize_sequence.
        type_ii_sites / type_iis_sites / type_iii_sites: Selected enzyme
            labels per category (each may be None or empty).
        optimization_quality: "Fast", "Standard" or "Thorough"; unknown
            values fall back to "standard".

    Returns:
        A 5-tuple matching the UI outputs:
        (input analysis markdown, wrapped optimized DNA, wrapped protein,
        metrics markdown, error text). On failure the message is placed in
        the first slot (prefixed with "Error: " for exceptions) and the
        remaining text slots are empty; the last slot carries the raw
        exception text.
    """
    # Parse sequence
    cleaned, auto_is_protein, error = parse_sequence(input_sequence)
    if error:
        return error, "", "", "", ""

    # Determine sequence type
    if sequence_type == "Auto-detect":
        is_protein = auto_is_protein
    else:
        is_protein = (sequence_type == "Protein")

    # Combine all excluded restriction sites
    all_excluded_sites = (type_ii_sites or []) + (type_iis_sites or []) + (type_iii_sites or [])
    excluded = get_excluded_sites(all_excluded_sites)

    # Map quality
    quality_map = {
        "Fast": "fast",
        "Standard": "standard",
        "Thorough": "thorough",
    }
    quality = quality_map.get(optimization_quality, "standard")

    # UI boundary: any failure below is reported as text instead of raising.
    try:
        # Run optimization
        result = optimize_sequence(
            sequence=cleaned,
            organism=organism,
            is_protein=is_protein,
            excluded_sites=excluded,
            use_nsga3=False,  # Use fast hill-climbing optimizer
            quality=quality
        )

        # Format outputs
        input_info = f"""**Input Analysis:**
- Detected as: {'Protein' if is_protein else 'DNA'}
- {'Amino acids' if is_protein else 'Nucleotides'}: {len(cleaned)}
- Target organism: {organism}
"""
        optimized_dna = format_sequence(result['optimized_dna'])
        protein_seq = format_sequence(result['protein'])
        metrics = result['metrics']

        # Perform mRNA structure analysis
        mrna_analyzer = mRNAStructureAnalyzer()
        mrna_analysis = mrna_analyzer.analyze(result['optimized_dna'])

        metrics_text = f"""**Optimization Metrics:**
| Metric | Value |
|--------|-------|
| Codon Adaptation Index (CAI) | {metrics['cai']:.3f} |
| Harmony Index | {metrics['harmony_index']:.3f} |
| Codon Context Index | {metrics['context_index']:.3f} |
| Outlier Index | {metrics['outlier_index']:.3f} |
| GC Content | {metrics['gc_content']:.1f}% |
| Sequence Length | {metrics['length_bp']} bp ({metrics['length_aa']} aa) |
"""

        # Add mRNA structure metrics if available
        if mrna_analysis['available']:
            five_prime_status = "Good" if mrna_analysis['five_prime_mfe'] > -30 else "Warning: stable structure"
            metrics_text += f"""**mRNA Secondary Structure:**
| Metric | Value | Status |
|--------|-------|--------|
| 5' Region MFE (50 nt) | {mrna_analysis['five_prime_mfe']:.1f} kcal/mol | {five_prime_status} |
| Full Sequence MFE | {mrna_analysis['full_mfe']:.1f} kcal/mol | - |
| Hairpins Detected | {mrna_analysis['hairpin_count']} | {"None" if mrna_analysis['hairpin_count'] == 0 else "Review recommended"} |
"""
        else:
            metrics_text += "*mRNA structure analysis not available (seqfold not installed)*\n\n"

        metrics_text += """**Interpretation:**
- CAI: Higher is better (1.0 = perfect match to highly expressed genes)
- Harmony Index: Higher is better (codon usage match)
- Context Index: Higher is better (codon pair optimization)
- Outlier Index: Lower is better (fewer adverse features)
- GC Content: Optimal range is 40-60%
- 5' MFE: > -30 kcal/mol recommended (less stable = better translation initiation)
"""

        # Generate codon comparison if input was DNA
        if not is_protein:
            original_codons = sequence_to_codons(cleaned)
            optimized_codons = result['codons']
            # Skip the summary when the input produced no codons; dividing by
            # zero here would discard an otherwise successful result.
            if original_codons:
                changes = sum(
                    1 for orig, opt in zip(original_codons, optimized_codons)
                    if orig != opt
                )
                comparison = f"\n**Codon Changes:** {changes} of {len(original_codons)} codons modified ({100*changes/len(original_codons):.1f}%)"
                metrics_text += comparison

        return input_info, optimized_dna, protein_seq, metrics_text, ""
    except Exception as e:
        return f"Error: {str(e)}", "", "", "", str(e)
def download_fasta(optimized_dna: str, organism: str) -> str:
    """Build a FASTA record for the optimized sequence.

    Args:
        optimized_dna: Optimized DNA, possibly containing display newlines
            and spaces.
        organism: Target organism name used in the header.

    Returns:
        A FASTA string whose header is
        ">Optimized_sequence|<organism>|<length>bp", followed by the sequence
        wrapped at 60 characters; an empty string when there is no sequence.
    """
    if not optimized_dna:
        return ""

    # Remove the newlines and spaces added for on-screen display.
    clean_dna = optimized_dna.translate({ord('\n'): None, ord(' '): None})
    # Header-safe organism name: spaces become underscores, parentheses go.
    organism_short = organism.translate(
        str.maketrans({' ': '_', '(': None, ')': None})
    )

    header = f">Optimized_sequence|{organism_short}|{len(clean_dna)}bp\n"
    return header + textwrap.fill(clean_dna, 60)
# Example sequences
# Loaded into the input box by the "Load GFP Protein" / "Load GFP DNA" buttons.
# EXAMPLE_PROTEIN is stored in 60-character lines; the button handler strips
# the embedded newlines before filling the textbox.
EXAMPLE_PROTEIN = """MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTL
VTTFSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLV
NRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLAD
HYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"""
# Single-line DNA example ending in a TAA stop codon.
# NOTE(review): presumably the coding sequence for the protein above - confirm.
EXAMPLE_DNA = """ATGAGTAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCAACATACGGAAAACTTACCCTTAAATTTATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTCTCTTATGGTGTTCAATGCTTTTCAAGATACCCAGATCATATGAAACAGCATGACTTTTTCAAGAGTGCCATGCCTGAAGGTTATGTACAGGAAAGAACTATATTTTTCAAAGATGACGGGAACTACAAGACACGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATAGAATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTTGGACACAAATTGGAATACAACTATAACTCACACAATGTATACATCATGGCAGACAAACAAAAGAATGGAATCAAAGTTAACTTCAAAATTAGACACAACATTGAAGATGGAAGCGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCCACACAATCTGCCCTTTCGAAAGATCCCAACGAAAAGAGAGACCACATGGTCCTTCTTGAGTTTGTAACAGCTGCTGGGATTACACATGGCATGGATGAACTATACAAATAA"""
# Build Gradio interface
# Layout: an intro header, then one row with two columns - inputs and options
# on the left, results on the right - followed by the event wiring and an
# "About" footer.
# NOTE(review): the copy of this file that was reviewed had lost its
# indentation; the nesting of the layout contexts below is a reconstruction -
# confirm it matches the intended layout.
with gr.Blocks(
    title="Codon Optimizer",
    theme=gr.themes.Soft(),
) as demo:
    gr.Markdown("""
# Codon Optimizer
`joeyisgoed/codon-optimizer`
Multi-objective codon optimization tool based on the GenScript GenSmart algorithm.
Uses NSGA-III genetic algorithm to optimize for:
- **Harmony Index**: Match codon usage to highly-expressed genes
- **Codon Context Index**: Optimize codon pair preferences
- **Outlier Index**: Minimize adverse sequence features
Enter a protein or DNA sequence below to optimize it for your target expression host.
""")
    with gr.Row():
        # Left column: sequence input and optimization options.
        with gr.Column(scale=1):
            input_sequence = gr.Textbox(
                label="Input Sequence",
                placeholder="Paste your protein or DNA sequence here...",
                lines=8,
                max_lines=20,
            )
            with gr.Row():
                sequence_type = gr.Radio(
                    choices=["Auto-detect", "Protein", "DNA"],
                    value="Auto-detect",
                    label="Sequence Type",
                )
                organism = gr.Dropdown(
                    choices=get_organism_list(),
                    value="Escherichia coli K12",
                    label="Target Organism",
                )
            # One checkbox group per enzyme category. Choices carry a
            # "[Type X] " prefix, which parse_enzyme_name() strips again.
            with gr.Accordion("Exclude Restriction Sites", open=False):
                gr.Markdown("*Select restriction enzyme sites to avoid in the optimized sequence*")
                with gr.Accordion("Type II Enzymes (Standard)", open=False):
                    type_ii_sites = gr.CheckboxGroup(
                        choices=[f"[Type II] {name}" for name in sorted(TYPE_II_ENZYMES.keys())],
                        label="Type II",
                        info="Orthodox enzymes that cut within recognition sequence",
                    )
                with gr.Accordion("Type IIS Enzymes (Golden Gate/MoClo)", open=True):
                    type_iis_sites = gr.CheckboxGroup(
                        choices=[f"[Type IIS] {name}" for name in sorted(TYPE_IIS_ENZYMES.keys())],
                        label="Type IIS",
                        info="Cut outside recognition site - used for scarless cloning",
                    )
                with gr.Accordion("Type III Enzymes", open=False):
                    type_iii_sites = gr.CheckboxGroup(
                        choices=[f"[Type III] {name}" for name in sorted(TYPE_III_ENZYMES.keys())],
                        label="Type III",
                        info="Require two recognition sites in inverse orientation",
                    )
            optimization_quality = gr.Radio(
                choices=[
                    "Fast",
                    "Standard",
                    "Thorough",
                ],
                value="Fast",  # Default to fast for better UX
                label="Optimization Quality",
            )
            with gr.Row():
                optimize_btn = gr.Button("Optimize Sequence", variant="primary", size="lg")
                clear_btn = gr.Button("Clear", size="lg")
            with gr.Accordion("Example Sequences", open=False):
                example_protein_btn = gr.Button("Load GFP Protein")
                example_dna_btn = gr.Button("Load GFP DNA")
        # Right column: results produced by optimize_codon().
        with gr.Column(scale=1):
            input_info = gr.Markdown(label="Input Analysis")
            optimized_dna = gr.Textbox(
                label="Optimized DNA Sequence",
                lines=8,
                max_lines=20,
                show_copy_button=True,
            )
            protein_output = gr.Textbox(
                label="Protein Sequence",
                lines=4,
                max_lines=10,
                show_copy_button=True,
            )
            metrics_output = gr.Markdown(label="Optimization Metrics")
            # Hidden textbox: receives the raw error text but is never shown;
            # the visible error message goes to input_info instead.
            error_output = gr.Textbox(label="Errors", visible=False)
    # Event handlers
    # The input/output order must match optimize_codon's parameters and its
    # 5-tuple return value.
    optimize_btn.click(
        fn=optimize_codon,
        inputs=[input_sequence, sequence_type, organism, type_ii_sites, type_iis_sites, type_iii_sites, optimization_quality],
        outputs=[input_info, optimized_dna, protein_output, metrics_output, error_output],
    )
    # NOTE(review): Clear resets only the result panes; the input box and the
    # selected options are left as they are - confirm that is intended.
    clear_btn.click(
        fn=lambda: ("", "", "", "", ""),
        outputs=[input_info, optimized_dna, protein_output, metrics_output, error_output],
    )
    # The example constants may contain newlines; strip them before loading.
    example_protein_btn.click(
        fn=lambda: EXAMPLE_PROTEIN.replace('\n', ''),
        outputs=[input_sequence],
    )
    example_dna_btn.click(
        fn=lambda: EXAMPLE_DNA.replace('\n', ''),
        outputs=[input_sequence],
    )
    # NOTE(review): the text below advertises NSGA-III, but optimize_codon()
    # calls optimize_sequence with use_nsga3=False - confirm the wording.
    gr.Markdown("""
---
### About
This tool implements a codon optimization algorithm inspired by GenScript's GenSmart system
([Patent WO2020024917A1](https://patents.google.com/patent/WO2020024917A1/en)).
**Features:**
- Multi-objective optimization using NSGA-III algorithm
- Support for 10 common expression hosts
- Optional restriction site exclusion
- Comprehensive optimization metrics
**References:**
- GenScript GenSmart Codon Optimization
- NSGA-III: Deb & Jain (2014)
- Codon Adaptation Index (CAI): Sharp & Li (1987)
""")
# Launch the Gradio app only when this file is executed as a script, so that
# importing the module does not start it.
if __name__ == "__main__":
    demo.launch()