""" Codon Optimizer - Gradio Web Application Based on GenScript GenSmart algorithm (Patent WO2020024917A1) A multi-objective codon optimization tool using NSGA-III algorithm. """ import gradio as gr from typing import Optional import textwrap from codon_tables import get_organism_list, CODON_TO_AA, AA_TO_CODONS from indices import ( sequence_to_codons, codons_to_protein, calculate_cai, calculate_gc_content, mRNAStructureAnalyzer, SEQFOLD_AVAILABLE ) from optimizer import optimize_sequence, CodonOptimizer, SimpleOptimizer # Restriction enzymes organized by type # Only 6-cutters and above (6bp+ recognition sequences) # Type II: Orthodox restriction enzymes that cut within or near recognition sequence TYPE_II_ENZYMES = { # 6-cutters (palindromic) "AatII": "GACGTC", "AccI": "GTMKAC", # Degenerate: GT(A/C)(G/T)AC "AclI": "AACGTT", "AfeI": "AGCGCT", "AflII": "CTTAAG", "AgeI": "ACCGGT", "AhdI": "GACNNNNNGTC", "AleI": "CACNNNNGTG", "ApaI": "GGGCCC", "ApaLI": "GTGCAC", "AscI": "GGCGCGCC", # 8-cutter "AseI": "ATTAAT", "AsiSI": "GCGATCGC", # 8-cutter "AvrII": "CCTAGG", "BamHI": "GGATCC", "BclI": "TGATCA", "BglII": "AGATCT", "BlpI": "GCTNAGC", "BmtI": "GCTAGC", "BsiWI": "CGTACG", "BspEI": "TCCGGA", "BspHI": "TCATGA", "BsrGI": "TGTACA", "BssHII": "GCGCGC", "BstAPI": "GCANNNNNTGC", "BstBI": "TTCGAA", "BstEII": "GGTNACC", "BstXI": "CCANNNNNNTGG", "BstZ17I": "GTATAC", "Cac8I": "GCNNGC", "ClaI": "ATCGAT", "DraI": "TTTAAA", "DraIII": "CACNNNGTG", "EagI": "CGGCCG", "EcoRI": "GAATTC", "EcoRV": "GATATC", "FseI": "GGCCGGCC", # 8-cutter "FspI": "TGCGCA", "HincII": "GTYRAC", "HindIII": "AAGCTT", "HpaI": "GTTAAC", "KasI": "GGCGCC", "KpnI": "GGTACC", "MfeI": "CAATTG", "MluI": "ACGCGT", "MscI": "TGGCCA", "MspA1I": "CMGCKG", "NaeI": "GCCGGC", "NarI": "GGCGCC", "NcoI": "CCATGG", "NdeI": "CATATG", "NgoMIV": "GCCGGC", "NheI": "GCTAGC", "NotI": "GCGGCCGC", # 8-cutter "NruI": "TCGCGA", "NsiI": "ATGCAT", "NspI": "RCATGY", "PacI": "TTAATTAA", # 8-cutter "PciI": "ACATGT", "PflMI": "CCANNNNNTGG", "PmeI": "GTTTAAAC", # 8-cutter "PmlI": "CACGTG", "PpuMI": "RGGWCCY", "PshAI": "GACNNNNGTC", "PsiI": "TTATAA", "PspOMI": "GGGCCC", "PstI": "CTGCAG", "PvuI": "CGATCG", "PvuII": "CAGCTG", "RsrII": "CGGWCCG", "SacI": "GAGCTC", "SacII": "CCGCGG", "SalI": "GTCGAC", "SbfI": "CCTGCAGG", # 8-cutter "ScaI": "AGTACT", "SexAI": "ACCWGGT", "SfiI": "GGCCNNNNNGGCC", "SfoI": "GGCGCC", "SgrAI": "CRCCGGYG", # 8-cutter "SmaI": "CCCGGG", "SmlI": "CTYRAG", "SnaBI": "TACGTA", "SpeI": "ACTAGT", "SphI": "GCATGC", "SrfI": "GCCCGGGC", # 8-cutter "SspI": "AATATT", "StuI": "AGGCCT", "SwaI": "ATTTAAAT", # 8-cutter "TliI": "CTCGAG", "TspMI": "CCCGGG", "Tth111I": "GACNNNGTC", "XbaI": "TCTAGA", "XcmI": "CCANNNNNNNNNTGG", "XhoI": "CTCGAG", "XmaI": "CCCGGG", "ZraI": "GACGTC", } # Type IIS: Cut outside recognition sequence (6bp+ only) # Used in Golden Gate, MoClo, and other scarless cloning methods TYPE_IIS_ENZYMES = { "AarI": "CACCTGC", # 7bp "BbsI": "GAAGAC", # 6bp - Golden Gate alternative "BfuAI": "ACCTGC", # 6bp "BpiI": "GAAGAC", # 6bp - BbsI isoschizomer "BsaI": "GGTCTC", # 6bp - Golden Gate standard "BsaI-HFv2": "GGTCTC", # 6bp - High-fidelity BsaI "BseRI": "GAGGAG", # 6bp "BsmBI": "CGTCTC", # 6bp - MoClo standard "BspMI": "ACCTGC", # 6bp "BtgZI": "GCGATG", # 6bp "BtsI": "GCAGTG", # 6bp "BspQI": "GCTCTTC", # 7bp - SapI isoschizomer "Esp3I": "CGTCTC", # 6bp - BsmBI isoschizomer "LguI": "GCTCTTC", # 7bp - SapI isoschizomer "PaqCI": "CACCTGC", # 7bp - AarI isoschizomer "SapI": "GCTCTTC", # 7bp - Used in SapTrap } # Type III: Require two recognition sites in inverse orientation (6bp+ only) TYPE_III_ENZYMES = { "EcoP15I": "CAGCAG", # 6bp } # Combine all for backward compatibility COMMON_RESTRICTION_SITES = {**TYPE_II_ENZYMES, **TYPE_IIS_ENZYMES, **TYPE_III_ENZYMES} # Create labeled choices for UI def get_enzyme_choices(): """Get enzyme choices with category labels for display.""" choices = [] # Type II for name in sorted(TYPE_II_ENZYMES.keys()): choices.append(f"[Type II] {name}") # Type IIS for name in sorted(TYPE_IIS_ENZYMES.keys()): choices.append(f"[Type IIS] {name}") # Type III for name in sorted(TYPE_III_ENZYMES.keys()): choices.append(f"[Type III] {name}") return choices def parse_enzyme_name(labeled_name: str) -> str: """Extract enzyme name from labeled choice.""" if "] " in labeled_name: return labeled_name.split("] ")[1] return labeled_name def parse_sequence(sequence: str) -> tuple: """ Parse and validate input sequence. Returns: (cleaned_sequence, is_protein, error_message) """ if not sequence or len(sequence.strip()) == 0: return None, None, "Please enter a sequence" # Clean sequence cleaned = sequence.upper().replace(" ", "").replace("\n", "").replace("\r", "") cleaned = ''.join(c for c in cleaned if c.isalpha()) if len(cleaned) == 0: return None, None, "No valid characters found in sequence" # Detect if protein or DNA dna_chars = set('ATGC') protein_chars = set('ACDEFGHIKLMNPQRSTVWY') unique_chars = set(cleaned) # If only ATGC, likely DNA if unique_chars.issubset(dna_chars): # Could be DNA or protein with limited amino acids # Check length - if divisible by 3 and reasonably long, assume DNA if len(cleaned) >= 30 and len(cleaned) % 3 == 0: return cleaned, False, None # DNA elif len(cleaned) < 30: # Short sequence - could be either return cleaned, True, None # Assume protein for short sequences # If has characters outside ATGC, must be protein if not unique_chars.issubset(dna_chars): if unique_chars.issubset(protein_chars): return cleaned, True, None # Protein else: invalid = unique_chars - protein_chars return None, None, f"Invalid characters found: {invalid}" return cleaned, False, None # Default to DNA def format_sequence(sequence: str, line_length: int = 60) -> str: """Format sequence with line breaks for display.""" return '\n'.join(textwrap.wrap(sequence, line_length)) def get_excluded_sites(site_names: list) -> list: """Convert site names to sequences.""" sites = [] for name in site_names or []: # Handle labeled names like "[Type II] EcoRI" enzyme_name = parse_enzyme_name(name) if enzyme_name in COMMON_RESTRICTION_SITES: sites.append(COMMON_RESTRICTION_SITES[enzyme_name]) return sites def optimize_codon( input_sequence: str, sequence_type: str, organism: str, type_ii_sites: list, type_iis_sites: list, type_iii_sites: list, optimization_quality: str, ) -> tuple: """ Main optimization function for Gradio interface. """ # Parse sequence cleaned, auto_is_protein, error = parse_sequence(input_sequence) if error: return error, "", "", "", "" # Determine sequence type if sequence_type == "Auto-detect": is_protein = auto_is_protein else: is_protein = (sequence_type == "Protein") # Combine all excluded restriction sites all_excluded_sites = (type_ii_sites or []) + (type_iis_sites or []) + (type_iii_sites or []) excluded = get_excluded_sites(all_excluded_sites) # Map quality quality_map = { "Fast": "fast", "Standard": "standard", "Thorough": "thorough", } quality = quality_map.get(optimization_quality, "standard") try: # Run optimization result = optimize_sequence( sequence=cleaned, organism=organism, is_protein=is_protein, excluded_sites=excluded, use_nsga3=False, # Use fast hill-climbing optimizer quality=quality ) # Format outputs input_info = f"""**Input Analysis:** - Detected as: {'Protein' if is_protein else 'DNA'} - {'Amino acids' if is_protein else 'Nucleotides'}: {len(cleaned)} - Target organism: {organism} """ optimized_dna = format_sequence(result['optimized_dna']) protein_seq = format_sequence(result['protein']) metrics = result['metrics'] # Perform mRNA structure analysis mrna_analyzer = mRNAStructureAnalyzer() mrna_analysis = mrna_analyzer.analyze(result['optimized_dna']) metrics_text = f"""**Optimization Metrics:** | Metric | Value | |--------|-------| | Codon Adaptation Index (CAI) | {metrics['cai']:.3f} | | Harmony Index | {metrics['harmony_index']:.3f} | | Codon Context Index | {metrics['context_index']:.3f} | | Outlier Index | {metrics['outlier_index']:.3f} | | GC Content | {metrics['gc_content']:.1f}% | | Sequence Length | {metrics['length_bp']} bp ({metrics['length_aa']} aa) | """ # Add mRNA structure metrics if available if mrna_analysis['available']: five_prime_status = "Good" if mrna_analysis['five_prime_mfe'] > -30 else "Warning: stable structure" metrics_text += f"""**mRNA Secondary Structure:** | Metric | Value | Status | |--------|-------|--------| | 5' Region MFE (50 nt) | {mrna_analysis['five_prime_mfe']:.1f} kcal/mol | {five_prime_status} | | Full Sequence MFE | {mrna_analysis['full_mfe']:.1f} kcal/mol | - | | Hairpins Detected | {mrna_analysis['hairpin_count']} | {"None" if mrna_analysis['hairpin_count'] == 0 else "Review recommended"} | """ else: metrics_text += "*mRNA structure analysis not available (seqfold not installed)*\n\n" metrics_text += """**Interpretation:** - CAI: Higher is better (1.0 = perfect match to highly expressed genes) - Harmony Index: Higher is better (codon usage match) - Context Index: Higher is better (codon pair optimization) - Outlier Index: Lower is better (fewer adverse features) - GC Content: Optimal range is 40-60% - 5' MFE: > -30 kcal/mol recommended (less stable = better translation initiation) """ # Generate codon comparison if input was DNA if not is_protein: original_codons = sequence_to_codons(cleaned) optimized_codons = result['codons'] changes = 0 for i, (orig, opt) in enumerate(zip(original_codons, optimized_codons)): if orig != opt: changes += 1 comparison = f"\n**Codon Changes:** {changes} of {len(original_codons)} codons modified ({100*changes/len(original_codons):.1f}%)" metrics_text += comparison return input_info, optimized_dna, protein_seq, metrics_text, "" except Exception as e: return f"Error: {str(e)}", "", "", "", str(e) def download_fasta(optimized_dna: str, organism: str) -> str: """Generate FASTA format for download.""" if not optimized_dna: return "" clean_dna = optimized_dna.replace('\n', '').replace(' ', '') organism_short = organism.replace(' ', '_').replace('(', '').replace(')', '') fasta = f">Optimized_sequence|{organism_short}|{len(clean_dna)}bp\n" fasta += '\n'.join(textwrap.wrap(clean_dna, 60)) return fasta # Example sequences EXAMPLE_PROTEIN = """MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTL VTTFSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLV NRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLAD HYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK""" EXAMPLE_DNA = """ATGAGTAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCAACATACGGAAAACTTACCCTTAAATTTATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTCTCTTATGGTGTTCAATGCTTTTCAAGATACCCAGATCATATGAAACAGCATGACTTTTTCAAGAGTGCCATGCCTGAAGGTTATGTACAGGAAAGAACTATATTTTTCAAAGATGACGGGAACTACAAGACACGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATAGAATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTTGGACACAAATTGGAATACAACTATAACTCACACAATGTATACATCATGGCAGACAAACAAAAGAATGGAATCAAAGTTAACTTCAAAATTAGACACAACATTGAAGATGGAAGCGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCCACACAATCTGCCCTTTCGAAAGATCCCAACGAAAAGAGAGACCACATGGTCCTTCTTGAGTTTGTAACAGCTGCTGGGATTACACATGGCATGGATGAACTATACAAATAA""" # Build Gradio interface with gr.Blocks( title="Codon Optimizer", theme=gr.themes.Soft(), ) as demo: gr.Markdown(""" # Codon Optimizer `joeyisgoed/codon-optimizer` Multi-objective codon optimization tool based on the GenScript GenSmart algorithm. Uses NSGA-III genetic algorithm to optimize for: - **Harmony Index**: Match codon usage to highly-expressed genes - **Codon Context Index**: Optimize codon pair preferences - **Outlier Index**: Minimize adverse sequence features Enter a protein or DNA sequence below to optimize it for your target expression host. """) with gr.Row(): with gr.Column(scale=1): input_sequence = gr.Textbox( label="Input Sequence", placeholder="Paste your protein or DNA sequence here...", lines=8, max_lines=20, ) with gr.Row(): sequence_type = gr.Radio( choices=["Auto-detect", "Protein", "DNA"], value="Auto-detect", label="Sequence Type", ) organism = gr.Dropdown( choices=get_organism_list(), value="Escherichia coli K12", label="Target Organism", ) with gr.Accordion("Exclude Restriction Sites", open=False): gr.Markdown("*Select restriction enzyme sites to avoid in the optimized sequence*") with gr.Accordion("Type II Enzymes (Standard)", open=False): type_ii_sites = gr.CheckboxGroup( choices=[f"[Type II] {name}" for name in sorted(TYPE_II_ENZYMES.keys())], label="Type II", info="Orthodox enzymes that cut within recognition sequence", ) with gr.Accordion("Type IIS Enzymes (Golden Gate/MoClo)", open=True): type_iis_sites = gr.CheckboxGroup( choices=[f"[Type IIS] {name}" for name in sorted(TYPE_IIS_ENZYMES.keys())], label="Type IIS", info="Cut outside recognition site - used for scarless cloning", ) with gr.Accordion("Type III Enzymes", open=False): type_iii_sites = gr.CheckboxGroup( choices=[f"[Type III] {name}" for name in sorted(TYPE_III_ENZYMES.keys())], label="Type III", info="Require two recognition sites in inverse orientation", ) optimization_quality = gr.Radio( choices=[ "Fast", "Standard", "Thorough", ], value="Fast", # Default to fast for better UX label="Optimization Quality", ) with gr.Row(): optimize_btn = gr.Button("Optimize Sequence", variant="primary", size="lg") clear_btn = gr.Button("Clear", size="lg") with gr.Accordion("Example Sequences", open=False): example_protein_btn = gr.Button("Load GFP Protein") example_dna_btn = gr.Button("Load GFP DNA") with gr.Column(scale=1): input_info = gr.Markdown(label="Input Analysis") optimized_dna = gr.Textbox( label="Optimized DNA Sequence", lines=8, max_lines=20, show_copy_button=True, ) protein_output = gr.Textbox( label="Protein Sequence", lines=4, max_lines=10, show_copy_button=True, ) metrics_output = gr.Markdown(label="Optimization Metrics") error_output = gr.Textbox(label="Errors", visible=False) # Event handlers optimize_btn.click( fn=optimize_codon, inputs=[input_sequence, sequence_type, organism, type_ii_sites, type_iis_sites, type_iii_sites, optimization_quality], outputs=[input_info, optimized_dna, protein_output, metrics_output, error_output], ) clear_btn.click( fn=lambda: ("", "", "", "", ""), outputs=[input_info, optimized_dna, protein_output, metrics_output, error_output], ) example_protein_btn.click( fn=lambda: EXAMPLE_PROTEIN.replace('\n', ''), outputs=[input_sequence], ) example_dna_btn.click( fn=lambda: EXAMPLE_DNA.replace('\n', ''), outputs=[input_sequence], ) gr.Markdown(""" --- ### About This tool implements a codon optimization algorithm inspired by GenScript's GenSmart system ([Patent WO2020024917A1](https://patents.google.com/patent/WO2020024917A1/en)). **Features:** - Multi-objective optimization using NSGA-III algorithm - Support for 10 common expression hosts - Optional restriction site exclusion - Comprehensive optimization metrics **References:** - GenScript GenSmart Codon Optimization - NSGA-III: Deb & Jain (2014) - Codon Adaptation Index (CAI): Sharp & Li (1987) """) if __name__ == "__main__": demo.launch()