Spaces:
Sleeping
Sleeping
| """ | |
| Codon Optimizer - Gradio Web Application | |
| Based on GenScript GenSmart algorithm (Patent WO2020024917A1) | |
| A multi-objective codon optimization tool using NSGA-III algorithm. | |
| """ | |
| import gradio as gr | |
| from typing import Optional | |
| import textwrap | |
| from codon_tables import get_organism_list, CODON_TO_AA, AA_TO_CODONS | |
| from indices import ( | |
| sequence_to_codons, codons_to_protein, calculate_cai, calculate_gc_content, | |
| mRNAStructureAnalyzer, SEQFOLD_AVAILABLE | |
| ) | |
| from optimizer import optimize_sequence, CodonOptimizer, SimpleOptimizer | |
# Restriction enzymes organized by type
# Only 6-cutters and above (6bp+ recognition sequences)
# Values are recognition sequences written on one strand. Several entries use
# degenerate (ambiguity) letters such as N, R, Y, M, K and W instead of plain
# A/C/G/T, and some enzymes share the same recognition sequence (for example
# XhoI/TliI or SmaI/XmaI/TspMI), so values are not unique.
# Type II: Orthodox restriction enzymes that cut within or near recognition sequence
TYPE_II_ENZYMES = {
    # Alphabetical by enzyme name. Most entries are 6 bp sites; longer sites
    # (7-15 bp, some with runs of N) are mixed in, and 8 bp sites are tagged inline.
    "AatII": "GACGTC",
    "AccI": "GTMKAC",  # Degenerate: GT(A/C)(G/T)AC
    "AclI": "AACGTT",
    "AfeI": "AGCGCT",
    "AflII": "CTTAAG",
    "AgeI": "ACCGGT",
    "AhdI": "GACNNNNNGTC",
    "AleI": "CACNNNNGTG",
    "ApaI": "GGGCCC",
    "ApaLI": "GTGCAC",
    "AscI": "GGCGCGCC",  # 8-cutter
    "AseI": "ATTAAT",
    "AsiSI": "GCGATCGC",  # 8-cutter
    "AvrII": "CCTAGG",
    "BamHI": "GGATCC",
    "BclI": "TGATCA",
    "BglII": "AGATCT",
    "BlpI": "GCTNAGC",
    "BmtI": "GCTAGC",
    "BsiWI": "CGTACG",
    "BspEI": "TCCGGA",
    "BspHI": "TCATGA",
    "BsrGI": "TGTACA",
    "BssHII": "GCGCGC",
    "BstAPI": "GCANNNNNTGC",
    "BstBI": "TTCGAA",
    "BstEII": "GGTNACC",
    "BstXI": "CCANNNNNNTGG",
    "BstZ17I": "GTATAC",
    "Cac8I": "GCNNGC",
    "ClaI": "ATCGAT",
    "DraI": "TTTAAA",
    "DraIII": "CACNNNGTG",
    "EagI": "CGGCCG",
    "EcoRI": "GAATTC",
    "EcoRV": "GATATC",
    "FseI": "GGCCGGCC",  # 8-cutter
    "FspI": "TGCGCA",
    "HincII": "GTYRAC",
    "HindIII": "AAGCTT",
    "HpaI": "GTTAAC",
    "KasI": "GGCGCC",
    "KpnI": "GGTACC",
    "MfeI": "CAATTG",
    "MluI": "ACGCGT",
    "MscI": "TGGCCA",
    "MspA1I": "CMGCKG",
    "NaeI": "GCCGGC",
    "NarI": "GGCGCC",
    "NcoI": "CCATGG",
    "NdeI": "CATATG",
    "NgoMIV": "GCCGGC",
    "NheI": "GCTAGC",
    "NotI": "GCGGCCGC",  # 8-cutter
    "NruI": "TCGCGA",
    "NsiI": "ATGCAT",
    "NspI": "RCATGY",
    "PacI": "TTAATTAA",  # 8-cutter
    "PciI": "ACATGT",
    "PflMI": "CCANNNNNTGG",
    "PmeI": "GTTTAAAC",  # 8-cutter
    "PmlI": "CACGTG",
    "PpuMI": "RGGWCCY",
    "PshAI": "GACNNNNGTC",
    "PsiI": "TTATAA",
    "PspOMI": "GGGCCC",
    "PstI": "CTGCAG",
    "PvuI": "CGATCG",
    "PvuII": "CAGCTG",
    "RsrII": "CGGWCCG",
    "SacI": "GAGCTC",
    "SacII": "CCGCGG",
    "SalI": "GTCGAC",
    "SbfI": "CCTGCAGG",  # 8-cutter
    "ScaI": "AGTACT",
    "SexAI": "ACCWGGT",
    "SfiI": "GGCCNNNNNGGCC",
    "SfoI": "GGCGCC",
    "SgrAI": "CRCCGGYG",  # 8-cutter
    "SmaI": "CCCGGG",
    "SmlI": "CTYRAG",
    "SnaBI": "TACGTA",
    "SpeI": "ACTAGT",
    "SphI": "GCATGC",
    "SrfI": "GCCCGGGC",  # 8-cutter
    "SspI": "AATATT",
    "StuI": "AGGCCT",
    "SwaI": "ATTTAAAT",  # 8-cutter
    "TliI": "CTCGAG",
    "TspMI": "CCCGGG",
    "Tth111I": "GACNNNGTC",
    "XbaI": "TCTAGA",
    "XcmI": "CCANNNNNNNNNTGG",
    "XhoI": "CTCGAG",
    "XmaI": "CCCGGG",
    "ZraI": "GACGTC",
}
# Type IIS: Cut outside recognition sequence (6bp+ only)
# Used in Golden Gate, MoClo, and other scarless cloning methods
# Maps enzyme name -> recognition sequence as written on one strand. Several
# names map to the same sequence (e.g. BsaI / BsaI-HFv2, BsmBI / Esp3I).
# NOTE(review): these sites are not palindromic; whether the reverse-complement
# orientation is also excluded depends on the optimizer, which is not visible
# here - confirm.
TYPE_IIS_ENZYMES = {
    "AarI": "CACCTGC",  # 7bp
    "BbsI": "GAAGAC",  # 6bp - Golden Gate alternative
    "BfuAI": "ACCTGC",  # 6bp
    "BpiI": "GAAGAC",  # 6bp - BbsI isoschizomer
    "BsaI": "GGTCTC",  # 6bp - Golden Gate standard
    "BsaI-HFv2": "GGTCTC",  # 6bp - High-fidelity BsaI
    "BseRI": "GAGGAG",  # 6bp
    "BsmBI": "CGTCTC",  # 6bp - MoClo standard
    "BspMI": "ACCTGC",  # 6bp
    "BtgZI": "GCGATG",  # 6bp
    "BtsI": "GCAGTG",  # 6bp
    "BspQI": "GCTCTTC",  # 7bp - SapI isoschizomer
    "Esp3I": "CGTCTC",  # 6bp - BsmBI isoschizomer
    "LguI": "GCTCTTC",  # 7bp - SapI isoschizomer
    "PaqCI": "CACCTGC",  # 7bp - AarI isoschizomer
    "SapI": "GCTCTTC",  # 7bp - Used in SapTrap
}
# Type III: Require two recognition sites in inverse orientation (6bp+ only)
# Maps enzyme name -> recognition sequence; currently a single entry.
TYPE_III_ENZYMES = {
    "EcoP15I": "CAGCAG",  # 6bp
}
# Combine all for backward compatibility
# Flat name -> recognition-sequence lookup used by get_excluded_sites().
# Enzyme names do not repeat across the three tables, so no entry is
# overwritten by the merge.
COMMON_RESTRICTION_SITES = {**TYPE_II_ENZYMES, **TYPE_IIS_ENZYMES, **TYPE_III_ENZYMES}
| # Create labeled choices for UI | |
def get_enzyme_choices():
    """Return every enzyme as a "[<type>] <name>" label for display.

    Labels are grouped Type II, then Type IIS, then Type III; names are
    sorted alphabetically inside each group.
    """
    labeled_tables = (
        ("Type II", TYPE_II_ENZYMES),
        ("Type IIS", TYPE_IIS_ENZYMES),
        ("Type III", TYPE_III_ENZYMES),
    )
    return [
        f"[{type_label}] {enzyme}"
        for type_label, table in labeled_tables
        for enzyme in sorted(table)
    ]
def parse_enzyme_name(labeled_name: str) -> str:
    """Strip the "[Type ...] " prefix from a UI choice label.

    Strings without a "] " separator are returned unchanged.
    """
    pieces = labeled_name.split("] ")
    # pieces[0] is the "[Type ..." prefix; the enzyme name follows it.
    return pieces[1] if len(pieces) > 1 else labeled_name
def parse_sequence(sequence: str) -> tuple:
    """
    Parse and validate input sequence.

    FASTA header lines (lines whose first non-blank character is ">") are
    ignored so a pasted FASTA record does not leak its title into the
    sequence. The remaining text is upper-cased and every non-letter
    character (whitespace, digits, "*", ...) is dropped.

    Type detection:
    - only A/T/G/C and at least 30 letters -> DNA
    - only A/T/G/C and fewer than 30 letters -> assumed protein
    - otherwise protein, provided every letter is one of the 20 standard
      amino-acid codes

    Returns: (cleaned_sequence, is_protein, error_message)
        On success error_message is None. On failure cleaned_sequence and
        is_protein are None and error_message describes the problem.
    """
    if not sequence or not sequence.strip():
        return None, None, "Please enter a sequence"

    # Drop FASTA title lines before filtering, otherwise their letters would
    # be merged into the sequence itself.
    body_lines = [
        line for line in sequence.splitlines()
        if not line.lstrip().startswith(">")
    ]
    cleaned = "".join(c for c in "".join(body_lines).upper() if c.isalpha())
    if not cleaned:
        return None, None, "No valid characters found in sequence"

    dna_chars = set("ATGC")
    protein_chars = set("ACDEFGHIKLMNPQRSTVWY")
    unique_chars = set(cleaned)

    if unique_chars <= dna_chars:
        # A, T, G and C are also valid amino-acid codes, so a short run is
        # ambiguous; below 30 letters it is treated as a peptide.
        if len(cleaned) < 30:
            return cleaned, True, None
        return cleaned, False, None

    invalid = unique_chars - protein_chars
    if invalid:
        # Sorted so the message is stable and readable (not a raw set repr).
        return None, None, f"Invalid characters found: {', '.join(sorted(invalid))}"
    return cleaned, True, None
def format_sequence(sequence: str, line_length: int = 60) -> str:
    """Wrap *sequence* to lines of at most ``line_length`` characters for display."""
    wrapped_lines = textwrap.wrap(sequence, width=line_length)
    return "\n".join(wrapped_lines)
def get_excluded_sites(site_names: list) -> list:
    """Translate selected enzyme labels into their recognition sequences.

    Accepts plain names or labeled ones such as "[Type II] EcoRI"; ``None``
    is treated as an empty selection and unknown names are skipped. The
    result keeps the input order, one entry per recognized name.

    NOTE(review): sequences are returned exactly as stored, including any
    degenerate letters (N, R, Y, ...); confirm the consumer interprets them.
    """
    enzyme_names = (parse_enzyme_name(label) for label in site_names or [])
    return [
        COMMON_RESTRICTION_SITES[enzyme]
        for enzyme in enzyme_names
        if enzyme in COMMON_RESTRICTION_SITES
    ]
def optimize_codon(
    input_sequence: str,
    sequence_type: str,
    organism: str,
    type_ii_sites: list,
    type_iis_sites: list,
    type_iii_sites: list,
    optimization_quality: str,
) -> tuple:
    """
    Main optimization function for Gradio interface.

    Parses the pasted sequence, runs ``optimize_sequence`` on it and renders
    the result as text/Markdown for the five output components.

    Args:
        input_sequence: Raw text from the input box.
        sequence_type: "Auto-detect" uses the type guessed by
            ``parse_sequence``; "Protein" forces protein; any other value is
            treated as DNA.
        organism: Target organism, passed through to ``optimize_sequence``.
        type_ii_sites, type_iis_sites, type_iii_sites: Selected enzyme labels
            per category; each may be None or empty.
        optimization_quality: "Fast", "Standard" or "Thorough"; any other
            value falls back to "standard".

    Returns:
        (input_info, optimized_dna, protein_seq, metrics_text, error_text).
        On a parse error the message is returned as the first element and the
        rest are empty strings. If anything raises during optimization or
        formatting, the first element is "Error: <message>" and the last
        element is the bare message.
    """
    # Parse sequence
    cleaned, auto_is_protein, error = parse_sequence(input_sequence)
    if error:
        return error, "", "", "", ""

    # Determine sequence type
    if sequence_type == "Auto-detect":
        is_protein = auto_is_protein
    else:
        is_protein = sequence_type == "Protein"

    # Combine all excluded restriction sites
    all_excluded_sites = (type_ii_sites or []) + (type_iis_sites or []) + (type_iii_sites or [])
    excluded = get_excluded_sites(all_excluded_sites)

    # Map quality
    quality_map = {
        "Fast": "fast",
        "Standard": "standard",
        "Thorough": "thorough",
    }
    quality = quality_map.get(optimization_quality, "standard")

    try:
        # Run optimization
        result = optimize_sequence(
            sequence=cleaned,
            organism=organism,
            is_protein=is_protein,
            excluded_sites=excluded,
            use_nsga3=False,  # Use fast hill-climbing optimizer
            quality=quality,
        )

        # Format outputs
        input_info = (
            "**Input Analysis:**\n"
            f"- Detected as: {'Protein' if is_protein else 'DNA'}\n"
            f"- {'Amino acids' if is_protein else 'Nucleotides'}: {len(cleaned)}\n"
            f"- Target organism: {organism}\n"
        )
        optimized_dna = format_sequence(result['optimized_dna'])
        protein_seq = format_sequence(result['protein'])
        metrics = result['metrics']

        # Perform mRNA structure analysis
        mrna_analyzer = mRNAStructureAnalyzer()
        mrna_analysis = mrna_analyzer.analyze(result['optimized_dna'])

        # Each table is surrounded by blank lines: without one, Markdown
        # parsers treat the following heading line as another table row.
        metrics_text = (
            "**Optimization Metrics:**\n"
            "\n"
            "| Metric | Value |\n"
            "|--------|-------|\n"
            f"| Codon Adaptation Index (CAI) | {metrics['cai']:.3f} |\n"
            f"| Harmony Index | {metrics['harmony_index']:.3f} |\n"
            f"| Codon Context Index | {metrics['context_index']:.3f} |\n"
            f"| Outlier Index | {metrics['outlier_index']:.3f} |\n"
            f"| GC Content | {metrics['gc_content']:.1f}% |\n"
            f"| Sequence Length | {metrics['length_bp']} bp ({metrics['length_aa']} aa) |\n"
            "\n"
        )

        # Add mRNA structure metrics if available
        if mrna_analysis['available']:
            five_prime_status = (
                "Good" if mrna_analysis['five_prime_mfe'] > -30
                else "Warning: stable structure"
            )
            hairpin_status = (
                "None" if mrna_analysis['hairpin_count'] == 0
                else "Review recommended"
            )
            metrics_text += (
                "**mRNA Secondary Structure:**\n"
                "\n"
                "| Metric | Value | Status |\n"
                "|--------|-------|--------|\n"
                f"| 5' Region MFE (50 nt) | {mrna_analysis['five_prime_mfe']:.1f} kcal/mol | {five_prime_status} |\n"
                f"| Full Sequence MFE | {mrna_analysis['full_mfe']:.1f} kcal/mol | - |\n"
                f"| Hairpins Detected | {mrna_analysis['hairpin_count']} | {hairpin_status} |\n"
                "\n"
            )
        else:
            metrics_text += "*mRNA structure analysis not available (seqfold not installed)*\n\n"

        metrics_text += (
            "**Interpretation:**\n"
            "- CAI: Higher is better (1.0 = perfect match to highly expressed genes)\n"
            "- Harmony Index: Higher is better (codon usage match)\n"
            "- Context Index: Higher is better (codon pair optimization)\n"
            "- Outlier Index: Lower is better (fewer adverse features)\n"
            "- GC Content: Optimal range is 40-60%\n"
            "- 5' MFE: > -30 kcal/mol recommended (less stable = better translation initiation)\n"
        )

        # Generate codon comparison if input was DNA
        if not is_protein:
            original_codons = sequence_to_codons(cleaned)
            optimized_codons = result['codons']
            changes = sum(
                1 for orig, opt in zip(original_codons, optimized_codons) if orig != opt
            )
            total = len(original_codons)
            # Guard the percentage: an input too short to yield a codon would
            # otherwise raise ZeroDivisionError and discard the whole result.
            percent = 100 * changes / total if total else 0.0
            metrics_text += (
                f"\n**Codon Changes:** {changes} of {total} codons modified ({percent:.1f}%)"
            )

        return input_info, optimized_dna, protein_seq, metrics_text, ""
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the app.
        return f"Error: {str(e)}", "", "", "", str(e)
def download_fasta(optimized_dna: str, organism: str) -> str:
    """Generate FASTA format for download.

    All whitespace (spaces, tabs, CR/LF) is removed from ``optimized_dna``
    before the record is built, so the reported length counts bases only.
    The header is ``>Optimized_sequence|<organism>|<length>bp`` with spaces
    in the organism name replaced by underscores and parentheses removed;
    the sequence follows in lines of 60 characters.

    Returns an empty string when there is no sequence to export.
    """
    if not optimized_dna:
        return ""
    # split()/join strips every kind of whitespace, including "\r" and tabs
    # that would otherwise be counted as bases and end up inside the record.
    clean_dna = "".join(optimized_dna.split())
    if not clean_dna:
        return ""
    organism_short = organism.replace(' ', '_').replace('(', '').replace(')', '')
    header = f">Optimized_sequence|{organism_short}|{len(clean_dna)}bp\n"
    line_width = 60
    sequence_lines = (
        clean_dna[start:start + line_width]
        for start in range(0, len(clean_dna), line_width)
    )
    return header + "\n".join(sequence_lines)
# Example sequences
# Loaded into the input box by the "Load GFP Protein" / "Load GFP DNA" buttons.
# EXAMPLE_PROTEIN is stored across several lines; the button handler removes
# the embedded newlines before inserting it.
EXAMPLE_PROTEIN = """MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTL
VTTFSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLV
NRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLAD
HYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"""
# Single-line nucleotide example.
EXAMPLE_DNA = """ATGAGTAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCAACATACGGAAAACTTACCCTTAAATTTATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTCTCTTATGGTGTTCAATGCTTTTCAAGATACCCAGATCATATGAAACAGCATGACTTTTTCAAGAGTGCCATGCCTGAAGGTTATGTACAGGAAAGAACTATATTTTTCAAAGATGACGGGAACTACAAGACACGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATAGAATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTTGGACACAAATTGGAATACAACTATAACTCACACAATGTATACATCATGGCAGACAAACAAAAGAATGGAATCAAAGTTAACTTCAAAATTAGACACAACATTGAAGATGGAAGCGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCCACACAATCTGCCCTTTCGAAAGATCCCAACGAAAAGAGAGACCACATGGTCCTTCTTGAGTTTGTAACAGCTGCTGGGATTACACATGGCATGGATGAACTATACAAATAA"""
# Build Gradio interface
# Layout: an intro banner, then one row with two columns of equal scale
# (inputs/options on the left, results on the right), the event wiring, and
# an "About" footer. Component creation order determines on-screen order.
with gr.Blocks(
    title="Codon Optimizer",
    theme=gr.themes.Soft(),
) as demo:
    # NOTE(review): the intro and About text advertise NSGA-III, but
    # optimize_codon() calls optimize_sequence() with use_nsga3=False -
    # confirm the wording matches what actually runs.
    gr.Markdown("""
    # Codon Optimizer
    `joeyisgoed/codon-optimizer`
    Multi-objective codon optimization tool based on the GenScript GenSmart algorithm.
    Uses NSGA-III genetic algorithm to optimize for:
    - **Harmony Index**: Match codon usage to highly-expressed genes
    - **Codon Context Index**: Optimize codon pair preferences
    - **Outlier Index**: Minimize adverse sequence features
    Enter a protein or DNA sequence below to optimize it for your target expression host.
    """)
    with gr.Row():
        # Left column: sequence input and optimization options.
        with gr.Column(scale=1):
            input_sequence = gr.Textbox(
                label="Input Sequence",
                placeholder="Paste your protein or DNA sequence here...",
                lines=8,
                max_lines=20,
            )
            with gr.Row():
                # These three values are the ones optimize_codon() compares against.
                sequence_type = gr.Radio(
                    choices=["Auto-detect", "Protein", "DNA"],
                    value="Auto-detect",
                    label="Sequence Type",
                )
                organism = gr.Dropdown(
                    choices=get_organism_list(),
                    value="Escherichia coli K12",
                    label="Target Organism",
                )
            # Choices carry a "[Type ...] " prefix; get_excluded_sites() strips
            # it again via parse_enzyme_name() before looking up the sequence.
            with gr.Accordion("Exclude Restriction Sites", open=False):
                gr.Markdown("*Select restriction enzyme sites to avoid in the optimized sequence*")
                with gr.Accordion("Type II Enzymes (Standard)", open=False):
                    type_ii_sites = gr.CheckboxGroup(
                        choices=[f"[Type II] {name}" for name in sorted(TYPE_II_ENZYMES.keys())],
                        label="Type II",
                        info="Orthodox enzymes that cut within recognition sequence",
                    )
                # Only this sub-accordion starts expanded.
                with gr.Accordion("Type IIS Enzymes (Golden Gate/MoClo)", open=True):
                    type_iis_sites = gr.CheckboxGroup(
                        choices=[f"[Type IIS] {name}" for name in sorted(TYPE_IIS_ENZYMES.keys())],
                        label="Type IIS",
                        info="Cut outside recognition site - used for scarless cloning",
                    )
                with gr.Accordion("Type III Enzymes", open=False):
                    type_iii_sites = gr.CheckboxGroup(
                        choices=[f"[Type III] {name}" for name in sorted(TYPE_III_ENZYMES.keys())],
                        label="Type III",
                        info="Require two recognition sites in inverse orientation",
                    )
            # Labels must match the keys of quality_map in optimize_codon().
            optimization_quality = gr.Radio(
                choices=[
                    "Fast",
                    "Standard",
                    "Thorough",
                ],
                value="Fast",  # Default to fast for better UX
                label="Optimization Quality",
            )
            with gr.Row():
                optimize_btn = gr.Button("Optimize Sequence", variant="primary", size="lg")
                clear_btn = gr.Button("Clear", size="lg")
            with gr.Accordion("Example Sequences", open=False):
                example_protein_btn = gr.Button("Load GFP Protein")
                example_dna_btn = gr.Button("Load GFP DNA")
        # Right column: results, in the order optimize_codon() returns them.
        with gr.Column(scale=1):
            input_info = gr.Markdown(label="Input Analysis")
            optimized_dna = gr.Textbox(
                label="Optimized DNA Sequence",
                lines=8,
                max_lines=20,
                show_copy_button=True,
            )
            protein_output = gr.Textbox(
                label="Protein Sequence",
                lines=4,
                max_lines=10,
                show_copy_button=True,
            )
            metrics_output = gr.Markdown(label="Optimization Metrics")
            # Created hidden and never made visible in this block; error text
            # also reaches the user through input_info.
            error_output = gr.Textbox(label="Errors", visible=False)
    # Event handlers
    # Input/output list order must match optimize_codon()'s parameters and
    # its 5-tuple return value.
    optimize_btn.click(
        fn=optimize_codon,
        inputs=[input_sequence, sequence_type, organism, type_ii_sites, type_iis_sites, type_iii_sites, optimization_quality],
        outputs=[input_info, optimized_dna, protein_output, metrics_output, error_output],
    )
    # Clear resets the five result components only; the input box and the
    # option widgets keep their current values.
    clear_btn.click(
        fn=lambda: ("", "", "", "", ""),
        outputs=[input_info, optimized_dna, protein_output, metrics_output, error_output],
    )
    # Example loaders strip newlines so the sequence lands as a single line.
    example_protein_btn.click(
        fn=lambda: EXAMPLE_PROTEIN.replace('\n', ''),
        outputs=[input_sequence],
    )
    example_dna_btn.click(
        fn=lambda: EXAMPLE_DNA.replace('\n', ''),
        outputs=[input_sequence],
    )
    # NOTE(review): download_fasta() is not connected to any component in
    # this block.
    gr.Markdown("""
    ---
    ### About
    This tool implements a codon optimization algorithm inspired by GenScript's GenSmart system
    ([Patent WO2020024917A1](https://patents.google.com/patent/WO2020024917A1/en)).
    **Features:**
    - Multi-objective optimization using NSGA-III algorithm
    - Support for 10 common expression hosts
    - Optional restriction site exclusion
    - Comprehensive optimization metrics
    **References:**
    - GenScript GenSmart Codon Optimization
    - NSGA-III: Deb & Jain (2014)
    - Codon Adaptation Index (CAI): Sharp & Li (1987)
    """)

# Start the app only when this file is executed directly.
if __name__ == "__main__":
    demo.launch()