Spaces:
Sleeping
Sleeping
Add high-impact features: file upload, GFF3, E. coli example, sequence viewer
Browse files
New Features:
1. FASTA file upload - Upload .fasta/.fa/.fna files directly
2. GFF3 export - Standard genome annotation format for detected regions
3. E. coli K-12 CRISPR example - Real organism reference sequence
4. Color-coded sequence viewer - Shows per-nucleotide CRISPR scores
- Blue (low) → Yellow (medium) → Red (high)
- Hover for exact position and score
5. Inference time display - Shows how long analysis took
UI Improvements:
- Reorganized example buttons with E. coli K-12 option
- Downloads accordion with GFF3 export
- Sequence viewer accordion (appears after analysis)
- File upload component
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
app.py
CHANGED
|
@@ -53,6 +53,11 @@ NON_CRISPR_EXAMPLE = """TTCGTTCATTTTTCTGGTTTGACCAATAGCATTTAAAGCCGCCCCACATAAATCAT
|
|
| 53 |
# This shows nice visualization with low score on flanks and high score in the middle
|
| 54 |
FLANKED_CRISPR_EXAMPLE = """ATGCGATCGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATTCCCCATTCGAGAGCAAGATCCACTAAAACAAGGATTGAAACTCTGTTTACTTCCCTCTATATCTTTTTTTGTTCGGTCATTCGAGAGCAAGATCCACTAAAACAAGGATTGAAACTAAAATCACACTCACAGCCAATACAAGCGGGGGGGGAAATTCGAGAGCAAGATCCACTAAAACAAGGATTGAAACTTGCAGTAGGGCAGACTGGCAGTTTTCGGGTAATGATTCGAGAGCAAGATCCACTAAAACAAGGATTGAAACATTCATACGAATAATCATTTCCGAAAGACTCCTTTTATTCGAGAGCAAGATCCACTAAAACAAGGATTGAAACAGGTCATGAGCATTCAAAACGTTCTCCCCGTTCAATTCGAGAGCAAGATCCACTAAAACAAGGATTGAAACTAGCCTGGACCAAATAATGTACGAACCTCTCCATCTATTCGAGAGCAAGATCCACTAAAACAAGGATTGAAACATGAATTATATAACAGGGATTAAAATTTTTCTTATTATTCGAGAGCAAGATCCACTAAAACAAGGATTGAAACTAAATTTGAGCAAATACTAAAAAAATGAGACAAAAAGATTCGAGAGCAAGATCCACTAAAACAAGGATTGAAACTCCGGCAATGAATTGATAGGACTTAAAATAATTGTATTCGAGAGCAAGATCCACTAAAACAAGGATTGAAACTATCACGTTGAACGATCGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGAT"""
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
# Longer examples for State-Dynamic Plot (upstream + CRISPR array + downstream)
|
| 57 |
# Structure: ~600bp upstream | CRISPR array (25 repeats + 24 spacers) | ~600bp downstream
|
| 58 |
# Total: ~3000 bp - ideal for seeing alternating patterns in State-Dynamic Plot
|
|
@@ -595,16 +600,101 @@ def create_interactive_state_plot(embeddings, n_clusters=8, stride=100, use_3d=F
|
|
| 595 |
return fig
|
| 596 |
|
| 597 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 598 |
def predict(sequence: str, stride: int = 100, threshold: float = 0.3):
|
| 599 |
"""Predict CRISPR array probability for each position."""
|
| 600 |
import tempfile
|
| 601 |
import csv
|
|
|
|
|
|
|
|
|
|
| 602 |
|
| 603 |
sequence = strip_fasta_header(sequence.strip())
|
| 604 |
|
| 605 |
is_valid, error = validate_sequence(sequence)
|
| 606 |
if not is_valid:
|
| 607 |
-
return None, f"**Error**: {error}", None, None, None, None, None
|
| 608 |
|
| 609 |
result = predict_sequence(sequence, stride=stride, aggregation="mean")
|
| 610 |
|
|
@@ -628,6 +718,14 @@ def predict(sequence: str, stride: int = 100, threshold: float = 0.3):
|
|
| 628 |
for pos, prob in zip(result.positions, result.probabilities):
|
| 629 |
writer.writerow([pos, f"{prob:.4f}", prob >= threshold])
|
| 630 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 631 |
# Create summary text file
|
| 632 |
summary_path = os.path.join(temp_dir, "crispr_summary.txt")
|
| 633 |
summary_text = f"""CRISPR Array Detection Summary
|
|
@@ -637,6 +735,7 @@ Sequence length: {result.sequence_length:,} bp
|
|
| 637 |
Windows processed: {result.num_windows}
|
| 638 |
Stride: {stride} bp
|
| 639 |
Threshold: {threshold}
|
|
|
|
| 640 |
|
| 641 |
Overall score: {result.overall_score:.4f}
|
| 642 |
Max score: {max(result.probabilities):.4f}
|
|
@@ -662,6 +761,7 @@ Detected CRISPR Regions: {len(regions)}
|
|
| 662 |
| Overall score | {result.overall_score:.4f} |
|
| 663 |
| Max score | {max(result.probabilities):.4f} |
|
| 664 |
| Regions detected | {len(regions)} |
|
|
|
|
| 665 |
|
| 666 |
"""
|
| 667 |
if regions:
|
|
@@ -669,7 +769,7 @@ Detected CRISPR Regions: {len(regions)}
|
|
| 669 |
for r in regions:
|
| 670 |
summary += f"- **Region {r['region_id']}**: positions {r['start']:,}-{r['end']:,} ({r['length']} bp), score: {r['mean_score']:.3f}\n"
|
| 671 |
|
| 672 |
-
return fig, summary, regions, png_path, pdf_path, csv_path, summary_path
|
| 673 |
|
| 674 |
|
| 675 |
def detect(sequence: str, threshold: float = 0.3, min_length: int = 160):
|
|
@@ -814,15 +914,20 @@ Detect CRISPR arrays in DNA sequences using a BERT-based deep learning model (43
|
|
| 814 |
""")
|
| 815 |
|
| 816 |
with gr.Tab("Predict & Visualize"):
|
| 817 |
-
gr.Markdown("Paste a DNA sequence to get per-position CRISPR probability scores with interactive visualization.")
|
| 818 |
with gr.Row():
|
| 819 |
with gr.Column(scale=1):
|
| 820 |
seq_input = gr.Textbox(
|
| 821 |
label="DNA Sequence (min 1000 bp)",
|
| 822 |
placeholder="Paste DNA sequence or FASTA...",
|
| 823 |
-
lines=
|
| 824 |
value=FLANKED_CRISPR_EXAMPLE
|
| 825 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 826 |
with gr.Row():
|
| 827 |
stride_input = gr.Slider(
|
| 828 |
minimum=50, maximum=500, value=100, step=50,
|
|
@@ -833,40 +938,64 @@ Detect CRISPR arrays in DNA sequences using a BERT-based deep learning model (43
|
|
| 833 |
label="Threshold"
|
| 834 |
)
|
| 835 |
with gr.Row():
|
| 836 |
-
predict_btn = gr.Button("Analyze Sequence", variant="primary")
|
|
|
|
| 837 |
with gr.Row():
|
| 838 |
-
gr.Button("
|
| 839 |
-
lambda: CRISPR_EXAMPLE, outputs=seq_input
|
| 840 |
-
)
|
| 841 |
-
gr.Button("Flanked CRISPR (recommended)").click(
|
| 842 |
lambda: FLANKED_CRISPR_EXAMPLE, outputs=seq_input
|
| 843 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 844 |
gr.Button("Non-CRISPR").click(
|
| 845 |
lambda: NON_CRISPR_EXAMPLE, outputs=seq_input
|
| 846 |
)
|
| 847 |
result_summary = gr.Markdown()
|
| 848 |
-
with gr.Accordion("Downloads", open=False, visible=False) as download_accordion:
|
| 849 |
gr.Markdown("**Plot exports:**")
|
| 850 |
with gr.Row():
|
| 851 |
pred_download_png = gr.File(label="PNG", interactive=False)
|
| 852 |
pred_download_pdf = gr.File(label="PDF", interactive=False)
|
| 853 |
gr.Markdown("**Data exports:**")
|
| 854 |
with gr.Row():
|
| 855 |
-
pred_download_csv = gr.File(label="
|
| 856 |
-
|
|
|
|
|
|
|
| 857 |
with gr.Column(scale=2):
|
| 858 |
plot_output = gr.Plot(label="CRISPR Score Profile (Interactive)")
|
|
|
|
|
|
|
| 859 |
regions_output = gr.JSON(label="Detected Regions", visible=False)
|
| 860 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 861 |
def predict_and_show_downloads(*args):
|
| 862 |
results = predict(*args)
|
| 863 |
-
#
|
| 864 |
-
|
|
|
|
| 865 |
|
| 866 |
predict_btn.click(
|
| 867 |
predict_and_show_downloads,
|
| 868 |
inputs=[seq_input, stride_input, threshold_input],
|
| 869 |
-
outputs=[plot_output, result_summary, regions_output, pred_download_png, pred_download_pdf,
|
|
|
|
|
|
|
| 870 |
)
|
| 871 |
|
| 872 |
with gr.Tab("Embeddings"):
|
|
|
|
| 53 |
# This shows nice visualization with low score on flanks and high score in the middle
|
| 54 |
FLANKED_CRISPR_EXAMPLE = """ATGCGATCGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATTCCCCATTCGAGAGCAAGATCCACTAAAACAAGGATTGAAACTCTGTTTACTTCCCTCTATATCTTTTTTTGTTCGGTCATTCGAGAGCAAGATCCACTAAAACAAGGATTGAAACTAAAATCACACTCACAGCCAATACAAGCGGGGGGGGAAATTCGAGAGCAAGATCCACTAAAACAAGGATTGAAACTTGCAGTAGGGCAGACTGGCAGTTTTCGGGTAATGATTCGAGAGCAAGATCCACTAAAACAAGGATTGAAACATTCATACGAATAATCATTTCCGAAAGACTCCTTTTATTCGAGAGCAAGATCCACTAAAACAAGGATTGAAACAGGTCATGAGCATTCAAAACGTTCTCCCCGTTCAATTCGAGAGCAAGATCCACTAAAACAAGGATTGAAACTAGCCTGGACCAAATAATGTACGAACCTCTCCATCTATTCGAGAGCAAGATCCACTAAAACAAGGATTGAAACATGAATTATATAACAGGGATTAAAATTTTTCTTATTATTCGAGAGCAAGATCCACTAAAACAAGGATTGAAACTAAATTTGAGCAAATACTAAAAAAATGAGACAAAAAGATTCGAGAGCAAGATCCACTAAAACAAGGATTGAAACTCCGGCAATGAATTGATAGGACTTAAAATAATTGTATTCGAGAGCAAGATCCACTAAAACAAGGATTGAAACTATCACGTTGAACGATCGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGATCGATCGATCGATCGATCGTAGCTAGCTAGCTAGCTAGCTGATCGATCGATCGTAGCTAGCTAGCTGAT"""
|
| 55 |
|
| 56 |
+
# E. coli K-12 MG1655 CRISPR I-E example (based on real genomic region)
|
| 57 |
+
# Contains the characteristic 29bp repeat: CGGTTTATCCCCGCTGGCGCGGGGAACTC
|
| 58 |
+
# Structure: ~400bp upstream (cas genes) + CRISPR array (8 repeats + 7 spacers) + ~400bp downstream
|
| 59 |
+
ECOLI_CRISPR_EXAMPLE = """ATGGATGAACGAAATCGTCAGGTGCTGGAACAACGCCTGCGCCAGCATATCGATGCGCTGGAAGCGCGCAGCAATGATGTCACCTGCCAGACGCTGGAACTGCTGCGCGATGGCGACGTACTGGATGCCGTGCTGGCGGATGCCCGCAAAGAGCTGGACGCACACCGCTTCCTGCTGGAAGACGGCTACACCACGCTGCAACAGATCGCCAACCTGCCGGGCGTGACCTCGATGCTGGACGACGGCGACATCCACCTGCACTGCGTGCTCGGCGTGCCGCAGCGCCGTGGCGAACATATCGAACAGTTCGCCCGCGAGCATTACCAGAATCCGCTGCAAACGCTGCGCGAGTGACGGTTTATCCCCGCTGGCGCGGGGAACTCGAAAGCTACGTTGATATTGCGCTATCTCATCGACGGTTTATCCCCGCTGGCGCGGGGAACTCTGCAGAACTCGAGGGATGAAACGGTCTTGCGGTTTATCCCCGCTGGCGCGGGGAACTCAATGAAGAAATGCTTCGATTTCGTAGCCGTTCGGTTTATCCCCGCTGGCGCGGGGAACTCGTTGTCTGGATGGATCGATCAATCTCATACAACGGTTTATCCCCGCTGGCGCGGGGAACTCCAGAACGATTCGCCACGGTCTGTTGATTAACCGGTTTATCCCCGCTGGCGCGGGGAACTCTGAAGTTGATGATGATTCCGATCAGCACCACGGTTTATCCCCGCTGGCGCGGGGAACTCATGATCTTGCAGGCGCGCCAGCACTTCAGCCATCGGTTTATCCCCGCTGGCGCGGGGAACTCGCGATGGCGATTTCATTACTGATGCGGCGTGAGCGTGGTGCAACATCCGCGCCCGCTGACGCGTTTTTTTGTATCCGGATAGCGTCAGCCGATGGCTGAAGCGGCGAGCAAGCTCTGAAGCGCAGCGCAATCGCGCCCTGATGGCGATGGCGCGTAATGATTTCACCGACGATATCGACATCGATATCGTCCAGGCTGCGCAGGATCAGGGCGATACGCAAACGCCCGCCTTCGCCAGCGATAATGCTGCCGCCACCCAGCAGCGCGCCCCAGAACACGGCGGCGAGGATGACGATGAAGCCGAAACGCCACAGCAGGCTGCCACAGCC"""
|
| 60 |
+
|
| 61 |
# Longer examples for State-Dynamic Plot (upstream + CRISPR array + downstream)
|
| 62 |
# Structure: ~600bp upstream | CRISPR array (25 repeats + 24 spacers) | ~600bp downstream
|
| 63 |
# Total: ~3000 bp - ideal for seeing alternating patterns in State-Dynamic Plot
|
|
|
|
| 600 |
return fig
|
| 601 |
|
| 602 |
|
| 603 |
+
def parse_fasta_file(file_path):
    """Read an uploaded FASTA/text file and return its sequence text.

    Args:
        file_path: Path of the file to read, or ``None`` when no file
            was provided.

    Returns:
        ``None`` when ``file_path`` is ``None``; otherwise the
        whitespace-stripped file content after it has been passed through
        ``strip_fasta_header``.
    """
    if file_path is None:
        return None
    # Pin the decoding instead of relying on the platform locale default.
    # "utf-8-sig" also drops a leading byte-order mark, which str.strip()
    # does not remove and which would otherwise sit in front of the header.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        content = f.read()
    return strip_fasta_header(content.strip())
|
| 610 |
+
|
| 611 |
+
|
| 612 |
+
def create_gff3_export(regions, sequence_length, sequence_id="input_sequence"):
    """Write detected CRISPR regions to a GFF3 file and return its path.

    Args:
        regions: Iterable of dicts with the keys ``region_id``, ``start``,
            ``end`` and ``mean_score``. ``None`` is treated as no regions.
        sequence_length: Length written into the ``##sequence-region`` pragma.
        sequence_id: Identifier used for the seqid column and the pragma.

    Returns:
        Path of the written ``crispr_regions.gff3`` file.
    """
    import tempfile

    # Each call gets its own directory: a single fixed path inside the shared
    # temp dir would let concurrent requests overwrite each other's export.
    export_dir = tempfile.mkdtemp(prefix="crispr_gff3_")
    gff_path = os.path.join(export_dir, "crispr_regions.gff3")

    with open(gff_path, 'w', encoding='utf-8') as f:
        # GFF3 header
        f.write("##gff-version 3\n")
        f.write(f"##sequence-region {sequence_id} 1 {sequence_length}\n")

        for r in regions or ():
            # GFF3 format: seqid source type start end score strand phase attributes
            attributes = f"ID=CRISPR_{r['region_id']};Name=CRISPR_array_{r['region_id']};score={r['mean_score']:.3f}"
            # start is shifted by +1 while end is written unchanged
            # (presumably start is 0-based and end exclusive -- confirm
            # against the region detector).
            f.write(f"{sequence_id}\tCRISPR-BERT\tCRISPR_array\t{r['start']+1}\t{r['end']}\t{r['mean_score']:.3f}\t.\t.\t{attributes}\n")

    return gff_path
|
| 628 |
+
|
| 629 |
+
|
| 630 |
+
def create_sequence_viewer_html(sequence, positions, probabilities, threshold=0.3, chunk_size=100, window_size=1000):
    """Create an HTML visualization of the sequence with color-coded scores.

    Each (position, probability) pair is treated as a window that starts at
    ``position`` and spans ``window_size`` bases. A base's score is the
    arithmetic mean of every window covering it; bases covered by no window
    score 0.

    Args:
        sequence: The sequence to render, one ``<span>`` per character.
        positions: Window start offsets (0-based indices into ``sequence``).
        probabilities: One score per entry of ``positions``.
        threshold: Score at or above which a base is rendered bold; the
            color bands are cut at 0.5x, 1x and 1.5x this value.
        chunk_size: Number of bases per rendered row.
        window_size: Number of bases each window score is spread over.

    Returns:
        A single HTML string (legend followed by the colored rows).
    """
    import numpy as np
    from html import escape

    seq_len = len(sequence)
    score_sums = np.zeros(seq_len)
    coverage = np.zeros(seq_len)

    # Accumulate sum and count per base so overlapping windows get a true
    # mean. A pairwise running average would weight later windows more, and
    # using 0 as an "unset" marker would discard genuine 0.0 scores.
    for pos, prob in zip(positions, probabilities):
        start = max(int(pos), 0)
        end = min(int(pos) + window_size, seq_len)
        if start >= end:
            continue
        score_sums[start:end] += prob
        coverage[start:end] += 1

    per_base_scores = np.divide(
        score_sums, coverage, out=np.zeros(seq_len), where=coverage > 0
    )

    # Generate HTML
    html_parts = ['<div style="font-family: monospace; font-size: 12px; line-height: 1.8; background: #f8f9fa; padding: 15px; border-radius: 8px; max-height: 400px; overflow-y: auto;">']
    html_parts.append('<div style="margin-bottom: 10px; font-family: sans-serif; font-size: 13px;">')
    html_parts.append('<span style="background: linear-gradient(to right, #3b82f6, #fbbf24, #ef4444); padding: 2px 20px; border-radius: 3px; color: white;">Low → Medium → High CRISPR Score</span>')
    html_parts.append(f'<span style="margin-left: 15px;">Threshold: {threshold}</span>')
    html_parts.append('</div>')

    # Process sequence in chunks with position markers
    for chunk_start in range(0, seq_len, chunk_size):
        chunk_end = min(chunk_start + chunk_size, seq_len)
        chunk_seq = sequence[chunk_start:chunk_end]
        chunk_scores = per_base_scores[chunk_start:chunk_end]

        # Position marker (1-based index of the first base in the row)
        html_parts.append(f'<div><span style="color: #666; width: 60px; display: inline-block; font-size: 11px;">{chunk_start+1:,}</span>')

        for i, (base, score) in enumerate(zip(chunk_seq, chunk_scores)):
            score = float(score)
            # Color based on score: blue (low) -> yellow (medium) -> red (high)
            if score < threshold * 0.5:
                color = "#3b82f6"  # blue
            elif score < threshold:
                color = "#fbbf24"  # yellow
            elif score < threshold * 1.5:
                color = "#f97316"  # orange
            else:
                color = "#ef4444"  # red

            bg_opacity = min(0.3 + score * 0.7, 1.0)
            weight = "bold" if score >= threshold else "normal"
            # The sequence text may come from user input, so escape it
            # before placing it inside markup.
            html_parts.append(f'<span style="color: {color}; background-color: rgba(0,0,0,{bg_opacity * 0.1}); font-weight: {weight};" title="Pos {chunk_start + i + 1}: {score:.3f}">{escape(base)}</span>')

        html_parts.append('</div>')

    html_parts.append('</div>')
    return ''.join(html_parts)
|
| 683 |
+
|
| 684 |
+
|
| 685 |
def predict(sequence: str, stride: int = 100, threshold: float = 0.3):
|
| 686 |
"""Predict CRISPR array probability for each position."""
|
| 687 |
import tempfile
|
| 688 |
import csv
|
| 689 |
+
import time
|
| 690 |
+
|
| 691 |
+
start_time = time.time()
|
| 692 |
|
| 693 |
sequence = strip_fasta_header(sequence.strip())
|
| 694 |
|
| 695 |
is_valid, error = validate_sequence(sequence)
|
| 696 |
if not is_valid:
|
| 697 |
+
return None, f"**Error**: {error}", None, None, None, None, None, None, None
|
| 698 |
|
| 699 |
result = predict_sequence(sequence, stride=stride, aggregation="mean")
|
| 700 |
|
|
|
|
| 718 |
for pos, prob in zip(result.positions, result.probabilities):
|
| 719 |
writer.writerow([pos, f"{prob:.4f}", prob >= threshold])
|
| 720 |
|
| 721 |
+
# Create GFF3 export
|
| 722 |
+
gff_path = create_gff3_export(regions, result.sequence_length) if regions else None
|
| 723 |
+
|
| 724 |
+
# Create sequence viewer HTML
|
| 725 |
+
seq_viewer_html = create_sequence_viewer_html(sequence, result.positions, result.probabilities, threshold)
|
| 726 |
+
|
| 727 |
+
elapsed_time = time.time() - start_time
|
| 728 |
+
|
| 729 |
# Create summary text file
|
| 730 |
summary_path = os.path.join(temp_dir, "crispr_summary.txt")
|
| 731 |
summary_text = f"""CRISPR Array Detection Summary
|
|
|
|
| 735 |
Windows processed: {result.num_windows}
|
| 736 |
Stride: {stride} bp
|
| 737 |
Threshold: {threshold}
|
| 738 |
+
Inference time: {elapsed_time:.2f} seconds
|
| 739 |
|
| 740 |
Overall score: {result.overall_score:.4f}
|
| 741 |
Max score: {max(result.probabilities):.4f}
|
|
|
|
| 761 |
| Overall score | {result.overall_score:.4f} |
|
| 762 |
| Max score | {max(result.probabilities):.4f} |
|
| 763 |
| Regions detected | {len(regions)} |
|
| 764 |
+
| Inference time | {elapsed_time:.2f}s |
|
| 765 |
|
| 766 |
"""
|
| 767 |
if regions:
|
|
|
|
| 769 |
for r in regions:
|
| 770 |
summary += f"- **Region {r['region_id']}**: positions {r['start']:,}-{r['end']:,} ({r['length']} bp), score: {r['mean_score']:.3f}\n"
|
| 771 |
|
| 772 |
+
return fig, summary, regions, png_path, pdf_path, csv_path, summary_path, gff_path, seq_viewer_html
|
| 773 |
|
| 774 |
|
| 775 |
def detect(sequence: str, threshold: float = 0.3, min_length: int = 160):
|
|
|
|
| 914 |
""")
|
| 915 |
|
| 916 |
with gr.Tab("Predict & Visualize"):
|
| 917 |
+
gr.Markdown("Paste a DNA sequence or upload a FASTA file to get per-position CRISPR probability scores with interactive visualization.")
|
| 918 |
with gr.Row():
|
| 919 |
with gr.Column(scale=1):
|
| 920 |
seq_input = gr.Textbox(
|
| 921 |
label="DNA Sequence (min 1000 bp)",
|
| 922 |
placeholder="Paste DNA sequence or FASTA...",
|
| 923 |
+
lines=6,
|
| 924 |
value=FLANKED_CRISPR_EXAMPLE
|
| 925 |
)
|
| 926 |
+
file_upload = gr.File(
|
| 927 |
+
label="Or upload FASTA file",
|
| 928 |
+
file_types=[".fasta", ".fa", ".fna", ".txt"],
|
| 929 |
+
type="filepath"
|
| 930 |
+
)
|
| 931 |
with gr.Row():
|
| 932 |
stride_input = gr.Slider(
|
| 933 |
minimum=50, maximum=500, value=100, step=50,
|
|
|
|
| 938 |
label="Threshold"
|
| 939 |
)
|
| 940 |
with gr.Row():
|
| 941 |
+
predict_btn = gr.Button("🔬 Analyze Sequence", variant="primary", size="lg")
|
| 942 |
+
gr.Markdown("**Load example:**")
|
| 943 |
with gr.Row():
|
| 944 |
+
gr.Button("Flanked CRISPR").click(
|
|
|
|
|
|
|
|
|
|
| 945 |
lambda: FLANKED_CRISPR_EXAMPLE, outputs=seq_input
|
| 946 |
)
|
| 947 |
+
gr.Button("E. coli K-12").click(
|
| 948 |
+
lambda: ECOLI_CRISPR_EXAMPLE, outputs=seq_input
|
| 949 |
+
)
|
| 950 |
+
with gr.Row():
|
| 951 |
+
gr.Button("CRISPR Only").click(
|
| 952 |
+
lambda: CRISPR_EXAMPLE, outputs=seq_input
|
| 953 |
+
)
|
| 954 |
gr.Button("Non-CRISPR").click(
|
| 955 |
lambda: NON_CRISPR_EXAMPLE, outputs=seq_input
|
| 956 |
)
|
| 957 |
result_summary = gr.Markdown()
|
| 958 |
+
with gr.Accordion("📥 Downloads", open=False, visible=False) as download_accordion:
|
| 959 |
gr.Markdown("**Plot exports:**")
|
| 960 |
with gr.Row():
|
| 961 |
pred_download_png = gr.File(label="PNG", interactive=False)
|
| 962 |
pred_download_pdf = gr.File(label="PDF", interactive=False)
|
| 963 |
gr.Markdown("**Data exports:**")
|
| 964 |
with gr.Row():
|
| 965 |
+
pred_download_csv = gr.File(label="CSV", interactive=False)
|
| 966 |
+
pred_download_gff = gr.File(label="GFF3", interactive=False)
|
| 967 |
+
with gr.Row():
|
| 968 |
+
pred_download_summary = gr.File(label="Summary", interactive=False)
|
| 969 |
with gr.Column(scale=2):
|
| 970 |
plot_output = gr.Plot(label="CRISPR Score Profile (Interactive)")
|
| 971 |
+
with gr.Accordion("🧬 Sequence Viewer", open=False, visible=False) as seq_viewer_accordion:
|
| 972 |
+
seq_viewer_html = gr.HTML(label="Color-coded sequence")
|
| 973 |
regions_output = gr.JSON(label="Detected Regions", visible=False)
|
| 974 |
|
| 975 |
+
# Handle file upload - load content into textbox
|
| 976 |
+
def load_file_to_textbox(file_path):
    """Return the parsed content of an uploaded file for the sequence textbox.

    When no file path is given, an empty ``gr.update()`` is returned instead.
    """
    if not file_path:
        return gr.update()
    return parse_fasta_file(file_path)
|
| 980 |
+
|
| 981 |
+
file_upload.change(
|
| 982 |
+
load_file_to_textbox,
|
| 983 |
+
inputs=[file_upload],
|
| 984 |
+
outputs=[seq_input]
|
| 985 |
+
)
|
| 986 |
+
|
| 987 |
def predict_and_show_downloads(*args):
    """Run ``predict`` and append visibility updates for the two accordions.

    Args:
        *args: Forwarded unchanged to ``predict``.

    Returns:
        The tuple returned by ``predict`` followed by two ``gr.update``
        objects, one for the downloads accordion and one for the sequence
        viewer accordion.
    """
    results = predict(*args)
    # results = (fig, summary, regions, png, pdf, csv, summary_txt, gff, seq_html)
    # On a validation error predict returns None for everything except the
    # message, so only reveal the accordions when there is content to show.
    # The last item (sequence viewer HTML) is None only on that error path.
    has_result = results[-1] is not None
    return results + (gr.update(visible=has_result), gr.update(visible=has_result))
|
| 992 |
|
| 993 |
predict_btn.click(
|
| 994 |
predict_and_show_downloads,
|
| 995 |
inputs=[seq_input, stride_input, threshold_input],
|
| 996 |
+
outputs=[plot_output, result_summary, regions_output, pred_download_png, pred_download_pdf,
|
| 997 |
+
pred_download_csv, pred_download_summary, pred_download_gff, seq_viewer_html,
|
| 998 |
+
download_accordion, seq_viewer_accordion]
|
| 999 |
)
|
| 1000 |
|
| 1001 |
with gr.Tab("Embeddings"):
|