Spaces:
Sleeping
Sleeping
Add API documentation, help tooltips, zoom controls, and professional text
Browse files
- Add API Documentation tab with Python client and cURL examples
- Add help tooltips to Stride and Threshold sliders explaining their function
- Add zoom controls (500bp, 1kb, 5kb, Full) using Plotly rangeselector
- Add minimap navigator using Plotly rangeslider for sequence overview
- Remove emojis throughout UI for professional appearance
- Update main description with technical/scientific language
- Update Embeddings tab with detailed mode descriptions
- Update About tab with architecture table, training details, parameter reference
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
app.py
CHANGED
|
@@ -98,18 +98,20 @@ def create_prediction_plot(positions, probabilities, threshold=0.3, regions=None
|
|
| 98 |
|
| 99 |
|
| 100 |
def create_interactive_prediction_plot(positions, probabilities, threshold=0.3, regions=None):
|
| 101 |
-
"""Create an interactive Plotly figure showing the prediction curve."""
|
| 102 |
fig = go.Figure()
|
| 103 |
|
|
|
|
|
|
|
| 104 |
# Main probability curve with fill
|
| 105 |
fig.add_trace(go.Scatter(
|
| 106 |
x=positions,
|
| 107 |
y=probabilities,
|
| 108 |
mode='lines',
|
| 109 |
-
name='
|
| 110 |
-
line=dict(color='
|
| 111 |
fill='tozeroy',
|
| 112 |
-
fillcolor='rgba(
|
| 113 |
hovertemplate='Position: %{x:,} bp<br>Score: %{y:.3f}<extra></extra>'
|
| 114 |
))
|
| 115 |
|
|
@@ -117,9 +119,10 @@ def create_interactive_prediction_plot(positions, probabilities, threshold=0.3,
|
|
| 117 |
fig.add_hline(
|
| 118 |
y=threshold,
|
| 119 |
line_dash="dash",
|
| 120 |
-
line_color="
|
| 121 |
annotation_text=f"Threshold ({threshold})",
|
| 122 |
-
annotation_position="top right"
|
|
|
|
| 123 |
)
|
| 124 |
|
| 125 |
# Highlight detected CRISPR regions
|
|
@@ -127,25 +130,73 @@ def create_interactive_prediction_plot(positions, probabilities, threshold=0.3,
|
|
| 127 |
for r in regions:
|
| 128 |
fig.add_vrect(
|
| 129 |
x0=r['start'], x1=r['end'],
|
| 130 |
-
fillcolor="rgba(
|
| 131 |
layer="below",
|
| 132 |
-
line_width=
|
| 133 |
-
|
|
|
|
| 134 |
annotation_position="top left",
|
| 135 |
-
annotation_font_size=10
|
|
|
|
| 136 |
)
|
| 137 |
|
| 138 |
fig.update_layout(
|
| 139 |
-
title=dict(
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
hovermode='x unified',
|
| 145 |
showlegend=True,
|
| 146 |
-
legend=dict(
|
| 147 |
-
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
)
|
| 150 |
|
| 151 |
return fig
|
|
@@ -908,67 +959,72 @@ with gr.Blocks(title="CRISPR Array Detection") as demo:
|
|
| 908 |
gr.Markdown("""
|
| 909 |
# CRISPR Array Detection
|
| 910 |
|
| 911 |
-
|
|
|
|
|
|
|
| 912 |
|
| 913 |
-
**
|
| 914 |
""")
|
| 915 |
|
| 916 |
-
with gr.Tab("
|
| 917 |
-
gr.Markdown("Paste a DNA sequence or upload a FASTA file to get per-position CRISPR probability scores with interactive visualization.")
|
| 918 |
with gr.Row():
|
| 919 |
with gr.Column(scale=1):
|
| 920 |
seq_input = gr.Textbox(
|
| 921 |
-
label="
|
| 922 |
-
placeholder="Paste DNA sequence
|
| 923 |
lines=6,
|
| 924 |
-
value=FLANKED_CRISPR_EXAMPLE
|
|
|
|
| 925 |
)
|
| 926 |
file_upload = gr.File(
|
| 927 |
-
label="
|
| 928 |
file_types=[".fasta", ".fa", ".fna", ".txt"],
|
| 929 |
type="filepath"
|
| 930 |
)
|
| 931 |
with gr.Row():
|
| 932 |
stride_input = gr.Slider(
|
| 933 |
minimum=50, maximum=500, value=100, step=50,
|
| 934 |
-
label="Stride"
|
|
|
|
| 935 |
)
|
| 936 |
threshold_input = gr.Slider(
|
| 937 |
minimum=0.1, maximum=0.9, value=0.3, step=0.05,
|
| 938 |
-
label="Threshold"
|
|
|
|
| 939 |
)
|
| 940 |
with gr.Row():
|
| 941 |
-
predict_btn = gr.Button("
|
| 942 |
-
gr.Markdown("**
|
| 943 |
with gr.Row():
|
| 944 |
gr.Button("Flanked CRISPR").click(
|
| 945 |
lambda: FLANKED_CRISPR_EXAMPLE, outputs=seq_input
|
| 946 |
)
|
| 947 |
-
gr.Button("E. coli K-12").click(
|
| 948 |
lambda: ECOLI_CRISPR_EXAMPLE, outputs=seq_input
|
| 949 |
)
|
| 950 |
with gr.Row():
|
| 951 |
-
gr.Button("CRISPR
|
| 952 |
lambda: CRISPR_EXAMPLE, outputs=seq_input
|
| 953 |
)
|
| 954 |
-
gr.Button("
|
| 955 |
lambda: NON_CRISPR_EXAMPLE, outputs=seq_input
|
| 956 |
)
|
| 957 |
result_summary = gr.Markdown()
|
| 958 |
-
with gr.Accordion("
|
| 959 |
-
gr.Markdown("**
|
| 960 |
with gr.Row():
|
| 961 |
pred_download_png = gr.File(label="PNG", interactive=False)
|
| 962 |
pred_download_pdf = gr.File(label="PDF", interactive=False)
|
| 963 |
-
gr.Markdown("**Data
|
| 964 |
with gr.Row():
|
| 965 |
pred_download_csv = gr.File(label="CSV", interactive=False)
|
| 966 |
pred_download_gff = gr.File(label="GFF3", interactive=False)
|
| 967 |
with gr.Row():
|
| 968 |
pred_download_summary = gr.File(label="Summary", interactive=False)
|
| 969 |
with gr.Column(scale=2):
|
| 970 |
-
plot_output = gr.Plot(label="
|
| 971 |
-
with gr.Accordion("
|
|
|
|
| 972 |
seq_viewer_html = gr.HTML(label="Color-coded sequence")
|
| 973 |
regions_output = gr.JSON(label="Detected Regions", visible=False)
|
| 974 |
|
|
@@ -999,54 +1055,53 @@ Detect CRISPR arrays in DNA sequences using a BERT-based deep learning model (43
|
|
| 999 |
)
|
| 1000 |
|
| 1001 |
with gr.Tab("Embeddings"):
|
| 1002 |
-
gr.Markdown("""
|
|
|
|
| 1003 |
|
| 1004 |
-
|
| 1005 |
|
| 1006 |
-
**
|
| 1007 |
""")
|
| 1008 |
with gr.Row():
|
| 1009 |
with gr.Column(scale=1):
|
| 1010 |
embed_seq = gr.Textbox(
|
| 1011 |
-
label="
|
| 1012 |
placeholder="Paste DNA sequence...",
|
| 1013 |
lines=6,
|
| 1014 |
-
value=EMBEDDING_CRISPR_EXAMPLE
|
|
|
|
| 1015 |
)
|
| 1016 |
embed_mode = gr.Radio(
|
| 1017 |
choices=["state-dynamics", "mean", "max", "trajectory"],
|
| 1018 |
value="state-dynamics",
|
| 1019 |
label="Visualization Mode",
|
| 1020 |
-
info="state-dynamics:
|
| 1021 |
)
|
| 1022 |
use_3d = gr.Checkbox(
|
| 1023 |
-
label="3D
|
| 1024 |
value=False,
|
| 1025 |
-
info="
|
| 1026 |
visible=True
|
| 1027 |
)
|
| 1028 |
with gr.Row():
|
| 1029 |
-
embed_btn = gr.Button("
|
| 1030 |
with gr.Row():
|
| 1031 |
-
gr.Button("
|
| 1032 |
lambda: EMBEDDING_CRISPR_EXAMPLE, outputs=embed_seq
|
| 1033 |
)
|
| 1034 |
-
gr.Button("
|
| 1035 |
lambda: EMBEDDING_RANDOM_EXAMPLE, outputs=embed_seq
|
| 1036 |
)
|
| 1037 |
gr.Markdown("""
|
| 1038 |
-
**
|
| 1039 |
-
- Upstream: 0-600 bp (random)
|
| 1040 |
-
- Array: 600-2364 bp (25 repeats + 24 spacers)
|
| 1041 |
-
- Downstream: 2364-2964 bp (random)
|
| 1042 |
""")
|
| 1043 |
embed_summary = gr.Markdown()
|
| 1044 |
-
with gr.Accordion("
|
| 1045 |
with gr.Row():
|
| 1046 |
download_png = gr.File(label="PNG", interactive=False)
|
| 1047 |
download_pdf = gr.File(label="PDF", interactive=False)
|
| 1048 |
with gr.Column(scale=2):
|
| 1049 |
-
embed_plot = gr.Plot(label="Embedding Visualization
|
| 1050 |
|
| 1051 |
# Show/hide 3D checkbox based on mode
|
| 1052 |
embed_mode.change(
|
|
@@ -1065,40 +1120,121 @@ Visualize how the model's internal representation changes across the sequence. T
|
|
| 1065 |
outputs=[embed_plot, embed_summary, download_png, download_pdf, embed_download_accordion]
|
| 1066 |
)
|
| 1067 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1068 |
with gr.Tab("About"):
|
| 1069 |
gr.Markdown("""
|
| 1070 |
-
## Model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1071 |
|
| 1072 |
-
|
| 1073 |
-
- **Parameters**: ~430 million
|
| 1074 |
-
- **Pre-training**: BERT model trained on metagenomic and genomic microbial sequences
|
| 1075 |
-
- **Fine-tuning**: Trained on annotated CRISPR arrays from bacterial genomes
|
| 1076 |
-
- **Input**: DNA sequence (1000 bp windows)
|
| 1077 |
-
- **Output**: Per-position probability (0 = non-CRISPR, 1 = CRISPR)
|
| 1078 |
|
| 1079 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1080 |
|
| 1081 |
-
|
| 1082 |
-
2. **Click "Analyze Sequence"** to run the model
|
| 1083 |
-
3. **View the plot** showing CRISPR probability along the sequence
|
| 1084 |
-
4. Regions above the threshold (red) are predicted CRISPR arrays
|
| 1085 |
|
| 1086 |
-
|
|
|
|
|
|
|
| 1087 |
|
| 1088 |
-
|
| 1089 |
-
- Adjust **threshold** based on your needs: lower = more sensitive, higher = more specific
|
| 1090 |
-
- The model works best on bacterial/archaeal genomic sequences
|
| 1091 |
|
| 1092 |
-
|
| 1093 |
|
| 1094 |
-
|
| 1095 |
|
| 1096 |
-
## Acknowledgements
|
| 1097 |
|
| 1098 |
-
- Ziyu Mu
|
| 1099 |
-
- DFG SPP 2141 (
|
| 1100 |
-
- BMBF GenomeNet
|
| 1101 |
-
- Helmholtz Centre for Infection Research
|
| 1102 |
""")
|
| 1103 |
|
| 1104 |
|
|
|
|
| 98 |
|
| 99 |
|
| 100 |
def create_interactive_prediction_plot(positions, probabilities, threshold=0.3, regions=None):
|
| 101 |
+
"""Create an interactive Plotly figure showing the prediction curve with minimap."""
|
| 102 |
fig = go.Figure()
|
| 103 |
|
| 104 |
+
max_pos = max(positions) if positions else 1000
|
| 105 |
+
|
| 106 |
# Main probability curve with fill
|
| 107 |
fig.add_trace(go.Scatter(
|
| 108 |
x=positions,
|
| 109 |
y=probabilities,
|
| 110 |
mode='lines',
|
| 111 |
+
name='Prediction Score',
|
| 112 |
+
line=dict(color='#2563eb', width=1.5),
|
| 113 |
fill='tozeroy',
|
| 114 |
+
fillcolor='rgba(37, 99, 235, 0.15)',
|
| 115 |
hovertemplate='Position: %{x:,} bp<br>Score: %{y:.3f}<extra></extra>'
|
| 116 |
))
|
| 117 |
|
|
|
|
| 119 |
fig.add_hline(
|
| 120 |
y=threshold,
|
| 121 |
line_dash="dash",
|
| 122 |
+
line_color="#dc2626",
|
| 123 |
annotation_text=f"Threshold ({threshold})",
|
| 124 |
+
annotation_position="top right",
|
| 125 |
+
annotation_font_size=11
|
| 126 |
)
|
| 127 |
|
| 128 |
# Highlight detected CRISPR regions
|
|
|
|
| 130 |
for r in regions:
|
| 131 |
fig.add_vrect(
|
| 132 |
x0=r['start'], x1=r['end'],
|
| 133 |
+
fillcolor="rgba(220, 38, 38, 0.12)",
|
| 134 |
layer="below",
|
| 135 |
+
line_width=1,
|
| 136 |
+
line_color="rgba(220, 38, 38, 0.3)",
|
| 137 |
+
annotation_text=f"CRISPR {r['region_id']}",
|
| 138 |
annotation_position="top left",
|
| 139 |
+
annotation_font_size=10,
|
| 140 |
+
annotation_font_color="#dc2626"
|
| 141 |
)
|
| 142 |
|
| 143 |
fig.update_layout(
|
| 144 |
+
title=dict(
|
| 145 |
+
text='CRISPR Array Detection',
|
| 146 |
+
font=dict(size=14, color='#1f2937'),
|
| 147 |
+
x=0.5,
|
| 148 |
+
xanchor='center'
|
| 149 |
+
),
|
| 150 |
+
xaxis=dict(
|
| 151 |
+
title='Position (bp)',
|
| 152 |
+
range=[0, max_pos],
|
| 153 |
+
gridcolor='#e5e7eb',
|
| 154 |
+
showgrid=True,
|
| 155 |
+
zeroline=False,
|
| 156 |
+
# Rangeslider for minimap navigation
|
| 157 |
+
rangeslider=dict(
|
| 158 |
+
visible=True,
|
| 159 |
+
thickness=0.08,
|
| 160 |
+
bgcolor='#f3f4f6',
|
| 161 |
+
bordercolor='#d1d5db',
|
| 162 |
+
borderwidth=1
|
| 163 |
+
),
|
| 164 |
+
# Range selector buttons for quick zoom
|
| 165 |
+
rangeselector=dict(
|
| 166 |
+
buttons=list([
|
| 167 |
+
dict(count=500, label="500bp", step="all", stepmode="backward"),
|
| 168 |
+
dict(count=1000, label="1kb", step="all", stepmode="backward"),
|
| 169 |
+
dict(count=5000, label="5kb", step="all", stepmode="backward"),
|
| 170 |
+
dict(step="all", label="Full")
|
| 171 |
+
]),
|
| 172 |
+
bgcolor='#f9fafb',
|
| 173 |
+
bordercolor='#d1d5db',
|
| 174 |
+
font=dict(size=10),
|
| 175 |
+
x=0,
|
| 176 |
+
y=1.15
|
| 177 |
+
)
|
| 178 |
+
),
|
| 179 |
+
yaxis=dict(
|
| 180 |
+
title='CRISPR Probability',
|
| 181 |
+
range=[0, 1.05],
|
| 182 |
+
gridcolor='#e5e7eb',
|
| 183 |
+
showgrid=True,
|
| 184 |
+
zeroline=False,
|
| 185 |
+
tickformat='.1f'
|
| 186 |
+
),
|
| 187 |
hovermode='x unified',
|
| 188 |
showlegend=True,
|
| 189 |
+
legend=dict(
|
| 190 |
+
yanchor="top", y=0.99,
|
| 191 |
+
xanchor="right", x=0.99,
|
| 192 |
+
bgcolor='rgba(255,255,255,0.8)',
|
| 193 |
+
bordercolor='#e5e7eb',
|
| 194 |
+
borderwidth=1
|
| 195 |
+
),
|
| 196 |
+
height=480,
|
| 197 |
+
plot_bgcolor='white',
|
| 198 |
+
paper_bgcolor='white',
|
| 199 |
+
margin=dict(t=80, b=60)
|
| 200 |
)
|
| 201 |
|
| 202 |
return fig
|
|
|
|
| 959 |
gr.Markdown("""
|
| 960 |
# CRISPR Array Detection
|
| 961 |
|
| 962 |
+
A deep learning approach for identifying CRISPR arrays in prokaryotic genome sequences. This tool employs a 24-layer BERT transformer architecture (~430M parameters) that was pre-trained on metagenomic contigs and complete microbial genomes, then fine-tuned on annotated CRISPR array sequences.
|
| 963 |
+
|
| 964 |
+
**Method**: Input sequences are processed using a sliding window approach (1000 bp window, configurable stride). For each window, the model outputs a probability score ∈ [0,1] indicating the likelihood that the central region contains part of a CRISPR array. Overlapping predictions are aggregated to produce per-position scores across the full sequence length.
|
| 965 |
|
| 966 |
+
**Output**: Detected CRISPR regions are reported with genomic coordinates, mean prediction scores, and can be exported in standard formats (GFF3, CSV) for downstream analysis.
|
| 967 |
""")
|
| 968 |
|
| 969 |
+
with gr.Tab("Prediction"):
|
|
|
|
| 970 |
with gr.Row():
|
| 971 |
with gr.Column(scale=1):
|
| 972 |
seq_input = gr.Textbox(
|
| 973 |
+
label="Input Sequence",
|
| 974 |
+
placeholder="Paste DNA sequence (FASTA format accepted)...",
|
| 975 |
lines=6,
|
| 976 |
+
value=FLANKED_CRISPR_EXAMPLE,
|
| 977 |
+
info="Minimum length: 1000 bp. Accepts raw sequence or FASTA format."
|
| 978 |
)
|
| 979 |
file_upload = gr.File(
|
| 980 |
+
label="Upload FASTA File",
|
| 981 |
file_types=[".fasta", ".fa", ".fna", ".txt"],
|
| 982 |
type="filepath"
|
| 983 |
)
|
| 984 |
with gr.Row():
|
| 985 |
stride_input = gr.Slider(
|
| 986 |
minimum=50, maximum=500, value=100, step=50,
|
| 987 |
+
label="Stride (bp)",
|
| 988 |
+
info="Step size between consecutive windows. Lower values increase resolution but require more computation."
|
| 989 |
)
|
| 990 |
threshold_input = gr.Slider(
|
| 991 |
minimum=0.1, maximum=0.9, value=0.3, step=0.05,
|
| 992 |
+
label="Detection Threshold",
|
| 993 |
+
info="Minimum score to classify a region as CRISPR. Lower = more sensitive, higher = more specific."
|
| 994 |
)
|
| 995 |
with gr.Row():
|
| 996 |
+
predict_btn = gr.Button("Run Analysis", variant="primary", size="lg")
|
| 997 |
+
gr.Markdown("**Example sequences:**")
|
| 998 |
with gr.Row():
|
| 999 |
gr.Button("Flanked CRISPR").click(
|
| 1000 |
lambda: FLANKED_CRISPR_EXAMPLE, outputs=seq_input
|
| 1001 |
)
|
| 1002 |
+
gr.Button("E. coli K-12 CRISPR I-E").click(
|
| 1003 |
lambda: ECOLI_CRISPR_EXAMPLE, outputs=seq_input
|
| 1004 |
)
|
| 1005 |
with gr.Row():
|
| 1006 |
+
gr.Button("CRISPR Array").click(
|
| 1007 |
lambda: CRISPR_EXAMPLE, outputs=seq_input
|
| 1008 |
)
|
| 1009 |
+
gr.Button("Negative Control").click(
|
| 1010 |
lambda: NON_CRISPR_EXAMPLE, outputs=seq_input
|
| 1011 |
)
|
| 1012 |
result_summary = gr.Markdown()
|
| 1013 |
+
with gr.Accordion("Export Results", open=False, visible=False) as download_accordion:
|
| 1014 |
+
gr.Markdown("**Figures:**")
|
| 1015 |
with gr.Row():
|
| 1016 |
pred_download_png = gr.File(label="PNG", interactive=False)
|
| 1017 |
pred_download_pdf = gr.File(label="PDF", interactive=False)
|
| 1018 |
+
gr.Markdown("**Data:**")
|
| 1019 |
with gr.Row():
|
| 1020 |
pred_download_csv = gr.File(label="CSV", interactive=False)
|
| 1021 |
pred_download_gff = gr.File(label="GFF3", interactive=False)
|
| 1022 |
with gr.Row():
|
| 1023 |
pred_download_summary = gr.File(label="Summary", interactive=False)
|
| 1024 |
with gr.Column(scale=2):
|
| 1025 |
+
plot_output = gr.Plot(label="Prediction Score Profile")
|
| 1026 |
+
with gr.Accordion("Sequence Viewer", open=False, visible=False) as seq_viewer_accordion:
|
| 1027 |
+
gr.Markdown("*Color scale: blue (low score) → yellow (medium) → red (high score). Hover over nucleotides for exact values.*")
|
| 1028 |
seq_viewer_html = gr.HTML(label="Color-coded sequence")
|
| 1029 |
regions_output = gr.JSON(label="Detected Regions", visible=False)
|
| 1030 |
|
|
|
|
| 1055 |
)
|
| 1056 |
|
| 1057 |
with gr.Tab("Embeddings"):
|
| 1058 |
+
gr.Markdown("""
|
| 1059 |
+
### Hidden State Analysis
|
| 1060 |
|
| 1061 |
+
Extract and visualize the model's internal representations (embeddings) from the transformer layers. The **State-Dynamics** mode applies UMAP dimensionality reduction to project the 768-dimensional embeddings into 2D/3D space, then performs agglomerative clustering to identify regions with similar activation patterns.
|
| 1062 |
|
| 1063 |
+
**Biological interpretation**: In CRISPR arrays, repeat sequences share conserved motifs and should cluster together, while unique spacer sequences form distinct clusters. This creates a characteristic alternating pattern in the sequence map visualization.
|
| 1064 |
""")
|
| 1065 |
with gr.Row():
|
| 1066 |
with gr.Column(scale=1):
|
| 1067 |
embed_seq = gr.Textbox(
|
| 1068 |
+
label="Input Sequence",
|
| 1069 |
placeholder="Paste DNA sequence...",
|
| 1070 |
lines=6,
|
| 1071 |
+
value=EMBEDDING_CRISPR_EXAMPLE,
|
| 1072 |
+
info="Longer sequences (>2000 bp) provide better clustering resolution."
|
| 1073 |
)
|
| 1074 |
embed_mode = gr.Radio(
|
| 1075 |
choices=["state-dynamics", "mean", "max", "trajectory"],
|
| 1076 |
value="state-dynamics",
|
| 1077 |
label="Visualization Mode",
|
| 1078 |
+
info="state-dynamics: UMAP clustering | mean/max: pooled embedding | trajectory: per-window heatmap"
|
| 1079 |
)
|
| 1080 |
use_3d = gr.Checkbox(
|
| 1081 |
+
label="3D UMAP Projection",
|
| 1082 |
value=False,
|
| 1083 |
+
info="Project embeddings to 3D space (interactive rotation)",
|
| 1084 |
visible=True
|
| 1085 |
)
|
| 1086 |
with gr.Row():
|
| 1087 |
+
embed_btn = gr.Button("Extract Embeddings", variant="primary")
|
| 1088 |
with gr.Row():
|
| 1089 |
+
gr.Button("CRISPR Example (3kb)").click(
|
| 1090 |
lambda: EMBEDDING_CRISPR_EXAMPLE, outputs=embed_seq
|
| 1091 |
)
|
| 1092 |
+
gr.Button("Control Sequence (3kb)").click(
|
| 1093 |
lambda: EMBEDDING_RANDOM_EXAMPLE, outputs=embed_seq
|
| 1094 |
)
|
| 1095 |
gr.Markdown("""
|
| 1096 |
+
**Example structure:** 600 bp upstream | CRISPR array (25 repeats + 24 spacers) | 600 bp downstream
|
|
|
|
|
|
|
|
|
|
| 1097 |
""")
|
| 1098 |
embed_summary = gr.Markdown()
|
| 1099 |
+
with gr.Accordion("Export Results", open=False, visible=False) as embed_download_accordion:
|
| 1100 |
with gr.Row():
|
| 1101 |
download_png = gr.File(label="PNG", interactive=False)
|
| 1102 |
download_pdf = gr.File(label="PDF", interactive=False)
|
| 1103 |
with gr.Column(scale=2):
|
| 1104 |
+
embed_plot = gr.Plot(label="Embedding Visualization")
|
| 1105 |
|
| 1106 |
# Show/hide 3D checkbox based on mode
|
| 1107 |
embed_mode.change(
|
|
|
|
| 1120 |
outputs=[embed_plot, embed_summary, download_png, download_pdf, embed_download_accordion]
|
| 1121 |
)
|
| 1122 |
|
| 1123 |
+
with gr.Tab("API"):
|
| 1124 |
+
gr.Markdown("""
|
| 1125 |
+
### Programmatic Access
|
| 1126 |
+
|
| 1127 |
+
This tool can be accessed programmatically using the Gradio Python client or via HTTP requests.
|
| 1128 |
+
|
| 1129 |
+
#### Python Client
|
| 1130 |
+
|
| 1131 |
+
```python
|
| 1132 |
+
from gradio_client import Client
|
| 1133 |
+
|
| 1134 |
+
# Connect to the API
|
| 1135 |
+
client = Client("genomenet/crispr-array-detection")
|
| 1136 |
+
|
| 1137 |
+
# Run prediction
|
| 1138 |
+
result = client.predict(
|
| 1139 |
+
sequence="ATGC...", # DNA sequence (min 1000 bp)
|
| 1140 |
+
stride=100, # Window stride in bp
|
| 1141 |
+
threshold=0.3, # Detection threshold
|
| 1142 |
+
api_name="/predict"
|
| 1143 |
+
)
|
| 1144 |
+
|
| 1145 |
+
# result contains: (plot, summary, regions, png_path, pdf_path, csv_path, summary_path, gff_path, seq_viewer_html)
|
| 1146 |
+
```
|
| 1147 |
+
|
| 1148 |
+
#### Extract Embeddings
|
| 1149 |
+
|
| 1150 |
+
```python
|
| 1151 |
+
result = client.predict(
|
| 1152 |
+
sequence="ATGC...",
|
| 1153 |
+
mode="state-dynamics", # or "mean", "max", "trajectory"
|
| 1154 |
+
use_3d=False,
|
| 1155 |
+
api_name="/get_embedding"
|
| 1156 |
+
)
|
| 1157 |
+
```
|
| 1158 |
+
|
| 1159 |
+
#### cURL Example
|
| 1160 |
+
|
| 1161 |
+
```bash
|
| 1162 |
+
curl -X POST "https://genomenet-crispr-array-detection.hf.space/api/predict" \\
|
| 1163 |
+
-H "Content-Type: application/json" \\
|
| 1164 |
+
-d '{"data": ["ATGCATGC...", 100, 0.3]}'
|
| 1165 |
+
```
|
| 1166 |
+
|
| 1167 |
+
#### Output Formats
|
| 1168 |
+
|
| 1169 |
+
| Format | Description |
|
| 1170 |
+
|--------|-------------|
|
| 1171 |
+
| CSV | Per-position scores: `position, probability, above_threshold` |
|
| 1172 |
+
| GFF3 | Standard genome annotation format for detected regions |
|
| 1173 |
+
| TXT | Human-readable summary with statistics |
|
| 1174 |
+
| PNG/PDF | Publication-ready figures |
|
| 1175 |
+
|
| 1176 |
+
#### Rate Limits
|
| 1177 |
+
|
| 1178 |
+
- Free tier: Standard HuggingFace rate limits apply
|
| 1179 |
+
- For high-throughput analysis, consider running the model locally
|
| 1180 |
+
|
| 1181 |
+
#### Local Installation
|
| 1182 |
+
|
| 1183 |
+
```bash
|
| 1184 |
+
git clone https://huggingface.co/spaces/genomenet/crispr-array-detection
|
| 1185 |
+
cd crispr-array-detection
|
| 1186 |
+
pip install -r requirements.txt
|
| 1187 |
+
python app.py
|
| 1188 |
+
```
|
| 1189 |
+
""")
|
| 1190 |
+
|
| 1191 |
with gr.Tab("About"):
|
| 1192 |
gr.Markdown("""
|
| 1193 |
+
### Model Architecture
|
| 1194 |
+
|
| 1195 |
+
| Component | Specification |
|
| 1196 |
+
|-----------|--------------|
|
| 1197 |
+
| Base model | BERT (Bidirectional Encoder Representations from Transformers) |
|
| 1198 |
+
| Layers | 24 transformer blocks |
|
| 1199 |
+
| Hidden size | 768 dimensions |
|
| 1200 |
+
| Attention heads | 12 |
|
| 1201 |
+
| Parameters | ~430 million |
|
| 1202 |
+
| Classification head | Bottleneck architecture |
|
| 1203 |
+
|
| 1204 |
+
### Training
|
| 1205 |
+
|
| 1206 |
+
**Pre-training corpus**: Metagenomic contigs and complete microbial genomes from public databases.
|
| 1207 |
+
|
| 1208 |
+
**Fine-tuning data**: Annotated CRISPR arrays from bacterial and archaeal genomes, including positive examples from CRISPRCasdb and negative examples from non-CRISPR genomic regions.
|
| 1209 |
+
|
| 1210 |
+
**Embedding extraction**: Hidden states are extracted from transformer layer 21 (768 dimensions per position).
|
| 1211 |
|
| 1212 |
+
### Parameters
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1213 |
|
| 1214 |
+
| Parameter | Range | Default | Description |
|
| 1215 |
+
|-----------|-------|---------|-------------|
|
| 1216 |
+
| Stride | 50-500 bp | 100 bp | Step size between windows. Lower = higher resolution, more computation |
|
| 1217 |
+
| Threshold | 0.1-0.9 | 0.3 | Detection cutoff. Lower = more sensitive, higher = more specific |
|
| 1218 |
+
| Window size | Fixed | 1000 bp | Input window for the transformer model |
|
| 1219 |
|
| 1220 |
+
### Performance Considerations
|
|
|
|
|
|
|
|
|
|
| 1221 |
|
| 1222 |
+
- **GPU recommended**: T4 or better for interactive use
|
| 1223 |
+
- **CPU inference**: Functional but slower (~10-30s per analysis)
|
| 1224 |
+
- **Memory**: ~2GB GPU memory required
|
| 1225 |
|
| 1226 |
+
### Citation
|
|
|
|
|
|
|
| 1227 |
|
| 1228 |
+
If you use this tool in your research, please cite:
|
| 1229 |
|
| 1230 |
+
> Mu, Z. (2024). Deep Learning-Based CRISPR Array Detection. Master's Thesis, Helmholtz Centre for Infection Research.
|
| 1231 |
|
| 1232 |
+
### Acknowledgements
|
| 1233 |
|
| 1234 |
+
- Ziyu Mu - Model development (Master's Thesis, HZI BIFO)
|
| 1235 |
+
- DFG SPP 2141 "Much more than Defence" (Project MC 172)
|
| 1236 |
+
- BMBF de.NBI / GenomeNet
|
| 1237 |
+
- Helmholtz Centre for Infection Research (HZI)
|
| 1238 |
""")
|
| 1239 |
|
| 1240 |
|