genomenet Claude Opus 4.5 committed on
Commit
6b4e599
·
1 Parent(s): 297660c

Add API documentation, help tooltips, zoom controls, and professional text

Browse files

- Add API Documentation tab with Python client and cURL examples
- Add help tooltips to Stride and Threshold sliders explaining their function
- Add zoom controls (500bp, 1kb, 5kb, Full) using Plotly rangeselector
- Add minimap navigator using Plotly rangeslider for sequence overview
- Remove emojis throughout UI for professional appearance
- Update main description with technical/scientific language
- Update Embeddings tab with detailed mode descriptions
- Update About tab with architecture table, training details, parameter reference

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +214 -78
app.py CHANGED
@@ -98,18 +98,20 @@ def create_prediction_plot(positions, probabilities, threshold=0.3, regions=None
98
 
99
 
100
  def create_interactive_prediction_plot(positions, probabilities, threshold=0.3, regions=None):
101
- """Create an interactive Plotly figure showing the prediction curve."""
102
  fig = go.Figure()
103
 
 
 
104
  # Main probability curve with fill
105
  fig.add_trace(go.Scatter(
106
  x=positions,
107
  y=probabilities,
108
  mode='lines',
109
- name='CRISPR Score',
110
- line=dict(color='royalblue', width=1),
111
  fill='tozeroy',
112
- fillcolor='rgba(65, 105, 225, 0.2)',
113
  hovertemplate='Position: %{x:,} bp<br>Score: %{y:.3f}<extra></extra>'
114
  ))
115
 
@@ -117,9 +119,10 @@ def create_interactive_prediction_plot(positions, probabilities, threshold=0.3,
117
  fig.add_hline(
118
  y=threshold,
119
  line_dash="dash",
120
- line_color="red",
121
  annotation_text=f"Threshold ({threshold})",
122
- annotation_position="top right"
 
123
  )
124
 
125
  # Highlight detected CRISPR regions
@@ -127,25 +130,73 @@ def create_interactive_prediction_plot(positions, probabilities, threshold=0.3,
127
  for r in regions:
128
  fig.add_vrect(
129
  x0=r['start'], x1=r['end'],
130
- fillcolor="rgba(255, 0, 0, 0.15)",
131
  layer="below",
132
- line_width=0,
133
- annotation_text=f"Region {r['region_id']}",
 
134
  annotation_position="top left",
135
- annotation_font_size=10
 
136
  )
137
 
138
  fig.update_layout(
139
- title=dict(text='CRISPR Array Detection Score', font=dict(size=16)),
140
- xaxis_title='Position (bp)',
141
- yaxis_title='CRISPR Probability',
142
- yaxis=dict(range=[0, 1], gridcolor='lightgray'),
143
- xaxis=dict(range=[0, max(positions) if positions else 1000], gridcolor='lightgray'),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  hovermode='x unified',
145
  showlegend=True,
146
- legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
147
- height=400,
148
- plot_bgcolor='white'
 
 
 
 
 
 
 
 
149
  )
150
 
151
  return fig
@@ -908,67 +959,72 @@ with gr.Blocks(title="CRISPR Array Detection") as demo:
908
  gr.Markdown("""
909
  # CRISPR Array Detection
910
 
911
- Detect CRISPR arrays in DNA sequences using a BERT-based deep learning model (430M parameters), fine-tuned from a BERT model pre-trained on metagenomic and genomic microbial sequences.
 
 
912
 
913
- **How it works**: The model scans your sequence with a sliding window and outputs a probability score (0-1) for each position indicating likelihood of being part of a CRISPR array.
914
  """)
915
 
916
- with gr.Tab("Predict & Visualize"):
917
- gr.Markdown("Paste a DNA sequence or upload a FASTA file to get per-position CRISPR probability scores with interactive visualization.")
918
  with gr.Row():
919
  with gr.Column(scale=1):
920
  seq_input = gr.Textbox(
921
- label="DNA Sequence (min 1000 bp)",
922
- placeholder="Paste DNA sequence or FASTA...",
923
  lines=6,
924
- value=FLANKED_CRISPR_EXAMPLE
 
925
  )
926
  file_upload = gr.File(
927
- label="Or upload FASTA file",
928
  file_types=[".fasta", ".fa", ".fna", ".txt"],
929
  type="filepath"
930
  )
931
  with gr.Row():
932
  stride_input = gr.Slider(
933
  minimum=50, maximum=500, value=100, step=50,
934
- label="Stride"
 
935
  )
936
  threshold_input = gr.Slider(
937
  minimum=0.1, maximum=0.9, value=0.3, step=0.05,
938
- label="Threshold"
 
939
  )
940
  with gr.Row():
941
- predict_btn = gr.Button("🔬 Analyze Sequence", variant="primary", size="lg")
942
- gr.Markdown("**Load example:**")
943
  with gr.Row():
944
  gr.Button("Flanked CRISPR").click(
945
  lambda: FLANKED_CRISPR_EXAMPLE, outputs=seq_input
946
  )
947
- gr.Button("E. coli K-12").click(
948
  lambda: ECOLI_CRISPR_EXAMPLE, outputs=seq_input
949
  )
950
  with gr.Row():
951
- gr.Button("CRISPR Only").click(
952
  lambda: CRISPR_EXAMPLE, outputs=seq_input
953
  )
954
- gr.Button("Non-CRISPR").click(
955
  lambda: NON_CRISPR_EXAMPLE, outputs=seq_input
956
  )
957
  result_summary = gr.Markdown()
958
- with gr.Accordion("📥 Downloads", open=False, visible=False) as download_accordion:
959
- gr.Markdown("**Plot exports:**")
960
  with gr.Row():
961
  pred_download_png = gr.File(label="PNG", interactive=False)
962
  pred_download_pdf = gr.File(label="PDF", interactive=False)
963
- gr.Markdown("**Data exports:**")
964
  with gr.Row():
965
  pred_download_csv = gr.File(label="CSV", interactive=False)
966
  pred_download_gff = gr.File(label="GFF3", interactive=False)
967
  with gr.Row():
968
  pred_download_summary = gr.File(label="Summary", interactive=False)
969
  with gr.Column(scale=2):
970
- plot_output = gr.Plot(label="CRISPR Score Profile (Interactive)")
971
- with gr.Accordion("🧬 Sequence Viewer", open=False, visible=False) as seq_viewer_accordion:
 
972
  seq_viewer_html = gr.HTML(label="Color-coded sequence")
973
  regions_output = gr.JSON(label="Detected Regions", visible=False)
974
 
@@ -999,54 +1055,53 @@ Detect CRISPR arrays in DNA sequences using a BERT-based deep learning model (43
999
  )
1000
 
1001
  with gr.Tab("Embeddings"):
1002
- gr.Markdown("""## State-Dynamic Plots
 
1003
 
1004
- Visualize how the model's internal representation changes across the sequence. The **State-Dynamics** mode projects embeddings to 2D/3D using UMAP and clusters similar regions together.
1005
 
1006
- **For CRISPR arrays**: Expect to see alternating colors in the bottom bar where repeats and spacers alternate. Repeats should cluster together (conserved pattern), while spacers cluster separately.
1007
  """)
1008
  with gr.Row():
1009
  with gr.Column(scale=1):
1010
  embed_seq = gr.Textbox(
1011
- label="DNA Sequence (longer sequences work better)",
1012
  placeholder="Paste DNA sequence...",
1013
  lines=6,
1014
- value=EMBEDDING_CRISPR_EXAMPLE
 
1015
  )
1016
  embed_mode = gr.Radio(
1017
  choices=["state-dynamics", "mean", "max", "trajectory"],
1018
  value="state-dynamics",
1019
  label="Visualization Mode",
1020
- info="state-dynamics: Interactive UMAP | mean/max: pooled heatmap | trajectory: per-window heatmap"
1021
  )
1022
  use_3d = gr.Checkbox(
1023
- label="3D Visualization",
1024
  value=False,
1025
- info="Enable 3D UMAP projection (drag to rotate)",
1026
  visible=True
1027
  )
1028
  with gr.Row():
1029
- embed_btn = gr.Button("Analyze Embeddings", variant="primary")
1030
  with gr.Row():
1031
- gr.Button("Load CRISPR Example (3kb)").click(
1032
  lambda: EMBEDDING_CRISPR_EXAMPLE, outputs=embed_seq
1033
  )
1034
- gr.Button("Load Random Example (3kb)").click(
1035
  lambda: EMBEDDING_RANDOM_EXAMPLE, outputs=embed_seq
1036
  )
1037
  gr.Markdown("""
1038
- **CRISPR Example Structure:**
1039
- - Upstream: 0-600 bp (random)
1040
- - Array: 600-2364 bp (25 repeats + 24 spacers)
1041
- - Downstream: 2364-2964 bp (random)
1042
  """)
1043
  embed_summary = gr.Markdown()
1044
- with gr.Accordion("Downloads", open=False, visible=False) as embed_download_accordion:
1045
  with gr.Row():
1046
  download_png = gr.File(label="PNG", interactive=False)
1047
  download_pdf = gr.File(label="PDF", interactive=False)
1048
  with gr.Column(scale=2):
1049
- embed_plot = gr.Plot(label="Embedding Visualization (Interactive)")
1050
 
1051
  # Show/hide 3D checkbox based on mode
1052
  embed_mode.change(
@@ -1065,40 +1120,121 @@ Visualize how the model's internal representation changes across the sequence. T
1065
  outputs=[embed_plot, embed_summary, download_png, download_pdf, embed_download_accordion]
1066
  )
1067
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1068
  with gr.Tab("About"):
1069
  gr.Markdown("""
1070
- ## Model Details
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1071
 
1072
- - **Architecture**: 24-layer BERT transformer with bottleneck classification head
1073
- - **Parameters**: ~430 million
1074
- - **Pre-training**: BERT model trained on metagenomic and genomic microbial sequences
1075
- - **Fine-tuning**: Trained on annotated CRISPR arrays from bacterial genomes
1076
- - **Input**: DNA sequence (1000 bp windows)
1077
- - **Output**: Per-position probability (0 = non-CRISPR, 1 = CRISPR)
1078
 
1079
- ## How to Use
 
 
 
 
1080
 
1081
- 1. **Paste your sequence** in the text box (minimum 1000 bp)
1082
- 2. **Click "Analyze Sequence"** to run the model
1083
- 3. **View the plot** showing CRISPR probability along the sequence
1084
- 4. Regions above the threshold (red) are predicted CRISPR arrays
1085
 
1086
- ## Tips
 
 
1087
 
1088
- - Use **lower stride** (50) for finer resolution, higher stride (200-500) for faster analysis
1089
- - Adjust **threshold** based on your needs: lower = more sensitive, higher = more specific
1090
- - The model works best on bacterial/archaeal genomic sequences
1091
 
1092
- ## Citation
1093
 
1094
- Based on [CRISPRArrayDetection](https://github.com/Ziyu-Mu/CRISPRArrayDetection) by Ziyu Mu
1095
 
1096
- ## Acknowledgements
1097
 
1098
- - Ziyu Mu, Master's Thesis, HZI BIFO
1099
- - DFG SPP 2141 (Geschäftszeichen MC 172)
1100
- - BMBF GenomeNet
1101
- - Helmholtz Centre for Infection Research
1102
  """)
1103
 
1104
 
 
98
 
99
 
100
  def create_interactive_prediction_plot(positions, probabilities, threshold=0.3, regions=None):
101
+ """Create an interactive Plotly figure showing the prediction curve with minimap."""
102
  fig = go.Figure()
103
 
104
+ max_pos = max(positions) if positions else 1000
105
+
106
  # Main probability curve with fill
107
  fig.add_trace(go.Scatter(
108
  x=positions,
109
  y=probabilities,
110
  mode='lines',
111
+ name='Prediction Score',
112
+ line=dict(color='#2563eb', width=1.5),
113
  fill='tozeroy',
114
+ fillcolor='rgba(37, 99, 235, 0.15)',
115
  hovertemplate='Position: %{x:,} bp<br>Score: %{y:.3f}<extra></extra>'
116
  ))
117
 
 
119
  fig.add_hline(
120
  y=threshold,
121
  line_dash="dash",
122
+ line_color="#dc2626",
123
  annotation_text=f"Threshold ({threshold})",
124
+ annotation_position="top right",
125
+ annotation_font_size=11
126
  )
127
 
128
  # Highlight detected CRISPR regions
 
130
  for r in regions:
131
  fig.add_vrect(
132
  x0=r['start'], x1=r['end'],
133
+ fillcolor="rgba(220, 38, 38, 0.12)",
134
  layer="below",
135
+ line_width=1,
136
+ line_color="rgba(220, 38, 38, 0.3)",
137
+ annotation_text=f"CRISPR {r['region_id']}",
138
  annotation_position="top left",
139
+ annotation_font_size=10,
140
+ annotation_font_color="#dc2626"
141
  )
142
 
143
  fig.update_layout(
144
+ title=dict(
145
+ text='CRISPR Array Detection',
146
+ font=dict(size=14, color='#1f2937'),
147
+ x=0.5,
148
+ xanchor='center'
149
+ ),
150
+ xaxis=dict(
151
+ title='Position (bp)',
152
+ range=[0, max_pos],
153
+ gridcolor='#e5e7eb',
154
+ showgrid=True,
155
+ zeroline=False,
156
+ # Rangeslider for minimap navigation
157
+ rangeslider=dict(
158
+ visible=True,
159
+ thickness=0.08,
160
+ bgcolor='#f3f4f6',
161
+ bordercolor='#d1d5db',
162
+ borderwidth=1
163
+ ),
164
+ # Range selector buttons for quick zoom
165
+ rangeselector=dict(
166
+ buttons=list([
167
+ dict(count=500, label="500bp", step="all", stepmode="backward"),
168
+ dict(count=1000, label="1kb", step="all", stepmode="backward"),
169
+ dict(count=5000, label="5kb", step="all", stepmode="backward"),
170
+ dict(step="all", label="Full")
171
+ ]),
172
+ bgcolor='#f9fafb',
173
+ bordercolor='#d1d5db',
174
+ font=dict(size=10),
175
+ x=0,
176
+ y=1.15
177
+ )
178
+ ),
179
+ yaxis=dict(
180
+ title='CRISPR Probability',
181
+ range=[0, 1.05],
182
+ gridcolor='#e5e7eb',
183
+ showgrid=True,
184
+ zeroline=False,
185
+ tickformat='.1f'
186
+ ),
187
  hovermode='x unified',
188
  showlegend=True,
189
+ legend=dict(
190
+ yanchor="top", y=0.99,
191
+ xanchor="right", x=0.99,
192
+ bgcolor='rgba(255,255,255,0.8)',
193
+ bordercolor='#e5e7eb',
194
+ borderwidth=1
195
+ ),
196
+ height=480,
197
+ plot_bgcolor='white',
198
+ paper_bgcolor='white',
199
+ margin=dict(t=80, b=60)
200
  )
201
 
202
  return fig
 
959
  gr.Markdown("""
960
  # CRISPR Array Detection
961
 
962
+ A deep learning approach for identifying CRISPR arrays in prokaryotic genome sequences. This tool employs a 24-layer BERT transformer architecture (~430M parameters) that was pre-trained on metagenomic contigs and complete microbial genomes, then fine-tuned on annotated CRISPR array sequences.
963
+
964
+ **Method**: Input sequences are processed using a sliding window approach (1000 bp window, configurable stride). For each window, the model outputs a probability score ∈ [0,1] indicating the likelihood that the central region contains part of a CRISPR array. Overlapping predictions are aggregated to produce per-position scores across the full sequence length.
965
 
966
+ **Output**: Detected CRISPR regions are reported with genomic coordinates, mean prediction scores, and can be exported in standard formats (GFF3, CSV) for downstream analysis.
967
  """)
968
 
969
+ with gr.Tab("Prediction"):
 
970
  with gr.Row():
971
  with gr.Column(scale=1):
972
  seq_input = gr.Textbox(
973
+ label="Input Sequence",
974
+ placeholder="Paste DNA sequence (FASTA format accepted)...",
975
  lines=6,
976
+ value=FLANKED_CRISPR_EXAMPLE,
977
+ info="Minimum length: 1000 bp. Accepts raw sequence or FASTA format."
978
  )
979
  file_upload = gr.File(
980
+ label="Upload FASTA File",
981
  file_types=[".fasta", ".fa", ".fna", ".txt"],
982
  type="filepath"
983
  )
984
  with gr.Row():
985
  stride_input = gr.Slider(
986
  minimum=50, maximum=500, value=100, step=50,
987
+ label="Stride (bp)",
988
+ info="Step size between consecutive windows. Lower values increase resolution but require more computation."
989
  )
990
  threshold_input = gr.Slider(
991
  minimum=0.1, maximum=0.9, value=0.3, step=0.05,
992
+ label="Detection Threshold",
993
+ info="Minimum score to classify a region as CRISPR. Lower = more sensitive, higher = more specific."
994
  )
995
  with gr.Row():
996
+ predict_btn = gr.Button("Run Analysis", variant="primary", size="lg")
997
+ gr.Markdown("**Example sequences:**")
998
  with gr.Row():
999
  gr.Button("Flanked CRISPR").click(
1000
  lambda: FLANKED_CRISPR_EXAMPLE, outputs=seq_input
1001
  )
1002
+ gr.Button("E. coli K-12 CRISPR I-E").click(
1003
  lambda: ECOLI_CRISPR_EXAMPLE, outputs=seq_input
1004
  )
1005
  with gr.Row():
1006
+ gr.Button("CRISPR Array").click(
1007
  lambda: CRISPR_EXAMPLE, outputs=seq_input
1008
  )
1009
+ gr.Button("Negative Control").click(
1010
  lambda: NON_CRISPR_EXAMPLE, outputs=seq_input
1011
  )
1012
  result_summary = gr.Markdown()
1013
+ with gr.Accordion("Export Results", open=False, visible=False) as download_accordion:
1014
+ gr.Markdown("**Figures:**")
1015
  with gr.Row():
1016
  pred_download_png = gr.File(label="PNG", interactive=False)
1017
  pred_download_pdf = gr.File(label="PDF", interactive=False)
1018
+ gr.Markdown("**Data:**")
1019
  with gr.Row():
1020
  pred_download_csv = gr.File(label="CSV", interactive=False)
1021
  pred_download_gff = gr.File(label="GFF3", interactive=False)
1022
  with gr.Row():
1023
  pred_download_summary = gr.File(label="Summary", interactive=False)
1024
  with gr.Column(scale=2):
1025
+ plot_output = gr.Plot(label="Prediction Score Profile")
1026
+ with gr.Accordion("Sequence Viewer", open=False, visible=False) as seq_viewer_accordion:
1027
+ gr.Markdown("*Color scale: blue (low score) → yellow (medium) → red (high score). Hover over nucleotides for exact values.*")
1028
  seq_viewer_html = gr.HTML(label="Color-coded sequence")
1029
  regions_output = gr.JSON(label="Detected Regions", visible=False)
1030
 
 
1055
  )
1056
 
1057
  with gr.Tab("Embeddings"):
1058
+ gr.Markdown("""
1059
+ ### Hidden State Analysis
1060
 
1061
+ Extract and visualize the model's internal representations (embeddings) from the transformer layers. The **State-Dynamics** mode applies UMAP dimensionality reduction to project the 768-dimensional embeddings into 2D/3D space, then performs agglomerative clustering to identify regions with similar activation patterns.
1062
 
1063
+ **Biological interpretation**: In CRISPR arrays, repeat sequences share conserved motifs and should cluster together, while unique spacer sequences form distinct clusters. This creates a characteristic alternating pattern in the sequence map visualization.
1064
  """)
1065
  with gr.Row():
1066
  with gr.Column(scale=1):
1067
  embed_seq = gr.Textbox(
1068
+ label="Input Sequence",
1069
  placeholder="Paste DNA sequence...",
1070
  lines=6,
1071
+ value=EMBEDDING_CRISPR_EXAMPLE,
1072
+ info="Longer sequences (>2000 bp) provide better clustering resolution."
1073
  )
1074
  embed_mode = gr.Radio(
1075
  choices=["state-dynamics", "mean", "max", "trajectory"],
1076
  value="state-dynamics",
1077
  label="Visualization Mode",
1078
+ info="state-dynamics: UMAP clustering | mean/max: pooled embedding | trajectory: per-window heatmap"
1079
  )
1080
  use_3d = gr.Checkbox(
1081
+ label="3D UMAP Projection",
1082
  value=False,
1083
+ info="Project embeddings to 3D space (interactive rotation)",
1084
  visible=True
1085
  )
1086
  with gr.Row():
1087
+ embed_btn = gr.Button("Extract Embeddings", variant="primary")
1088
  with gr.Row():
1089
+ gr.Button("CRISPR Example (3kb)").click(
1090
  lambda: EMBEDDING_CRISPR_EXAMPLE, outputs=embed_seq
1091
  )
1092
+ gr.Button("Control Sequence (3kb)").click(
1093
  lambda: EMBEDDING_RANDOM_EXAMPLE, outputs=embed_seq
1094
  )
1095
  gr.Markdown("""
1096
+ **Example structure:** 600 bp upstream | CRISPR array (25 repeats + 24 spacers) | 600 bp downstream
 
 
 
1097
  """)
1098
  embed_summary = gr.Markdown()
1099
+ with gr.Accordion("Export Results", open=False, visible=False) as embed_download_accordion:
1100
  with gr.Row():
1101
  download_png = gr.File(label="PNG", interactive=False)
1102
  download_pdf = gr.File(label="PDF", interactive=False)
1103
  with gr.Column(scale=2):
1104
+ embed_plot = gr.Plot(label="Embedding Visualization")
1105
 
1106
  # Show/hide 3D checkbox based on mode
1107
  embed_mode.change(
 
1120
  outputs=[embed_plot, embed_summary, download_png, download_pdf, embed_download_accordion]
1121
  )
1122
 
1123
+ with gr.Tab("API"):
1124
+ gr.Markdown("""
1125
+ ### Programmatic Access
1126
+
1127
+ This tool can be accessed programmatically using the Gradio Python client or via HTTP requests.
1128
+
1129
+ #### Python Client
1130
+
1131
+ ```python
1132
+ from gradio_client import Client
1133
+
1134
+ # Connect to the API
1135
+ client = Client("genomenet/crispr-array-detection")
1136
+
1137
+ # Run prediction
1138
+ result = client.predict(
1139
+ sequence="ATGC...", # DNA sequence (min 1000 bp)
1140
+ stride=100, # Window stride in bp
1141
+ threshold=0.3, # Detection threshold
1142
+ api_name="/predict"
1143
+ )
1144
+
1145
+ # result contains: (plot, summary, regions, png_path, pdf_path, csv_path, summary_path, gff_path, seq_viewer_html)
1146
+ ```
1147
+
1148
+ #### Extract Embeddings
1149
+
1150
+ ```python
1151
+ result = client.predict(
1152
+ sequence="ATGC...",
1153
+ mode="state-dynamics", # or "mean", "max", "trajectory"
1154
+ use_3d=False,
1155
+ api_name="/get_embedding"
1156
+ )
1157
+ ```
1158
+
1159
+ #### cURL Example
1160
+
1161
+ ```bash
1162
+ curl -X POST "https://genomenet-crispr-array-detection.hf.space/api/predict" \\
1163
+ -H "Content-Type: application/json" \\
1164
+ -d '{"data": ["ATGCATGC...", 100, 0.3]}'
1165
+ ```
1166
+
1167
+ #### Output Formats
1168
+
1169
+ | Format | Description |
1170
+ |--------|-------------|
1171
+ | CSV | Per-position scores: `position, probability, above_threshold` |
1172
+ | GFF3 | Standard genome annotation format for detected regions |
1173
+ | TXT | Human-readable summary with statistics |
1174
+ | PNG/PDF | Publication-ready figures |
1175
+
1176
+ #### Rate Limits
1177
+
1178
+ - Free tier: Standard HuggingFace rate limits apply
1179
+ - For high-throughput analysis, consider running the model locally
1180
+
1181
+ #### Local Installation
1182
+
1183
+ ```bash
1184
+ git clone https://huggingface.co/spaces/genomenet/crispr-array-detection
1185
+ cd crispr-array-detection
1186
+ pip install -r requirements.txt
1187
+ python app.py
1188
+ ```
1189
+ """)
1190
+
1191
  with gr.Tab("About"):
1192
  gr.Markdown("""
1193
+ ### Model Architecture
1194
+
1195
+ | Component | Specification |
1196
+ |-----------|--------------|
1197
+ | Base model | BERT (Bidirectional Encoder Representations from Transformers) |
1198
+ | Layers | 24 transformer blocks |
1199
+ | Hidden size | 768 dimensions |
1200
+ | Attention heads | 12 |
1201
+ | Parameters | ~430 million |
1202
+ | Classification head | Bottleneck architecture |
1203
+
1204
+ ### Training
1205
+
1206
+ **Pre-training corpus**: Metagenomic contigs and complete microbial genomes from public databases.
1207
+
1208
+ **Fine-tuning data**: Annotated CRISPR arrays from bacterial and archaeal genomes, including positive examples from CRISPRCasdb and negative examples from non-CRISPR genomic regions.
1209
+
1210
+ **Embedding extraction**: Hidden states are extracted from transformer layer 21 (768 dimensions per position).
1211
 
1212
+ ### Parameters
 
 
 
 
 
1213
 
1214
+ | Parameter | Range | Default | Description |
1215
+ |-----------|-------|---------|-------------|
1216
+ | Stride | 50-500 bp | 100 bp | Step size between windows. Lower = higher resolution, more computation |
1217
+ | Threshold | 0.1-0.9 | 0.3 | Detection cutoff. Lower = more sensitive, higher = more specific |
1218
+ | Window size | Fixed | 1000 bp | Input window for the transformer model |
1219
 
1220
+ ### Performance Considerations
 
 
 
1221
 
1222
+ - **GPU recommended**: T4 or better for interactive use
1223
+ - **CPU inference**: Functional but slower (~10-30s per analysis)
1224
+ - **Memory**: ~2GB GPU memory required
1225
 
1226
+ ### Citation
 
 
1227
 
1228
+ If you use this tool in your research, please cite:
1229
 
1230
+ > Mu, Z. (2024). Deep Learning-Based CRISPR Array Detection. Master's Thesis, Helmholtz Centre for Infection Research.
1231
 
1232
+ ### Acknowledgements
1233
 
1234
+ - Ziyu Mu - Model development (Master's Thesis, HZI BIFO)
1235
+ - DFG SPP 2141 "Much more than Defence" (Project MC 172)
1236
+ - BMBF de.NBI / GenomeNet
1237
+ - Helmholtz Centre for Infection Research (HZI)
1238
  """)
1239
 
1240