bernardo-de-almeida committed on
Commit
42f0385
·
1 Parent(s): 680e6ef

feat: add new tab

Browse files
README.md CHANGED
@@ -3,7 +3,7 @@ title: NTv3 — Foundation Models for Long-Range Genomics
3
  emoji: 🧬
4
  colorFrom: indigo
5
  colorTo: blue
6
- sdk: static
7
  pinned: false
8
  ---
9
 
 
3
  emoji: 🧬
4
  colorFrom: indigo
5
  colorTo: blue
6
+ sdk: gradio
7
  pinned: false
8
  ---
9
 
app.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Main Gradio app entry point for NTv3 Space.
3
+ This file imports the track prediction demo from app_tracks.py.
4
+ """
5
from app_tracks import demo_interface

# Re-export the interface under the name `demo`; per the original author's note,
# Hugging Face Spaces with the Gradio SDK requires the app object to be called `demo`.
demo = demo_interface

# Launch only when executed as a script, binding to all network interfaces
# without creating a public share link.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", share=False)
app_tracks.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio app for NTv3 track prediction demo.
3
+ This module contains the interactive track prediction interface.
4
+ """
5
+ import gradio as gr
6
+ import torch
7
+ from transformers import pipeline
8
+ import os
9
+
10
# Module-level cache for the pipeline; stays None until load_pipeline() builds it.
ntv3_tracks = None


def load_pipeline():
    """Return the shared ``ntv3-tracks`` pipeline, building it on the first call.

    The pipeline object is stored in the module-level ``ntv3_tracks`` variable,
    so later calls return the same object without constructing it again.
    """
    global ntv3_tracks

    # Fast path: already built by an earlier call.
    if ntv3_tracks is not None:
        return ntv3_tracks

    # Device index 0 when CUDA is available, otherwise -1.
    device_index = 0 if torch.cuda.is_available() else -1
    ntv3_tracks = pipeline(
        "ntv3-tracks",
        model="InstaDeepAI/NTv3_650M_pos",
        trust_remote_code=True,
        device=device_index,
    )
    return ntv3_tracks
25
+
26
def _is_blank(value):
    """Return True when *value* is None or an empty/whitespace-only string."""
    return value is None or (isinstance(value, str) and not value.strip())


def _format_prediction(out):
    """Build the success message from the shapes of the pipeline output."""
    bigwig = out.bigwig_tracks_logits
    bed = out.bed_tracks_logits
    lines = [
        "✅ Prediction completed successfully!",
        "",
        "📊 Output Shapes:",
        f"• BigWig tracks logits: {tuple(bigwig.shape)}",
        f"→ {bigwig.shape[1]} functional tracks over the center region",
        "",
        f"• BED tracks logits: {tuple(bed.shape)}",
        f"→ {bed.shape[1]} genomic elements over the center region",
        "",
        f"• Language model logits: {tuple(out.mlm_logits.shape)}",
        "→ MLM predictions for the entire sequence",
        "",
        "📝 Note: Predictions are made over 37.5% of the center region of the input sequence.",
        "",  # keeps the trailing newline of the original message
    ]
    return "\n".join(lines)


def predict_tracks(chrom, start, end, species):
    """Run track prediction on the specified genomic region.

    Args:
        chrom: Chromosome name, e.g. ``"chr19"``. Surrounding whitespace is
            stripped before it is passed to the pipeline.
        start: Start position in base pairs; any value ``int()`` accepts.
            ``0`` is a valid position.
        end: End position in base pairs; any value ``int()`` accepts.
        species: Species name; lower-cased before it is passed to the pipeline.

    Returns:
        A display string: a "✅" summary of the output tensor shapes on
        success, or a "❌" message describing the validation failure or the
        exception that was raised. This function never raises.
    """
    try:
        # Test for *missing* values explicitly. A plain truthiness check
        # (`not start`) would reject the legitimate coordinate 0.
        if any(_is_blank(field) for field in (chrom, start, end, species)):
            return "❌ Please fill in all fields."

        if isinstance(chrom, str):
            chrom = chrom.strip()

        start = int(start)
        end = int(end)

        if start < 0:
            return "❌ Start position must be non-negative."

        if start >= end:
            return "❌ Start position must be less than end position."

        if end - start > 1_000_000:
            return "❌ Region size cannot exceed 1 Mb (1,000,000 bp)."

        # Built lazily on first use, so the first request may be slow.
        pipe = load_pipeline()

        out = pipe({
            "chrom": chrom,
            "start": start,
            "end": end,
            "species": species.lower(),
        })

        return _format_prediction(out)

    except Exception as e:
        # UI boundary: show the failure in the results box instead of raising.
        return f"❌ Error: {str(e)}"
72
+
73
# Create the track prediction demo interface (embedded in HTML)
def create_demo_interface():
    """Assemble and return the Gradio Blocks UI for track prediction.

    The page consists of a header, a two-column row (region inputs and the run
    button on the left, a read-only results box on the right), a notes section
    and a set of clickable examples. The run button calls ``predict_tracks``
    with the four input values and writes its return value to the results box.
    """
    species_choices = [
        "human", "mouse", "rat", "chicken", "zebrafish",
        "fruitfly", "worm", "yeast", "arabidopsis", "rice",
        "maize", "soybean", "tomato", "potato", "grape",
        "poplar", "medicago", "lotus", "brachypodium", "sorghum",
        "barley", "wheat", "oats", "rye",
    ]
    example_rows = [
        ["chr19", 6_700_000, 6_831_072, "human"],
        ["chr1", 100_000, 200_000, "human"],
        ["chr2", 50_000, 150_000, "mouse"],
    ]

    with gr.Blocks(title="NTv3 Track Prediction Demo", theme=gr.themes.Soft()) as interface:
        gr.Markdown("""
        # 🧬 NTv3 Interactive Track Prediction Demo

        This demo allows you to run the NTv3 650M post-trained model to predict functional tracks and genomic elements for any genomic region.

        **Model:** `InstaDeepAI/NTv3_650M_pos`
        """)

        with gr.Row():
            # Left column: region inputs and the run button.
            with gr.Column():
                chrom_box = gr.Textbox(
                    label="Chromosome",
                    placeholder="e.g., chr19",
                    value="chr19",
                    info="Chromosome name (e.g., chr1, chr19)",
                )
                start_box = gr.Number(
                    label="Start Position",
                    placeholder="e.g., 6700000",
                    value=6_700_000,
                    info="Start position in base pairs",
                )
                end_box = gr.Number(
                    label="End Position",
                    placeholder="e.g., 6831072",
                    value=6_831_072,
                    info="End position in base pairs",
                )
                species_box = gr.Dropdown(
                    label="Species",
                    choices=species_choices,
                    value="human",
                    info="Select the species (24 supported species)",
                )
                run_button = gr.Button("🚀 Run Prediction", variant="primary")

            # Right column: read-only results.
            with gr.Column():
                results_box = gr.Textbox(
                    label="Results",
                    lines=15,
                    interactive=False,
                    placeholder="Results will appear here after running prediction...",
                )

        gr.Markdown("""
        ### 📝 Notes:
        - The model predicts ~7k functional tracks and 21 genomic elements
        - Predictions are made over 37.5% of the center region of the input sequence
        - Maximum region size: 1 Mb (1,000,000 base pairs)
        - First run may take longer as the model loads
        """)

        region_inputs = [chrom_box, start_box, end_box, species_box]

        run_button.click(
            fn=predict_tracks,
            inputs=region_inputs,
            outputs=results_box,
        )

        gr.Examples(
            examples=example_rows,
            inputs=region_inputs,
        )

    return interface
151
+
152
# Build the interface once at import time so that other modules (app.py imports
# `demo_interface` from here) can use it directly.
demo_interface = create_demo_interface()

# If running this file directly (for local testing), serve the interface on all
# network interfaces without creating a public share link.
if __name__ == "__main__":
    demo_interface.launch(server_name="0.0.0.0", share=False)
158
+
index.html CHANGED
@@ -199,9 +199,56 @@
199
  border-radius: 12px;
200
  }
201
  .footer { margin-top: 22px; color: var(--muted); font-size: 13px; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  @media (max-width: 860px) {
203
  .card { grid-column: span 12; }
204
  h1 { font-size: 28px; }
 
 
 
 
205
  }
206
  </style>
207
  </head>
@@ -223,215 +270,124 @@
223
  </div>
224
  </div>
225
 
226
- <div class="summary">
227
- <h2>📖 About NTv3</h2>
228
- <p>
229
- NTv3 is a multi-species genomic foundation model family that unifies representation learning, functional-track prediction, genome annotation, and controllable sequence generation within a single U-Net-style backbone. It models up to 1 Mb of DNA at single-base resolution, using a conv–Transformer–deconv architecture that efficiently captures both local motifs and long-range regulatory dependencies. NTv3 is first pretrained on ~9T base pairs from the OpenGenome2 corpus spanning >128k species using masked language modeling, and then post-trained with a joint objective on ~16k functional tracks and annotation labels across 24 animal and plant species, enabling state-of-the-art cross-species functional prediction and base-resolution genome annotation.
230
- </p>
231
- <p>
232
- Beyond prediction, NTv3 can be fine-tuned into a controllable generative model via masked-diffusion language modeling, allowing targeted design of regulatory sequences (for example, enhancers with specified activity and promoter selectivity) that have been validated experimentally.
233
- </p>
234
  </div>
235
 
236
- <div class="paper-summary">
237
- <!-- <h2>📄 A foundational model for joint sequence-function multi-species modeling at scale for long-range genomic prediction</h2> -->
238
- <img src="assets/paper_summary.png" alt="NTv3 Paper Summary" />
239
  </div>
240
 
241
- <div class="why-ntv3">
242
- <h2>✨ Why NTv3?</h2>
243
- <ul>
244
- <li>📏 <strong>1 Mb long context at nucleotide resolution</strong> — ~100× longer than typical genomics models.</li>
245
- <li>🏗️ <strong>Unified architecture</strong> for: masked language modeling, functional-track prediction, genome annotation, and sequence generation.</li>
246
- <li>🌍 <strong>Cross-species generalization</strong> across 24 animals + plants with a shared conditioned representation space.</li>
247
- <li>⚡ <strong>U-Net–style architecture</strong> improves stability and GPU efficiency on very long sequences.</li>
248
- <li>🎯 <strong>Controllable generative modeling</strong>, enabling targeted enhancer/promoter engineering validated by experimental assays.</li>
249
- </ul>
250
- </div>
251
-
252
- <div class="grid">
253
- <div class="card">
254
- <h2>🤖 Models (see <a href="https://huggingface.co/collections/InstaDeepAI/nucleotide-transformer-v3" target="_blank" rel="noopener">collection</a>)</h2>
255
- <ul>
256
- <li>📦 Pretrained checkpoints:
257
- <div style="margin-top: 8px; margin-left: 0;">
258
- <div><a href="https://huggingface.co/InstaDeepAI/NTv3_8M_pre"><code>InstaDeepAI/NTv3_8M_pre</code></a></div>
259
- <div><a href="https://huggingface.co/InstaDeepAI/NTv3_100M_pre"><code>InstaDeepAI/NTv3_100M_pre</code></a></div>
260
- <div><a href="https://huggingface.co/InstaDeepAI/NTv3_650M_pre"><code>InstaDeepAI/NTv3_650M_pre</code></a></div>
261
- </div>
262
- </li>
263
- <li>🎯 Post-trained checkpoints:
264
- <div style="margin-top: 8px; margin-left: 0;">
265
- <div><a href="https://huggingface.co/InstaDeepAI/NTv3_100M_pos"><code>InstaDeepAI/NTv3_100M_pos</code></a></div>
266
- <div><a href="https://huggingface.co/InstaDeepAI/NTv3_650M_pos"><code>InstaDeepAI/NTv3_650M_pos</code></a></div>
267
- </div>
268
- </li>
269
- </ul>
270
- <table>
271
- <thead>
272
- <tr>
273
- <th>Model</th>
274
- <th>Size</th>
275
- <th>Pre-training</th>
276
- <th>Post-training</th>
277
- <th>Tasks</th>
278
- </tr>
279
- </thead>
280
- <tbody>
281
- <tr>
282
- <td><strong>NTv3-8M</strong></td>
283
- <td>8M params</td>
284
- <td>MLM</td>
285
- <td>❌</td>
286
- <td>Embeddings, light inference</td>
287
- </tr>
288
- <tr>
289
- <td><strong>NTv3-100M</strong></td>
290
- <td>100M params</td>
291
- <td>MLM</td>
292
- <td><span class="checkmark">✅</span></td>
293
- <td>Tracks, annotation</td>
294
- </tr>
295
- <tr>
296
- <td><strong>NTv3-650M</strong></td>
297
- <td>650M params</td>
298
- <td>MLM</td>
299
- <td><span class="checkmark">✅</span></td>
300
- <td>Tracks, annotation, best accuracy</td>
301
- </tr>
302
- </tbody>
303
- </table>
304
- </div>
305
-
306
- <div class="card-stack">
307
- <div class="card">
308
- <h2>📓 Tutorial notebooks (browse <a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/tree/main/notebooks_tutorials" target="_blank" rel="noopener">folder</a>)</h2>
309
- <ul>
310
- <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_tutorials/00_quickstart_inference.ipynb" target="_blank" rel="noopener">🚀 00 — Quickstart inference</a></li>
311
- <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_tutorials/01_tracks_prediction.ipynb" target="_blank" rel="noopener">📊 01 — Tracks prediction</a></li>
312
- <li>🎯 02 — Fine-tune on bigwig tracks</li>
313
- <li>🔍 03 — Model interpretation</li>
314
- <li>🧪 04 — Training NTv3 generative </li>
315
- </ul>
316
  </div>
317
- <div class="card">
318
- <h2>📓 Pipelines notebooks (browse <a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/tree/main/notebooks_pipelines" target="_blank" rel="noopener">folder</a>)</h2>
319
- <ul>
320
- <li> 🎯 01 Generate bigwig predictions for certain tracks</li>
321
- <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_pipelines/02_genome_annotation.ipynb" target="_blank" rel="noopener">🏷️ 02 — Genome annotation / segmentation</a></li>
322
- <li>🎯 03 — Fine-tune on bigwig tracks</li>
323
- <li>🔍 04 — Interpret a given genomic region</li>
324
- <li>🧪 05 — Sequence generation</li>
325
- </ul>
326
- </div>
327
- <div class="card">
328
- <h2>🔗 Links</h2>
329
- <ul>
330
- <li>📄 Paper: (add link)</li>
331
- <li><a href="https://github.com/instadeepai/nucleotide-transformer">💻 JAX model code (GitHub)</a></li>
332
- <li><a href="https://huggingface.co/collections/InstaDeepAI/nucleotide-transformer-v3" target="_blank" rel="noopener">🎯 HF Model Collection (all NTv3 models)</a></li>
333
- <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/tree/main/notebooks" target="_blank" rel="noopener">📓 All notebooks</a></li>
334
- <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3_benchmark" target="_blank" rel="noopener">🏆 NTv3 benchmark leaderboard</a></li>
335
- </ul>
336
- </div>
337
- </div>
338
 
339
- <div class="card">
340
- <h2>🤖 Load a pre-trained model</h2>
341
- <p>Here is an example of how to load and use a pre-trained NTv3 model.</p>
342
- <div class="code"><pre><code class="language-python">from transformers import AutoTokenizer, AutoModelForMaskedLM
 
 
 
 
 
 
 
 
343
 
344
- model_name = "InstaDeepAI/NTv3_650M_pre"
 
345
 
346
- # Load model and tokenizer
347
- model = AutoModelForMaskedLM.from_pretrained(model_name, trust_remote_code=True)
348
- tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
 
 
349
 
350
- # Tokenize input sequences
351
- batch = tok(["ATCGNATCG", "ACGT"], add_special_tokens=False, padding=True, pad_to_multiple_of=128, return_tensors="pt")
 
 
 
 
352
 
353
- # Run model
354
- out = model(
355
- **batch,
356
- output_hidden_states=True,
357
- output_attentions=True
358
- )
 
 
 
 
 
 
 
359
 
360
- # Print output shapes
361
- print(out.logits.shape) # (B, L, V = 11)
362
- print(len(out.hidden_states)) # convs + transformers + deconvs
363
- print(len(out.attentions)) # equals transformer layers = 12
364
- </code></pre></div>
365
- <p>Model embeddings can be used for fine-tuning on downstream tasks.</p>
 
366
 
367
- <p style="margin-top: 40px;">TO DO: add pipeline for fine-tuning on functional tracks or genome annotation.</p>
368
- </div>
369
-
370
- <div class="card">
371
- <h2>💻 Use a post-trained model</h2>
372
- <p>Here is a quick example of how to use the post-trained NTv3 650M model to predict tracks for a human genomic window.</p>
373
- <div class="code"><pre><code class="language-python">from transformers import pipeline
374
- import torch
 
 
 
 
 
 
 
375
 
376
- model_name = "InstaDeepAI/NTv3_650M_pos"
 
 
 
377
 
378
- ntv3_tracks = pipeline(
379
- "ntv3-tracks",
380
- model=model_name,
381
- trust_remote_code=True,
382
- device=0 if torch.cuda.is_available() else -1,
383
- )
384
 
385
- # Run track prediction
386
- out = ntv3_tracks(
387
- {
388
- "chrom": "chr19",
389
- "start": 6_700_000,
390
- "end": 6_831_072,
391
- "species": "human"
392
- }
393
- )
394
 
395
- # Print output shapes
396
- # 7k human tracks over 37.5 % center region of the input sequence
397
- print("bigwig_tracks_logits:", tuple(out.bigwig_tracks_logits.shape))
398
- # Location of 21 genomic elements over 37.5 % center region of the input sequence
399
- print("bed_tracks_logits:", tuple(out.bed_tracks_logits.shape))
400
- # Language model logits for whole sequence over vocabulary
401
- print("language model logits:", tuple(out.mlm_logits.shape))</code></pre></div>
402
- <p>Predictions can also be plotted for a subset of functional tracks and genomic elements:</p>
403
- <div class="code"><pre><code class="language-python">tracks_to_plot = {
404
- "K562 RNA-seq": "ENCSR056HPM",
405
- "K562 DNAse": "ENCSR921NMD",
406
- "K562 H3k4me3": "ENCSR000DWD",
407
- "K562 CTCF": "ENCSR000AKO",
408
- "HepG2 RNA-seq": "ENCSR561FEE_P",
409
- "HepG2 DNAse": "ENCSR000EJV",
410
- "HepG2 H3k4me3": "ENCSR000AMP",
411
- "HepG2 CTCF": "ENCSR000BIE",
412
- }
413
- elements_to_plot = ["protein_coding_gene", "exon", "intron", "splice_donor", "splice_acceptor"]
414
 
415
- out = ntv3_tracks(
416
- {"chrom": "chr19", "start": 6_700_000, "end": 6_831_072, "species": "human"},
417
- plot=True,
418
- tracks_to_plot=tracks_to_plot,
419
- elements_to_plot=elements_to_plot,
420
- )</code></pre></div>
421
- <img src="assets/output_tracks.png" alt="Output tracks visualization" style="max-width: 100%; margin-top: 20px;" />
422
- </div>
423
- </div>
424
-
425
- <!-- <div class="paper-summary">
426
- <h2>📄 A foundational model for joint sequence-function multi-species modeling at scale for long-range genomic prediction</h2>
427
- <img src="assets/paper_summary.png" alt="NTv3 Paper Summary" />
428
- </div> -->
429
 
430
- <p class="footer">
431
- © instadeep-ai — NTv3 companion Space.
432
- </p>
433
- </div>
434
- <script src="https://cdnjs.cloudflare.com/ajax/libs/prism/1.29.0/components/prism-core.min.js"></script>
435
- <script src="https://cdnjs.cloudflare.com/ajax/libs/prism/1.29.0/plugins/autoloader/prism-autoloader.min.js"></script>
436
  </body>
437
  </html>
 
199
  border-radius: 12px;
200
  }
201
  .footer { margin-top: 22px; color: var(--muted); font-size: 13px; }
202
+
203
+ /* Tab navigation styles */
204
+ .tabs {
205
+ margin-top: 24px;
206
+ display: flex;
207
+ gap: 8px;
208
+ border-bottom: 2px solid var(--border);
209
+ overflow-x: auto;
210
+ }
211
+ .tab-button {
212
+ padding: 12px 20px;
213
+ background: transparent;
214
+ border: none;
215
+ border-bottom: 2px solid transparent;
216
+ color: var(--muted);
217
+ font-family: var(--sans);
218
+ font-size: 14px;
219
+ font-weight: 500;
220
+ cursor: pointer;
221
+ transition: all 0.2s ease;
222
+ white-space: nowrap;
223
+ margin-bottom: -2px;
224
+ }
225
+ .tab-button:hover {
226
+ color: var(--text);
227
+ background: rgba(255, 255, 255, 0.03);
228
+ }
229
+ .tab-button.active {
230
+ color: var(--link);
231
+ border-bottom-color: var(--link);
232
+ }
233
+ .tab-content {
234
+ display: none;
235
+ animation: fadeIn 0.3s ease;
236
+ }
237
+ .tab-content.active {
238
+ display: block;
239
+ }
240
+ @keyframes fadeIn {
241
+ from { opacity: 0; transform: translateY(8px); }
242
+ to { opacity: 1; transform: translateY(0); }
243
+ }
244
+
245
  @media (max-width: 860px) {
246
  .card { grid-column: span 12; }
247
  h1 { font-size: 28px; }
248
+ .tab-button {
249
+ padding: 10px 16px;
250
+ font-size: 13px;
251
+ }
252
  }
253
  </style>
254
  </head>
 
270
  </div>
271
  </div>
272
 
273
+ <!-- Tab Navigation -->
274
+ <div class="tabs">
275
+ <button class="tab-button active" data-tab="home">🏠 Home</button>
276
+ <button class="tab-button" data-tab="demo">💻 Code Demo</button>
 
 
 
 
277
  </div>
278
 
279
+ <!-- Home Tab (Content loaded from tabs/home.html) -->
280
+ <div id="home" class="tab-content active">
281
+ <!-- Content will be loaded dynamically -->
282
  </div>
283
 
284
+ <!-- Code Demo Tab (Content loaded from tabs/demo.html) -->
285
+ <div id="demo" class="tab-content">
286
+ <!-- Content will be loaded dynamically -->
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
  </div>
288
+
289
+ <!-- <div class="paper-summary">
290
+ <h2>📄 A foundational model for joint sequence-function multi-species modeling at scale for long-range genomic prediction</h2>
291
+ <img src="assets/paper_summary.png" alt="NTv3 Paper Summary" />
292
+ </div> -->
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
 
294
+ <p class="footer">
295
+ © instadeep-ai — NTv3 companion Space.
296
+ </p>
297
+ </div>
298
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/prism/1.29.0/components/prism-core.min.js"></script>
299
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/prism/1.29.0/plugins/autoloader/prism-autoloader.min.js"></script>
300
+ <script>
301
// Tab content mapping: tab id (the button's `data-tab` value, which is also the
// id of the tab's content element) -> HTML fragment fetched for that tab.
const tabFiles = {
  'home': 'tabs/home.html',
  'demo': 'tabs/demo.html'
};

// Cache for loaded tab content, keyed by tab id. Filled by loadTabContent()
// after a successful fetch.
const tabCache = {};
309
 
310
/**
 * Fetch the HTML fragment for a tab, reusing a previously fetched copy.
 *
 * @param {string} tabId - Key into `tabFiles` / `tabCache`.
 * @returns {Promise<string>} The fragment's HTML; an empty string when no file
 *   is mapped to `tabId`; or a short error notice when the request fails.
 *   Only successful responses are stored in `tabCache`.
 */
async function loadTabContent(tabId) {
  const cachedHtml = tabCache[tabId];
  if (cachedHtml) {
    return cachedHtml;
  }

  const url = tabFiles[tabId];
  if (!url) {
    console.error(`No file path defined for tab: ${tabId}`);
    return '';
  }

  try {
    const res = await fetch(url);
    if (!res.ok) {
      throw new Error(`Failed to load ${url}: ${res.statusText}`);
    }
    const html = await res.text();
    tabCache[tabId] = html;
    return html;
  } catch (err) {
    console.error(`Error loading tab content for ${tabId}:`, err);
    return `<div class="summary"><p>Error loading content. Please refresh the page.</p></div>`;
  }
}
337
 
338
/**
 * Fill a tab's content element the first time the tab is shown.
 *
 * Loads the tab's HTML via loadTabContent(), marks the element with
 * `data-loaded="true"` so later calls do nothing, and runs Prism syntax
 * highlighting on any `language-*` code blocks in the inserted content.
 *
 * @param {string} tabId - Id of the tab's content element.
 * @returns {Promise<void>}
 */
async function showTab(tabId) {
  const panel = document.getElementById(tabId);
  if (!panel) {
    console.error(`Tab element not found: ${tabId}`);
    return;
  }

  // Already populated by an earlier call.
  if (panel.dataset.loaded) {
    return;
  }

  panel.innerHTML = await loadTabContent(tabId);
  panel.dataset.loaded = 'true';

  // Prism only highlights what exists when it first runs, so highlight the
  // newly inserted code blocks explicitly (when Prism is present).
  if (typeof Prism === 'undefined') {
    return;
  }
  for (const codeBlock of panel.querySelectorAll('code[class*="language-"]')) {
    Prism.highlightElement(codeBlock);
  }
}
361
 
362
// Tab switching: once the DOM is ready, load the initially active tab and wire
// every tab button to activate its tab.
document.addEventListener('DOMContentLoaded', () => {
  const buttons = document.querySelectorAll('.tab-button');
  const panels = document.querySelectorAll('.tab-content');

  // Load the tab that is marked active in the markup (home by default).
  const initialPanel = document.querySelector('.tab-content.active');
  if (initialPanel) {
    showTab(initialPanel.id);
  }

  // Make `button`'s tab the only active one, then load its content if needed.
  const activate = async (button) => {
    const targetTab = button.getAttribute('data-tab');

    for (const other of buttons) {
      other.classList.remove('active');
    }
    for (const panel of panels) {
      panel.classList.remove('active');
    }

    button.classList.add('active');
    document.getElementById(targetTab).classList.add('active');

    await showTab(targetTab);
  };

  for (const button of buttons) {
    button.addEventListener('click', () => activate(button));
  }
});
391
+ </script>
392
  </body>
393
  </html>
notebooks_pipelines/02_genome_annotation.ipynb CHANGED
@@ -29,16 +29,7 @@
29
  "execution_count": 1,
30
  "id": "2e2f5963",
31
  "metadata": {},
32
- "outputs": [
33
- {
34
- "name": "stdout",
35
- "output_type": "stream",
36
- "text": [
37
- "\u001b[33mWARNING: 401 Error, Credentials not correct for https://gitlab.com/api/v4/projects/36813343/packages/pypi/simple/igv-notebook/\u001b[0m\u001b[33m\n",
38
- "\u001b[0m"
39
- ]
40
- }
41
- ],
42
  "source": [
43
  "# Install dependencies\n",
44
  "!pip -q install \"transformers>=4.55\" \"huggingface_hub>=0.23\" safetensors torch pyfaidx requests seaborn matplotlib igv_notebook"
@@ -127,28 +118,14 @@
127
  },
128
  {
129
  "cell_type": "code",
130
- "execution_count": null,
131
  "id": "4857d15c",
132
  "metadata": {},
133
  "outputs": [
134
  {
135
  "data": {
136
  "application/vnd.jupyter.widget-view+json": {
137
- "model_id": "92629742bf7e419b9aaad0c8c14867d5",
138
- "version_major": 2,
139
- "version_minor": 0
140
- },
141
- "text/plain": [
142
- "config.json: 0%| | 0.00/338k [00:00<?, ?B/s]"
143
- ]
144
- },
145
- "metadata": {},
146
- "output_type": "display_data"
147
- },
148
- {
149
- "data": {
150
- "application/vnd.jupyter.widget-view+json": {
151
- "model_id": "2468d781d0b7409791c5079ee9860a81",
152
  "version_major": 2,
153
  "version_minor": 0
154
  },
@@ -165,105 +142,15 @@
165
  "text": [
166
  "A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/NTv3_650M_pos:\n",
167
  "- ntv3_gff_pipeline.py\n",
168
- ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n"
169
- ]
170
- },
171
- {
172
- "data": {
173
- "application/vnd.jupyter.widget-view+json": {
174
- "model_id": "fabadaa764ba4da799c0d43b12ac42b1",
175
- "version_major": 2,
176
- "version_minor": 0
177
- },
178
- "text/plain": [
179
- "model.safetensors: 0%| | 0.00/2.72G [00:00<?, ?B/s]"
180
- ]
181
- },
182
- "metadata": {},
183
- "output_type": "display_data"
184
- },
185
- {
186
- "data": {
187
- "application/vnd.jupyter.widget-view+json": {
188
- "model_id": "9188715aa52d48a2b54b6b89a015f1da",
189
- "version_major": 2,
190
- "version_minor": 0
191
- },
192
- "text/plain": [
193
- "tokenizer_config.json: 0%| | 0.00/1.47k [00:00<?, ?B/s]"
194
- ]
195
- },
196
- "metadata": {},
197
- "output_type": "display_data"
198
- },
199
- {
200
- "data": {
201
- "application/vnd.jupyter.widget-view+json": {
202
- "model_id": "f457beef2cdf4ecca076b589a95edf2b",
203
- "version_major": 2,
204
- "version_minor": 0
205
- },
206
- "text/plain": [
207
- "vocab.json: 0%| | 0.00/138 [00:00<?, ?B/s]"
208
- ]
209
- },
210
- "metadata": {},
211
- "output_type": "display_data"
212
- },
213
- {
214
- "data": {
215
- "application/vnd.jupyter.widget-view+json": {
216
- "model_id": "b900fc305af84983b820e385d239dc29",
217
- "version_major": 2,
218
- "version_minor": 0
219
- },
220
- "text/plain": [
221
- "special_tokens_map.json: 0%| | 0.00/149 [00:00<?, ?B/s]"
222
- ]
223
- },
224
- "metadata": {},
225
- "output_type": "display_data"
226
- },
227
- {
228
- "name": "stderr",
229
- "output_type": "stream",
230
- "text": [
231
  "Device set to use cpu\n"
232
  ]
233
  },
234
- {
235
- "data": {
236
- "application/vnd.jupyter.widget-view+json": {
237
- "model_id": "7fbd5f15218142e1b4a14474e96189b8",
238
- "version_major": 2,
239
- "version_minor": 0
240
- },
241
- "text/plain": [
242
- "tokenizer_config.json: 0%| | 0.00/1.49k [00:00<?, ?B/s]"
243
- ]
244
- },
245
- "metadata": {},
246
- "output_type": "display_data"
247
- },
248
- {
249
- "data": {
250
- "application/vnd.jupyter.widget-view+json": {
251
- "model_id": "c3ce29bcfd6b4f0681b0ac94809ef9ab",
252
- "version_major": 2,
253
- "version_minor": 0
254
- },
255
- "text/plain": [
256
- "vocab.json: 0%| | 0.00/693 [00:00<?, ?B/s]"
257
- ]
258
- },
259
- "metadata": {},
260
- "output_type": "display_data"
261
- },
262
  {
263
  "name": "stdout",
264
  "output_type": "stream",
265
  "text": [
266
- "Inference + decoding time: 47.49 seconds\n"
267
  ]
268
  }
269
  ],
@@ -302,7 +189,7 @@
302
  },
303
  {
304
  "cell_type": "code",
305
- "execution_count": 7,
306
  "id": "959cf79f",
307
  "metadata": {},
308
  "outputs": [
@@ -336,7 +223,7 @@
336
  },
337
  {
338
  "cell_type": "code",
339
- "execution_count": 8,
340
  "id": "84f013f6",
341
  "metadata": {},
342
  "outputs": [
@@ -379,14 +266,14 @@
379
  },
380
  {
381
  "cell_type": "code",
382
- "execution_count": 9,
383
  "id": "0904a5cb",
384
  "metadata": {},
385
  "outputs": [
386
  {
387
  "data": {
388
  "text/html": [
389
- "<div id=\"jb_2472686_buttons\"></div><div id=\"jb_2472686_igvcontainer\"></div>"
390
  ],
391
  "text/plain": [
392
  "<IPython.core.display.HTML object>"
@@ -397,7 +284,7 @@
397
  },
398
  {
399
  "data": {
400
- "application/javascript": "window.igv.MessageHandler.on({\"id\": \"jb_2472686\", \"command\": \"createBrowser\", \"data\": {\"genome\": \"hg38\", \"locus\": \"chr19:6700000-6831072\", \"id\": \"jb_2472686\"}})",
401
  "text/plain": [
402
  "<IPython.core.display.Javascript object>"
403
  ]
@@ -407,7 +294,7 @@
407
  },
408
  {
409
  "data": {
410
- "application/javascript": "window.igv.MessageHandler.on({\"id\": \"jb_2472686\", \"command\": \"loadTrack\", \"data\": {\"name\": \"NTv3 annotations\", \"format\": \"gff3\", \"type\": \"annotation\", \"url\": \"NTv3_650M_pos_hg38_chr19_6700000_6831072.gff3\"}})",
411
  "text/plain": [
412
  "<IPython.core.display.Javascript object>"
413
  ]
@@ -417,7 +304,7 @@
417
  },
418
  {
419
  "data": {
420
- "application/javascript": "window.igv.MessageHandler.on({\"id\": \"jb_2472686\", \"command\": \"search\", \"data\": \"chr19:6700000-6831072\"})",
421
  "text/plain": [
422
  "<IPython.core.display.Javascript object>"
423
  ]
@@ -428,10 +315,10 @@
428
  {
429
  "data": {
430
  "text/plain": [
431
- "<igv_notebook.browser.Browser at 0x1047ec880>"
432
  ]
433
  },
434
- "execution_count": 9,
435
  "metadata": {},
436
  "output_type": "execute_result"
437
  }
 
29
  "execution_count": 1,
30
  "id": "2e2f5963",
31
  "metadata": {},
32
+ "outputs": [],
 
 
 
 
 
 
 
 
 
33
  "source": [
34
  "# Install dependencies\n",
35
  "!pip -q install \"transformers>=4.55\" \"huggingface_hub>=0.23\" safetensors torch pyfaidx requests seaborn matplotlib igv_notebook"
 
118
  },
119
  {
120
  "cell_type": "code",
121
+ "execution_count": 5,
122
  "id": "4857d15c",
123
  "metadata": {},
124
  "outputs": [
125
  {
126
  "data": {
127
  "application/vnd.jupyter.widget-view+json": {
128
+ "model_id": "cead875ae8c34250b6929e22283652e1",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  "version_major": 2,
130
  "version_minor": 0
131
  },
 
142
  "text": [
143
  "A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/NTv3_650M_pos:\n",
144
  "- ntv3_gff_pipeline.py\n",
145
+ ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  "Device set to use cpu\n"
147
  ]
148
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  {
150
  "name": "stdout",
151
  "output_type": "stream",
152
  "text": [
153
+ "Inference + decoding time: 53.09 seconds\n"
154
  ]
155
  }
156
  ],
 
189
  },
190
  {
191
  "cell_type": "code",
192
+ "execution_count": 6,
193
  "id": "959cf79f",
194
  "metadata": {},
195
  "outputs": [
 
223
  },
224
  {
225
  "cell_type": "code",
226
+ "execution_count": 7,
227
  "id": "84f013f6",
228
  "metadata": {},
229
  "outputs": [
 
266
  },
267
  {
268
  "cell_type": "code",
269
+ "execution_count": 8,
270
  "id": "0904a5cb",
271
  "metadata": {},
272
  "outputs": [
273
  {
274
  "data": {
275
  "text/html": [
276
+ "<div id=\"jb_471625_buttons\"></div><div id=\"jb_471625_igvcontainer\"></div>"
277
  ],
278
  "text/plain": [
279
  "<IPython.core.display.HTML object>"
 
284
  },
285
  {
286
  "data": {
287
+ "application/javascript": "window.igv.MessageHandler.on({\"id\": \"jb_471625\", \"command\": \"createBrowser\", \"data\": {\"genome\": \"hg38\", \"locus\": \"chr19:6700000-6831072\", \"id\": \"jb_471625\"}})",
288
  "text/plain": [
289
  "<IPython.core.display.Javascript object>"
290
  ]
 
294
  },
295
  {
296
  "data": {
297
+ "application/javascript": "window.igv.MessageHandler.on({\"id\": \"jb_471625\", \"command\": \"loadTrack\", \"data\": {\"name\": \"NTv3 annotations\", \"format\": \"gff3\", \"type\": \"annotation\", \"url\": \"NTv3_650M_pos_hg38_chr19_6700000_6831072.gff3\"}})",
298
  "text/plain": [
299
  "<IPython.core.display.Javascript object>"
300
  ]
 
304
  },
305
  {
306
  "data": {
307
+ "application/javascript": "window.igv.MessageHandler.on({\"id\": \"jb_471625\", \"command\": \"search\", \"data\": \"chr19:6700000-6831072\"})",
308
  "text/plain": [
309
  "<IPython.core.display.Javascript object>"
310
  ]
 
315
  {
316
  "data": {
317
  "text/plain": [
318
+ "<igv_notebook.browser.Browser at 0x30d4e3e50>"
319
  ]
320
  },
321
+ "execution_count": 8,
322
  "metadata": {},
323
  "output_type": "execute_result"
324
  }
notebooks_pipelines/NTv3_650M_pos_hg38_chr19_6700000_6831072.gff3 ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ##gff-version 3
2
+ # model: InstaDeepAI/NTv3_650M_pos
3
+ # window: chr19:6700000-6831072 (hg38); predictions on central 37.5%: chr19:6740960-6790112
4
+ chr19 NTv3_HMM intron 6740961 6740995 0.975 . . ID=INTRON_1;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
5
+ chr19 NTv3_HMM start_codon 6740996 6741013 0.355 . . ID=START_CODON_2;Name=START_CODON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,191,255
6
+ chr19 NTv3_HMM exon 6741014 6741124 0.673 . . ID=EXON_3;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
7
+ chr19 NTv3_HMM splice_donor_site 6741125 6741125 0.857 . . ID=SPLICE_DONOR_4;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
8
+ chr19 NTv3_HMM intron 6741126 6741224 0.974 . . ID=INTRON_5;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
9
+ chr19 NTv3_HMM splice_acceptor_site 6741225 6741225 0.930 . . ID=SPLICE_ACCEPTOR_6;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
10
+ chr19 NTv3_HMM exon 6741226 6741280 0.693 . . ID=EXON_7;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
11
+ chr19 NTv3_HMM splice_donor_site 6741281 6741281 0.837 . . ID=SPLICE_DONOR_8;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
12
+ chr19 NTv3_HMM intron 6741282 6742966 0.959 . . ID=INTRON_9;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
13
+ chr19 NTv3_HMM splice_acceptor_site 6742967 6742967 0.958 . . ID=SPLICE_ACCEPTOR_10;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
14
+ chr19 NTv3_HMM exon 6742968 6743113 0.841 . . ID=EXON_11;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
15
+ chr19 NTv3_HMM splice_donor_site 6743114 6743114 0.779 . . ID=SPLICE_DONOR_12;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
16
+ chr19 NTv3_HMM intron 6743115 6743193 0.963 . . ID=INTRON_13;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
17
+ chr19 NTv3_HMM splice_acceptor_site 6743194 6743194 0.910 . . ID=SPLICE_ACCEPTOR_14;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
18
+ chr19 NTv3_HMM exon 6743195 6743255 0.845 . . ID=EXON_15;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
19
+ chr19 NTv3_HMM splice_donor_site 6743256 6743256 0.782 . . ID=SPLICE_DONOR_16;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
20
+ chr19 NTv3_HMM intron 6743257 6743493 0.970 . . ID=INTRON_17;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
21
+ chr19 NTv3_HMM splice_acceptor_site 6743494 6743494 0.780 . . ID=SPLICE_ACCEPTOR_18;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
22
+ chr19 NTv3_HMM exon 6743495 6743597 0.876 . . ID=EXON_19;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
23
+ chr19 NTv3_HMM splice_donor_site 6743598 6743598 0.856 . . ID=SPLICE_DONOR_20;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
24
+ chr19 NTv3_HMM intron 6743599 6743707 0.951 . . ID=INTRON_21;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
25
+ chr19 NTv3_HMM splice_acceptor_site 6743708 6743708 0.856 . . ID=SPLICE_ACCEPTOR_22;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
26
+ chr19 NTv3_HMM exon 6743709 6743835 0.812 . . ID=EXON_23;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
27
+ chr19 NTv3_HMM splice_donor_site 6743836 6743836 0.887 . . ID=SPLICE_DONOR_24;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
28
+ chr19 NTv3_HMM intron 6743837 6744553 0.989 . . ID=INTRON_25;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
29
+ chr19 NTv3_HMM splice_acceptor_site 6744554 6744554 0.972 . . ID=SPLICE_ACCEPTOR_26;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
30
+ chr19 NTv3_HMM exon 6744555 6744700 0.977 . . ID=EXON_27;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
31
+ chr19 NTv3_HMM intron 6744701 6744799 0.972 . . ID=INTRON_28;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
32
+ chr19 NTv3_HMM splice_acceptor_site 6744800 6744800 0.954 . . ID=SPLICE_ACCEPTOR_29;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
33
+ chr19 NTv3_HMM exon 6744801 6744993 0.977 . . ID=EXON_30;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
34
+ chr19 NTv3_HMM splice_donor_site 6744994 6744994 0.886 . . ID=SPLICE_DONOR_31;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
35
+ chr19 NTv3_HMM intron 6744995 6746451 0.979 . . ID=INTRON_32;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
36
+ chr19 NTv3_HMM splice_acceptor_site 6746452 6746452 0.938 . . ID=SPLICE_ACCEPTOR_33;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
37
+ chr19 NTv3_HMM exon 6746453 6746560 0.840 . . ID=EXON_34;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
38
+ chr19 NTv3_HMM splice_donor_site 6746561 6746561 0.947 . . ID=SPLICE_DONOR_35;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
39
+ chr19 NTv3_HMM intron 6746562 6749933 0.973 . . ID=INTRON_36;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
40
+ chr19 NTv3_HMM splice_acceptor_site 6749934 6749934 0.693 . . ID=SPLICE_ACCEPTOR_37;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
41
+ chr19 NTv3_HMM exon 6749935 6750065 0.918 . . ID=EXON_38;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
42
+ chr19 NTv3_HMM splice_donor_site 6750066 6750066 0.783 . . ID=SPLICE_DONOR_39;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
43
+ chr19 NTv3_HMM intron 6750067 6750291 0.955 . . ID=INTRON_40;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
44
+ chr19 NTv3_HMM splice_acceptor_site 6750292 6750292 0.960 . . ID=SPLICE_ACCEPTOR_41;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
45
+ chr19 NTv3_HMM exon 6750293 6750430 0.959 . . ID=EXON_42;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
46
+ chr19 NTv3_HMM splice_donor_site 6750431 6750431 0.723 . . ID=SPLICE_DONOR_43;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
47
+ chr19 NTv3_HMM intron 6750432 6750511 0.939 . . ID=INTRON_44;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
48
+ chr19 NTv3_HMM splice_acceptor_site 6750512 6750512 0.750 . . ID=SPLICE_ACCEPTOR_45;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
49
+ chr19 NTv3_HMM exon 6750513 6750632 0.902 . . ID=EXON_46;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
50
+ chr19 NTv3_HMM splice_donor_site 6750633 6750633 0.917 . . ID=SPLICE_DONOR_47;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
51
+ chr19 NTv3_HMM intron 6750634 6751062 0.961 . . ID=INTRON_48;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
52
+ chr19 NTv3_HMM splice_acceptor_site 6751063 6751063 0.694 . . ID=SPLICE_ACCEPTOR_49;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
53
+ chr19 NTv3_HMM exon 6751064 6751199 0.558 . . ID=EXON_50;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
54
+ chr19 NTv3_HMM stop_codon 6751200 6751212 0.332 . . ID=STOP_CODON_51;Name=STOP_CODON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=220,20,60
55
+ chr19 NTv3_HMM three_prime_UTR 6751213 6751488 0.965 + . ID=UTR3_PLUS_52;Name=UTR3_PLUS;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=34,139,34
56
+ chr19 NTv3_HMM polyA_signal 6751489 6751507 0.355 . . ID=POLYA_SIGNAL_53;Name=POLYA_SIGNAL;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=139,69,19
57
+ chr19 NTv3_HMM start_codon 6751508 6752169 0.002 . . ID=START_CODON_54;Name=START_CODON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,191,255
58
+ chr19 NTv3_HMM polyA_signal 6752170 6752187 0.432 . . ID=POLYA_SIGNAL_55;Name=POLYA_SIGNAL;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=139,69,19
59
+ chr19 NTv3_HMM three_prime_UTR 6752188 6752571 0.839 - . ID=UTR3_MINUS_56;Name=UTR3_MINUS;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=34,139,34
60
+ chr19 NTv3_HMM stop_codon 6752572 6752752 0.136 . . ID=STOP_CODON_57;Name=STOP_CODON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=220,20,60
61
+ chr19 NTv3_HMM splice_acceptor_site 6752753 6752753 0.798 . . ID=SPLICE_ACCEPTOR_58;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
62
+ chr19 NTv3_HMM intron 6752754 6753455 0.910 . . ID=INTRON_59;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
63
+ chr19 NTv3_HMM splice_donor_site 6753456 6753456 0.766 . . ID=SPLICE_DONOR_60;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
64
+ chr19 NTv3_HMM exon 6753457 6753640 0.953 . . ID=EXON_61;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
65
+ chr19 NTv3_HMM splice_acceptor_site 6753641 6753641 0.939 . . ID=SPLICE_ACCEPTOR_62;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
66
+ chr19 NTv3_HMM intron 6753642 6754051 0.985 . . ID=INTRON_63;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
67
+ chr19 NTv3_HMM splice_donor_site 6754052 6754052 0.844 . . ID=SPLICE_DONOR_64;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
68
+ chr19 NTv3_HMM exon 6754053 6754161 0.908 . . ID=EXON_65;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
69
+ chr19 NTv3_HMM splice_acceptor_site 6754162 6754163 0.633 . . ID=SPLICE_ACCEPTOR_66;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
70
+ chr19 NTv3_HMM intron 6754164 6754250 0.962 . . ID=INTRON_67;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
71
+ chr19 NTv3_HMM splice_donor_site 6754251 6754251 0.875 . . ID=SPLICE_DONOR_68;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
72
+ chr19 NTv3_HMM exon 6754252 6754424 0.965 . . ID=EXON_69;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
73
+ chr19 NTv3_HMM splice_acceptor_site 6754425 6754425 0.791 . . ID=SPLICE_ACCEPTOR_70;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
74
+ chr19 NTv3_HMM intron 6754426 6754615 0.975 . . ID=INTRON_71;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
75
+ chr19 NTv3_HMM splice_donor_site 6754616 6754616 0.953 . . ID=SPLICE_DONOR_72;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
76
+ chr19 NTv3_HMM exon 6754617 6754730 0.731 . . ID=EXON_73;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
77
+ chr19 NTv3_HMM splice_acceptor_site 6754731 6754731 0.822 . . ID=SPLICE_ACCEPTOR_74;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
78
+ chr19 NTv3_HMM intron 6754732 6754830 0.975 . . ID=INTRON_75;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
79
+ chr19 NTv3_HMM splice_donor_site 6754831 6754831 0.944 . . ID=SPLICE_DONOR_76;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
80
+ chr19 NTv3_HMM exon 6754832 6755314 0.757 . . ID=EXON_77;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
81
+ chr19 NTv3_HMM splice_acceptor_site 6755315 6755315 0.713 . . ID=SPLICE_ACCEPTOR_78;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
82
+ chr19 NTv3_HMM intron 6755316 6759593 0.988 . . ID=INTRON_79;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
83
+ chr19 NTv3_HMM splice_donor_site 6759594 6759594 0.928 . . ID=SPLICE_DONOR_80;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
84
+ chr19 NTv3_HMM exon 6759595 6759669 0.840 . . ID=EXON_81;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
85
+ chr19 NTv3_HMM splice_acceptor_site 6759670 6759670 0.901 . . ID=SPLICE_ACCEPTOR_82;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
86
+ chr19 NTv3_HMM intron 6759671 6760637 0.985 . . ID=INTRON_83;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
87
+ chr19 NTv3_HMM splice_donor_site 6760638 6760638 0.928 . . ID=SPLICE_DONOR_84;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
88
+ chr19 NTv3_HMM exon 6760639 6760985 0.748 . . ID=EXON_85;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
89
+ chr19 NTv3_HMM splice_acceptor_site 6760986 6760987 0.603 . . ID=SPLICE_ACCEPTOR_86;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
90
+ chr19 NTv3_HMM intron 6760988 6763679 0.984 . . ID=INTRON_87;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
91
+ chr19 NTv3_HMM splice_donor_site 6763680 6763680 0.759 . . ID=SPLICE_DONOR_88;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
92
+ chr19 NTv3_HMM exon 6763681 6763732 0.663 . . ID=EXON_89;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
93
+ chr19 NTv3_HMM five_prime_UTR 6763733 6763815 0.840 - . ID=UTR5_MINUS_90;Name=UTR5_MINUS;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,128,0
94
+ chr19 NTv3_HMM splice_acceptor_site 6763816 6763816 0.869 . . ID=SPLICE_ACCEPTOR_91;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
95
+ chr19 NTv3_HMM intron 6763817 6767386 0.976 . . ID=INTRON_92;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
96
+ chr19 NTv3_HMM splice_donor_site 6767387 6767387 0.902 . . ID=SPLICE_DONOR_93;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
97
+ chr19 NTv3_HMM start_codon 6767388 6767411 0.051 . . ID=START_CODON_94;Name=START_CODON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,191,255
98
+ chr19 NTv3_HMM five_prime_UTR 6767412 6767514 0.578 - . ID=UTR5_MINUS_95;Name=UTR5_MINUS;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,128,0
99
+ chr19 NTv3_HMM start_codon 6767515 6769347 0.009 . . ID=START_CODON_96;Name=START_CODON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,191,255
100
+ chr19 NTv3_HMM TF_binding_site 6769348 6769521 0.506 . . ID=CTCF_97;Name=CTCF;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=148,0,211
101
+ chr19 NTv3_HMM start_codon 6769522 6772696 0.002 . . ID=START_CODON_98;Name=START_CODON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,191,255
102
+ chr19 NTv3_HMM five_prime_UTR 6772697 6772806 0.885 + . ID=UTR5_PLUS_99;Name=UTR5_PLUS;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,128,0
103
+ chr19 NTv3_HMM start_codon 6772807 6772810 0.694 . . ID=START_CODON_100;Name=START_CODON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,191,255
104
+ chr19 NTv3_HMM five_prime_UTR 6772811 6772922 0.748 + . ID=UTR5_PLUS_101;Name=UTR5_PLUS;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,128,0
105
+ chr19 NTv3_HMM exon 6772923 6773010 0.635 . . ID=EXON_102;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
106
+ chr19 NTv3_HMM splice_donor_site 6773011 6773011 0.884 . . ID=SPLICE_DONOR_103;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
107
+ chr19 NTv3_HMM intron 6773012 6790112 0.972 . . ID=INTRON_104;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ torch>=2.0.0
3
+ transformers>=4.55.0
4
+ accelerate>=0.20.0
5
+ safetensors>=0.3.0
6
+ huggingface_hub>=0.23.0
7
+
tabs/demo.html ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div class="summary">
2
+ <h2>💻 Interactive Code Demo</h2>
3
+ <p>Run the NTv3 650M post-trained model interactively to predict functional tracks and genomic elements for any genomic region.</p>
4
+ <p><strong>Model:</strong> <code>InstaDeepAI/NTv3_650M_pos</code></p>
5
+ </div>
6
+
7
+ <div class="grid">
8
+ <div class="card" style="grid-column: span 12;">
9
+ <h2>🚀 NTv3 Track Prediction Pipeline</h2>
10
+ <p>Enter a genomic region to get predictions for functional tracks and genomic elements. The model will predict ~7k functional tracks and 21 genomic elements over the center 37.5% of your input region.</p>
11
+
12
+ <!-- Placeholder for the Gradio app (nothing is embedded in this container) -->
13
+ <!-- Note: With Gradio SDK, the app.py serves as the main interface -->
14
+ <!-- The HTML interface can still be accessed, but the Gradio demo is the primary interface -->
15
+ <div id="gradio-container" style="margin-top: 20px; min-height: 600px;">
16
+ <p style="color: var(--muted); margin-bottom: 15px;">
17
+ <strong>Note:</strong> With Gradio SDK enabled, the interactive demo is now the main interface of this Space.
18
+ You can interact with it directly, or use the code example below to run predictions programmatically.
19
+ </p>
20
+ <div style="background: rgba(0,0,0,0.3); padding: 20px; border-radius: 12px; border: 1px solid var(--border);">
21
+ <p style="color: var(--link); margin: 0;">
22
+ 💡 The Gradio interactive demo is now available as the main interface of this Space.
23
+ Refresh the page to see it, or use the code example below.
24
+ </p>
25
+ </div>
26
+ </div>
27
+
28
+ <p style="margin-top: 20px; color: var(--muted); font-size: 13px;">
29
+ <strong>Note:</strong> The first run may take longer as the model loads. Maximum region size: 1 Mb (1,000,000 base pairs).
30
+ </p>
31
+ </div>
32
+
33
+ <div class="card" style="grid-column: span 12;">
34
+ <h2>📝 Code Example</h2>
35
+ <p>Here's the Python code that powers the demo above. You can run this in a notebook or Python script:</p>
36
+ <div class="code"><pre><code class="language-python">from transformers import pipeline
37
+ import torch
38
+
39
+ model_name = "InstaDeepAI/NTv3_650M_pos"
40
+
41
+ ntv3_tracks = pipeline(
42
+ "ntv3-tracks",
43
+ model=model_name,
44
+ trust_remote_code=True,
45
+ device=0 if torch.cuda.is_available() else -1,
46
+ )
47
+
48
+ # Run track prediction
49
+ out = ntv3_tracks(
50
+ {
51
+ "chrom": "chr19",
52
+ "start": 6_700_000,
53
+ "end": 6_831_072,
54
+ "species": "human"
55
+ }
56
+ )
57
+
58
+ # Print output shapes
59
+ # 7k human tracks over 37.5 % center region of the input sequence
60
+ print("bigwig_tracks_logits:", tuple(out.bigwig_tracks_logits.shape))
61
+ # Location of 21 genomic elements over 37.5 % center region of the input sequence
62
+ print("bed_tracks_logits:", tuple(out.bed_tracks_logits.shape))
63
+ # Language model logits for whole sequence over vocabulary
64
+ print("language model logits:", tuple(out.mlm_logits.shape))</code></pre></div>
65
+ <p style="margin-top: 15px;">To run the interactive Gradio app locally:</p>
66
+ <div class="code"><pre><code class="language-bash">pip install -r requirements.txt
67
+ python app.py</code></pre></div>
68
+ </div>
69
+ </div>
70
+
71
+ <script>
72
+ // Try to detect if Gradio app is available
73
+ window.addEventListener('load', function() {
74
+ const iframe = document.getElementById('gradio-iframe');
75
+ iframe.onerror = function() {
76
+ // If iframe fails to load, keep showing the instructions
77
+ document.getElementById('gradio-loading').style.display = 'block';
78
+ iframe.style.display = 'none';
79
+ };
80
+ // Set a timeout to show instructions if iframe doesn't load
81
+ setTimeout(function() {
82
+ if (iframe.style.display === 'none') {
83
+ document.getElementById('gradio-loading').style.display = 'block';
84
+ }
85
+ }, 2000);
86
+ });
87
+ </script>
88
+
tabs/home.html ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div class="summary">
2
+ <h2>📖 About NTv3</h2>
3
+ <p>
4
+ NTv3 is a multi-species genomic foundation model family that unifies representation learning, functional-track prediction, genome annotation, and controllable sequence generation within a single U-Net-style backbone. It models up to 1 Mb of DNA at single-base resolution, using a conv–Transformer–deconv architecture that efficiently captures both local motifs and long-range regulatory dependencies. NTv3 is first pretrained on ~9T base pairs from the OpenGenome2 corpus spanning >128k species using masked language modeling, and then post-trained with a joint objective on ~16k functional tracks and annotation labels across 24 animal and plant species, enabling state-of-the-art cross-species functional prediction and base-resolution genome annotation.
5
+ </p>
6
+ <p>
7
+ Beyond prediction, NTv3 can be fine-tuned into a controllable generative model via masked-diffusion language modeling, allowing targeted design of regulatory sequences (for example, enhancers with specified activity and promoter selectivity) that have been validated experimentally.
8
+ </p>
9
+ </div>
10
+
11
+ <div class="paper-summary">
12
+ <!-- <h2>📄 A foundational model for joint sequence-function multi-species modeling at scale for long-range genomic prediction</h2> -->
13
+ <img src="assets/paper_summary.png" alt="NTv3 Paper Summary" />
14
+ </div>
15
+
16
+ <div class="why-ntv3">
17
+ <h2>✨ Why NTv3?</h2>
18
+ <ul>
19
+ <li>📏 <strong>1 Mb long context at nucleotide resolution</strong> — ~100× longer than typical genomics models.</li>
20
+ <li>🏗️ <strong>Unified architecture</strong> for: masked language modeling, functional-track prediction, genome annotation, and sequence generation.</li>
21
+ <li>🌍 <strong>Cross-species generalization</strong> across 24 animal and plant species with a shared conditioned representation space.</li>
22
+ <li>⚡ <strong>U-Net–style architecture</strong> improves stability and GPU efficiency on very long sequences.</li>
23
+ <li>🎯 <strong>Controllable generative modeling</strong>, enabling targeted enhancer/promoter engineering validated by experimental assays.</li>
24
+ </ul>
25
+ </div>
26
+
27
+ <div class="grid">
28
+ <div class="card">
29
+ <h2>🤖 Models (see <a href="https://huggingface.co/collections/InstaDeepAI/nucleotide-transformer-v3" target="_blank" rel="noopener">collection</a>)</h2>
30
+ <ul>
31
+ <li>📦 Pretrained checkpoints:
32
+ <div style="margin-top: 8px; margin-left: 0;">
33
+ <div><a href="https://huggingface.co/InstaDeepAI/NTv3_8M_pre"><code>InstaDeepAI/NTv3_8M_pre</code></a></div>
34
+ <div><a href="https://huggingface.co/InstaDeepAI/NTv3_100M_pre"><code>InstaDeepAI/NTv3_100M_pre</code></a></div>
35
+ <div><a href="https://huggingface.co/InstaDeepAI/NTv3_650M_pre"><code>InstaDeepAI/NTv3_650M_pre</code></a></div>
36
+ </div>
37
+ </li>
38
+ <li>🎯 Post-trained checkpoints:
39
+ <div style="margin-top: 8px; margin-left: 0;">
40
+ <div><a href="https://huggingface.co/InstaDeepAI/NTv3_100M_pos"><code>InstaDeepAI/NTv3_100M_pos</code></a></div>
41
+ <div><a href="https://huggingface.co/InstaDeepAI/NTv3_650M_pos"><code>InstaDeepAI/NTv3_650M_pos</code></a></div>
42
+ </div>
43
+ </li>
44
+ </ul>
45
+ <table>
46
+ <thead>
47
+ <tr>
48
+ <th>Model</th>
49
+ <th>Size</th>
50
+ <th>Pre-training</th>
51
+ <th>Post-training</th>
52
+ <th>Tasks</th>
53
+ </tr>
54
+ </thead>
55
+ <tbody>
56
+ <tr>
57
+ <td><strong>NTv3-8M</strong></td>
58
+ <td>8M params</td>
59
+ <td>MLM</td>
60
+ <td>❌</td>
61
+ <td>Embeddings, light inference</td>
62
+ </tr>
63
+ <tr>
64
+ <td><strong>NTv3-100M</strong></td>
65
+ <td>100M params</td>
66
+ <td>MLM</td>
67
+ <td><span class="checkmark">✅</span></td>
68
+ <td>Tracks, annotation</td>
69
+ </tr>
70
+ <tr>
71
+ <td><strong>NTv3-650M</strong></td>
72
+ <td>650M params</td>
73
+ <td>MLM</td>
74
+ <td><span class="checkmark">✅</span></td>
75
+ <td>Tracks, annotation, best accuracy</td>
76
+ </tr>
77
+ </tbody>
78
+ </table>
79
+ </div>
80
+
81
+ <div class="card-stack">
82
+ <div class="card">
83
+ <h2>📓 Tutorial notebooks (browse <a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/tree/main/notebooks_tutorials" target="_blank" rel="noopener">folder</a>)</h2>
84
+ <ul>
85
+ <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_tutorials/00_quickstart_inference.ipynb" target="_blank" rel="noopener">🚀 00 — Quickstart inference</a></li>
86
+ <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_tutorials/01_tracks_prediction.ipynb" target="_blank" rel="noopener">📊 01 — Tracks prediction</a></li>
87
+ <li>🎯 02 — Fine-tune on bigwig tracks</li>
88
+ <li>🔍 03 — Model interpretation</li>
89
+ <li>🧪 04 — Training the NTv3 generative model</li>
90
+ </ul>
91
+ </div>
92
+ <div class="card">
93
+ <h2>📓 Pipelines notebooks (browse <a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/tree/main/notebooks_pipelines" target="_blank" rel="noopener">folder</a>)</h2>
94
+ <ul>
95
+ <li>🎯 01 — Generate bigwig predictions for selected tracks</li>
96
+ <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_pipelines/02_genome_annotation.ipynb" target="_blank" rel="noopener">🏷️ 02 — Genome annotation / segmentation</a></li>
97
+ <li>🎯 03 — Fine-tune on bigwig tracks</li>
98
+ <li>🔍 04 — Interpret a given genomic region</li>
99
+ <li>🧪 05 — Sequence generation</li>
100
+ </ul>
101
+ </div>
102
+ <div class="card">
103
+ <h2>🔗 Links</h2>
104
+ <ul>
105
+ <li>📄 Paper: (add link)</li>
106
+ <li><a href="https://github.com/instadeepai/nucleotide-transformer">💻 JAX model code (GitHub)</a></li>
107
+ <li><a href="https://huggingface.co/collections/InstaDeepAI/nucleotide-transformer-v3" target="_blank" rel="noopener">🎯 HF Model Collection (all NTv3 models)</a></li>
108
+ <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/tree/main/notebooks" target="_blank" rel="noopener">📓 All notebooks</a></li>
109
+ <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3_benchmark" target="_blank" rel="noopener">🏆 NTv3 benchmark leaderboard</a></li>
110
+ </ul>
111
+ </div>
112
+ </div>
113
+
114
+ <div class="card">
115
+ <h2>🤖 Load a pre-trained model</h2>
116
+ <p>Here is an example of how to load and use a pre-trained NTv3 model.</p>
117
+ <div class="code"><pre><code class="language-python">from transformers import AutoTokenizer, AutoModelForMaskedLM
118
+
119
+ model_name = "InstaDeepAI/NTv3_650M_pre"
120
+
121
+ # Load model and tokenizer
122
+ model = AutoModelForMaskedLM.from_pretrained(model_name, trust_remote_code=True)
123
+ tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
124
+
125
+ # Tokenize input sequences
126
+ batch = tok(["ATCGNATCG", "ACGT"], add_special_tokens=False, padding=True, pad_to_multiple_of=128, return_tensors="pt")
127
+
128
+ # Run model
129
+ out = model(
130
+ **batch,
131
+ output_hidden_states=True,
132
+ output_attentions=True
133
+ )
134
+
135
+ # Print output shapes
136
+ print(out.logits.shape) # (B, L, V = 11)
137
+ print(len(out.hidden_states)) # convs + transformers + deconvs
138
+ print(len(out.attentions)) # equals transformer layers = 12
139
+ </code></pre></div>
140
+ <p>Model embeddings can be used for fine-tuning on downstream tasks.</p>
141
+
142
+ <p style="margin-top: 40px;">TODO: add a pipeline for fine-tuning on functional tracks or genome annotation.</p>
143
+ </div>
144
+
145
+ <div class="card">
146
+ <h2>💻 Use a post-trained model</h2>
147
+ <p>Here is a quick example of how to use the post-trained NTv3 650M model to predict tracks for a human genomic window.</p>
148
+ <div class="code"><pre><code class="language-python">from transformers import pipeline
149
+ import torch
150
+
151
+ model_name = "InstaDeepAI/NTv3_650M_pos"
152
+
153
+ ntv3_tracks = pipeline(
154
+ "ntv3-tracks",
155
+ model=model_name,
156
+ trust_remote_code=True,
157
+ device=0 if torch.cuda.is_available() else -1,
158
+ )
159
+
160
+ # Run track prediction
161
+ out = ntv3_tracks(
162
+ {
163
+ "chrom": "chr19",
164
+ "start": 6_700_000,
165
+ "end": 6_831_072,
166
+ "species": "human"
167
+ }
168
+ )
169
+
170
+ # Print output shapes
171
+ # 7k human tracks over 37.5 % center region of the input sequence
172
+ print("bigwig_tracks_logits:", tuple(out.bigwig_tracks_logits.shape))
173
+ # Location of 21 genomic elements over 37.5 % center region of the input sequence
174
+ print("bed_tracks_logits:", tuple(out.bed_tracks_logits.shape))
175
+ # Language model logits for whole sequence over vocabulary
176
+ print("language model logits:", tuple(out.mlm_logits.shape))</code></pre></div>
177
+ <p>Predictions can also be plotted for a subset of functional tracks and genomic elements:</p>
178
+ <div class="code"><pre><code class="language-python">tracks_to_plot = {
179
+ "K562 RNA-seq": "ENCSR056HPM",
180
+ "K562 DNAse": "ENCSR921NMD",
181
+ "K562 H3k4me3": "ENCSR000DWD",
182
+ "K562 CTCF": "ENCSR000AKO",
183
+ "HepG2 RNA-seq": "ENCSR561FEE_P",
184
+ "HepG2 DNAse": "ENCSR000EJV",
185
+ "HepG2 H3k4me3": "ENCSR000AMP",
186
+ "HepG2 CTCF": "ENCSR000BIE",
187
+ }
188
+ elements_to_plot = ["protein_coding_gene", "exon", "intron", "splice_donor", "splice_acceptor"]
189
+
190
+ out = ntv3_tracks(
191
+ {"chrom": "chr19", "start": 6_700_000, "end": 6_831_072, "species": "human"},
192
+ plot=True,
193
+ tracks_to_plot=tracks_to_plot,
194
+ elements_to_plot=elements_to_plot,
195
+ )</code></pre></div>
196
+ <img src="assets/output_tracks.png" alt="Output tracks visualization" style="max-width: 100%; margin-top: 20px;" />
197
+ </div>
198
+ </div>
199
+