Spaces:
Running
Running
Commit
·
42f0385
1
Parent(s):
680e6ef
feat: add new tab
Browse files- README.md +1 -1
- app.py +11 -0
- app_tracks.py +158 -0
- index.html +148 -192
- notebooks_pipelines/02_genome_annotation.ipynb +14 -127
- notebooks_pipelines/NTv3_650M_pos_hg38_chr19_6700000_6831072.gff3 +107 -0
- requirements.txt +7 -0
- tabs/demo.html +88 -0
- tabs/home.html +199 -0
README.md
CHANGED
|
@@ -3,7 +3,7 @@ title: NTv3 — Foundation Models for Long-Range Genomics
|
|
| 3 |
emoji: 🧬
|
| 4 |
colorFrom: indigo
|
| 5 |
colorTo: blue
|
| 6 |
-
sdk:
|
| 7 |
pinned: false
|
| 8 |
---
|
| 9 |
|
|
|
|
| 3 |
emoji: 🧬
|
| 4 |
colorFrom: indigo
|
| 5 |
colorTo: blue
|
| 6 |
+
sdk: gradio
|
| 7 |
pinned: false
|
| 8 |
---
|
| 9 |
|
app.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Main Gradio app entry point for NTv3 Space.
|
| 3 |
+
This file imports the track prediction demo from app_tracks.py.
|
| 4 |
+
"""
|
| 5 |
+
from app_tracks import demo_interface
|
| 6 |
+
|
| 7 |
+
# Hugging Face Spaces (Gradio SDK) expects the app object to be exposed as a module-level variable named `demo`.
|
| 8 |
+
demo = demo_interface
|
| 9 |
+
|
| 10 |
+
if __name__ == "__main__":
|
| 11 |
+
demo.launch(server_name="0.0.0.0", share=False)
|
app_tracks.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Gradio app for NTv3 track prediction demo.
|
| 3 |
+
This module contains the interactive track prediction interface.
|
| 4 |
+
"""
|
| 5 |
+
import gradio as gr
|
| 6 |
+
import torch
|
| 7 |
+
from transformers import pipeline
|
| 8 |
+
import os
|
| 9 |
+
|
| 10 |
+
# Initialize the pipeline (will be loaded on first use)
|
| 11 |
+
ntv3_tracks = None
|
| 12 |
+
|
| 13 |
+
def load_pipeline():
    """Return the shared NTv3 tracks pipeline, building it on the first call.

    The pipeline is stored in the module-level ``ntv3_tracks`` variable so
    later calls reuse the same object instead of constructing it again.
    """
    global ntv3_tracks

    # Already built by an earlier call: reuse it.
    if ntv3_tracks is not None:
        return ntv3_tracks

    # Device index 0 when CUDA is available, -1 otherwise.
    device_index = 0 if torch.cuda.is_available() else -1
    ntv3_tracks = pipeline(
        "ntv3-tracks",
        model="InstaDeepAI/NTv3_650M_pos",
        trust_remote_code=True,
        device=device_index,
    )
    return ntv3_tracks
|
| 25 |
+
|
| 26 |
+
def predict_tracks(chrom, start, end, species):
    """Run NTv3 track prediction for a genomic region and summarise the result.

    Args:
        chrom: Chromosome name, e.g. ``"chr19"``. Surrounding whitespace is
            ignored.
        start: Start position in base pairs. ``0`` is a valid value.
        end: End position in base pairs; must be greater than ``start``.
        species: Species name; it is lower-cased before being passed to the
            pipeline.

    Returns:
        A display string: either a summary of the output tensor shapes, or a
        message starting with "❌" explaining why the request was rejected
        or failed. Exceptions are not propagated to the caller.
    """
    try:
        # Validate inputs. Missing values are detected explicitly: a plain
        # truthiness test would also reject the legitimate coordinate 0.
        chrom = (chrom or "").strip()
        species = (species or "").strip()
        if not chrom or not species or start in (None, "") or end in (None, ""):
            return "❌ Please fill in all fields."

        start = int(start)
        end = int(end)

        if start < 0:
            return "❌ Start position cannot be negative."

        if start >= end:
            return "❌ Start position must be less than end position."

        if end - start > 1_000_000:
            return "❌ Region size cannot exceed 1 Mb (1,000,000 bp)."

        # Load pipeline (built on first use, reused afterwards)
        pipe = load_pipeline()

        # Run prediction
        out = pipe({
            "chrom": chrom,
            "start": start,
            "end": end,
            "species": species.lower(),
        })

        # Format output
        result = f"""✅ Prediction completed successfully!

📊 Output Shapes:
• BigWig tracks logits: {tuple(out.bigwig_tracks_logits.shape)}
→ {out.bigwig_tracks_logits.shape[1]} functional tracks over the center region

• BED tracks logits: {tuple(out.bed_tracks_logits.shape)}
→ {out.bed_tracks_logits.shape[1]} genomic elements over the center region

• Language model logits: {tuple(out.mlm_logits.shape)}
→ MLM predictions for the entire sequence

📝 Note: Predictions are made over 37.5% of the center region of the input sequence.
"""
        return result

    except Exception as e:
        # UI boundary: report any failure as text rather than raising.
        return f"❌ Error: {str(e)}"
|
| 72 |
+
|
| 73 |
+
# Create the track prediction demo interface (embedded in HTML)
|
| 74 |
+
def create_demo_interface():
    """Assemble the Gradio Blocks UI for the track prediction demo.

    The layout has an introduction, a row with the input widgets and the
    results textbox, a notes section, and a set of example regions. The
    button click is wired to ``predict_tracks``.

    Returns:
        The constructed ``gr.Blocks`` object.
    """
    # NOTE(review): container nesting below is reconstructed from statement
    # order — confirm against the original layout.
    species_choices = [
        "human", "mouse", "rat", "chicken", "zebrafish",
        "fruitfly", "worm", "yeast", "arabidopsis", "rice",
        "maize", "soybean", "tomato", "potato", "grape",
        "poplar", "medicago", "lotus", "brachypodium", "sorghum",
        "barley", "wheat", "oats", "rye",
    ]
    example_regions = [
        ["chr19", 6_700_000, 6_831_072, "human"],
        ["chr1", 100_000, 200_000, "human"],
        ["chr2", 50_000, 150_000, "mouse"],
    ]

    with gr.Blocks(title="NTv3 Track Prediction Demo", theme=gr.themes.Soft()) as blocks:
        gr.Markdown("""
        # 🧬 NTv3 Interactive Track Prediction Demo

        This demo allows you to run the NTv3 650M post-trained model to predict functional tracks and genomic elements for any genomic region.

        **Model:** `InstaDeepAI/NTv3_650M_pos`
        """)

        with gr.Row():
            # Input widgets and the run button.
            with gr.Column():
                chrom_box = gr.Textbox(
                    label="Chromosome",
                    placeholder="e.g., chr19",
                    value="chr19",
                    info="Chromosome name (e.g., chr1, chr19)",
                )
                start_box = gr.Number(
                    label="Start Position",
                    placeholder="e.g., 6700000",
                    value=6_700_000,
                    info="Start position in base pairs",
                )
                end_box = gr.Number(
                    label="End Position",
                    placeholder="e.g., 6831072",
                    value=6_831_072,
                    info="End position in base pairs",
                )
                species_box = gr.Dropdown(
                    label="Species",
                    choices=species_choices,
                    value="human",
                    info="Select the species (24 supported species)",
                )
                run_button = gr.Button("🚀 Run Prediction", variant="primary")

            # Read-only results area.
            with gr.Column():
                results_box = gr.Textbox(
                    label="Results",
                    lines=15,
                    interactive=False,
                    placeholder="Results will appear here after running prediction...",
                )

        gr.Markdown("""
        ### 📝 Notes:
        - The model predicts ~7k functional tracks and 21 genomic elements
        - Predictions are made over 37.5% of the center region of the input sequence
        - Maximum region size: 1 Mb (1,000,000 base pairs)
        - First run may take longer as the model loads
        """)

        # The same four inputs feed both the click handler and the examples.
        region_inputs = [chrom_box, start_box, end_box, species_box]

        run_button.click(
            fn=predict_tracks,
            inputs=region_inputs,
            outputs=results_box,
        )

        gr.Examples(
            examples=example_regions,
            inputs=region_inputs,
        )

    return blocks
|
| 151 |
+
|
| 152 |
+
# Create the demo interface
|
| 153 |
+
demo_interface = create_demo_interface()
|
| 154 |
+
|
| 155 |
+
# If running this file directly (for local testing)
|
| 156 |
+
if __name__ == "__main__":
|
| 157 |
+
demo_interface.launch(server_name="0.0.0.0", share=False)
|
| 158 |
+
|
index.html
CHANGED
|
@@ -199,9 +199,56 @@
|
|
| 199 |
border-radius: 12px;
|
| 200 |
}
|
| 201 |
.footer { margin-top: 22px; color: var(--muted); font-size: 13px; }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
@media (max-width: 860px) {
|
| 203 |
.card { grid-column: span 12; }
|
| 204 |
h1 { font-size: 28px; }
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
}
|
| 206 |
</style>
|
| 207 |
</head>
|
|
@@ -223,215 +270,124 @@
|
|
| 223 |
</div>
|
| 224 |
</div>
|
| 225 |
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
<
|
| 229 |
-
|
| 230 |
-
</p>
|
| 231 |
-
<p>
|
| 232 |
-
Beyond prediction, NTv3 can be fine-tuned into a controllable generative model via masked-diffusion language modeling, allowing targeted design of regulatory sequences (for example, enhancers with specified activity and promoter selectivity) that have been validated experimentally.
|
| 233 |
-
</p>
|
| 234 |
</div>
|
| 235 |
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
</div>
|
| 240 |
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
<li>📏 <strong>1 Mb long context at nucleotide resolution</strong> — ~100× longer than typical genomics models.</li>
|
| 245 |
-
<li>🏗️ <strong>Unified architecture</strong> for: masked language modeling, functional-track prediction, genome annotation, and sequence generation.</li>
|
| 246 |
-
<li>🌍 <strong>Cross-species generalization</strong> across 24 animals + plants with a shared conditioned representation space.</li>
|
| 247 |
-
<li>⚡ <strong>U-Net–style architecture</strong> improves stability and GPU efficiency on very long sequences.</li>
|
| 248 |
-
<li>🎯 <strong>Controllable generative modeling</strong>, enabling targeted enhancer/promoter engineering validated by experimental assays.</li>
|
| 249 |
-
</ul>
|
| 250 |
-
</div>
|
| 251 |
-
|
| 252 |
-
<div class="grid">
|
| 253 |
-
<div class="card">
|
| 254 |
-
<h2>🤖 Models (see <a href="https://huggingface.co/collections/InstaDeepAI/nucleotide-transformer-v3" target="_blank" rel="noopener">collection</a>)</h2>
|
| 255 |
-
<ul>
|
| 256 |
-
<li>📦 Pretrained checkpoints:
|
| 257 |
-
<div style="margin-top: 8px; margin-left: 0;">
|
| 258 |
-
<div><a href="https://huggingface.co/InstaDeepAI/NTv3_8M_pre"><code>InstaDeepAI/NTv3_8M_pre</code></a></div>
|
| 259 |
-
<div><a href="https://huggingface.co/InstaDeepAI/NTv3_100M_pre"><code>InstaDeepAI/NTv3_100M_pre</code></a></div>
|
| 260 |
-
<div><a href="https://huggingface.co/InstaDeepAI/NTv3_650M_pre"><code>InstaDeepAI/NTv3_650M_pre</code></a></div>
|
| 261 |
-
</div>
|
| 262 |
-
</li>
|
| 263 |
-
<li>🎯 Post-trained checkpoints:
|
| 264 |
-
<div style="margin-top: 8px; margin-left: 0;">
|
| 265 |
-
<div><a href="https://huggingface.co/InstaDeepAI/NTv3_100M_pos"><code>InstaDeepAI/NTv3_100M_pos</code></a></div>
|
| 266 |
-
<div><a href="https://huggingface.co/InstaDeepAI/NTv3_650M_pos"><code>InstaDeepAI/NTv3_650M_pos</code></a></div>
|
| 267 |
-
</div>
|
| 268 |
-
</li>
|
| 269 |
-
</ul>
|
| 270 |
-
<table>
|
| 271 |
-
<thead>
|
| 272 |
-
<tr>
|
| 273 |
-
<th>Model</th>
|
| 274 |
-
<th>Size</th>
|
| 275 |
-
<th>Pre-training</th>
|
| 276 |
-
<th>Post-training</th>
|
| 277 |
-
<th>Tasks</th>
|
| 278 |
-
</tr>
|
| 279 |
-
</thead>
|
| 280 |
-
<tbody>
|
| 281 |
-
<tr>
|
| 282 |
-
<td><strong>NTv3-8M</strong></td>
|
| 283 |
-
<td>8M params</td>
|
| 284 |
-
<td>MLM</td>
|
| 285 |
-
<td>❌</td>
|
| 286 |
-
<td>Embeddings, light inference</td>
|
| 287 |
-
</tr>
|
| 288 |
-
<tr>
|
| 289 |
-
<td><strong>NTv3-100M</strong></td>
|
| 290 |
-
<td>100M params</td>
|
| 291 |
-
<td>MLM</td>
|
| 292 |
-
<td><span class="checkmark">✅</span></td>
|
| 293 |
-
<td>Tracks, annotation</td>
|
| 294 |
-
</tr>
|
| 295 |
-
<tr>
|
| 296 |
-
<td><strong>NTv3-650M</strong></td>
|
| 297 |
-
<td>650M params</td>
|
| 298 |
-
<td>MLM</td>
|
| 299 |
-
<td><span class="checkmark">✅</span></td>
|
| 300 |
-
<td>Tracks, annotation, best accuracy</td>
|
| 301 |
-
</tr>
|
| 302 |
-
</tbody>
|
| 303 |
-
</table>
|
| 304 |
-
</div>
|
| 305 |
-
|
| 306 |
-
<div class="card-stack">
|
| 307 |
-
<div class="card">
|
| 308 |
-
<h2>📓 Tutorial notebooks (browse <a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/tree/main/notebooks_tutorials" target="_blank" rel="noopener">folder</a>)</h2>
|
| 309 |
-
<ul>
|
| 310 |
-
<li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_tutorials/00_quickstart_inference.ipynb" target="_blank" rel="noopener">🚀 00 — Quickstart inference</a></li>
|
| 311 |
-
<li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_tutorials/01_tracks_prediction.ipynb" target="_blank" rel="noopener">📊 01 — Tracks prediction</a></li>
|
| 312 |
-
<li>🎯 02 — Fine-tune on bigwig tracks</li>
|
| 313 |
-
<li>🔍 03 — Model interpretation</li>
|
| 314 |
-
<li>🧪 04 — Training NTv3 generative </li>
|
| 315 |
-
</ul>
|
| 316 |
</div>
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
<li>🎯 03 — Fine-tune on bigwig tracks</li>
|
| 323 |
-
<li>🔍 04 — Interpret a given genomic region</li>
|
| 324 |
-
<li>🧪 05 — Sequence generation</li>
|
| 325 |
-
</ul>
|
| 326 |
-
</div>
|
| 327 |
-
<div class="card">
|
| 328 |
-
<h2>🔗 Links</h2>
|
| 329 |
-
<ul>
|
| 330 |
-
<li>📄 Paper: (add link)</li>
|
| 331 |
-
<li><a href="https://github.com/instadeepai/nucleotide-transformer">💻 JAX model code (GitHub)</a></li>
|
| 332 |
-
<li><a href="https://huggingface.co/collections/InstaDeepAI/nucleotide-transformer-v3" target="_blank" rel="noopener">🎯 HF Model Collection (all NTv3 models)</a></li>
|
| 333 |
-
<li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/tree/main/notebooks" target="_blank" rel="noopener">📓 All notebooks</a></li>
|
| 334 |
-
<li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3_benchmark" target="_blank" rel="noopener">🏆 NTv3 benchmark leaderboard</a></li>
|
| 335 |
-
</ul>
|
| 336 |
-
</div>
|
| 337 |
-
</div>
|
| 338 |
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 343 |
|
| 344 |
-
|
|
|
|
| 345 |
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
|
|
|
|
|
|
|
|
|
| 349 |
|
| 350 |
-
|
| 351 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 359 |
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
|
|
|
| 366 |
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 375 |
|
| 376 |
-
|
|
|
|
|
|
|
|
|
|
| 377 |
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
)
|
| 384 |
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
"chrom": "chr19",
|
| 389 |
-
"start": 6_700_000,
|
| 390 |
-
"end": 6_831_072,
|
| 391 |
-
"species": "human"
|
| 392 |
-
}
|
| 393 |
-
)
|
| 394 |
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
# Location of 21 genomic elements over 37.5 % center region of the input sequence
|
| 399 |
-
print("bed_tracks_logits:", tuple(out.bed_tracks_logits.shape))
|
| 400 |
-
# Language model logits for whole sequence over vocabulary
|
| 401 |
-
print("language model logits:", tuple(out.mlm_logits.shape))</code></pre></div>
|
| 402 |
-
<p>Predictions can also be plotted for a subset of functional tracks and genomic elements:</p>
|
| 403 |
-
<div class="code"><pre><code class="language-python">tracks_to_plot = {
|
| 404 |
-
"K562 RNA-seq": "ENCSR056HPM",
|
| 405 |
-
"K562 DNAse": "ENCSR921NMD",
|
| 406 |
-
"K562 H3k4me3": "ENCSR000DWD",
|
| 407 |
-
"K562 CTCF": "ENCSR000AKO",
|
| 408 |
-
"HepG2 RNA-seq": "ENCSR561FEE_P",
|
| 409 |
-
"HepG2 DNAse": "ENCSR000EJV",
|
| 410 |
-
"HepG2 H3k4me3": "ENCSR000AMP",
|
| 411 |
-
"HepG2 CTCF": "ENCSR000BIE",
|
| 412 |
-
}
|
| 413 |
-
elements_to_plot = ["protein_coding_gene", "exon", "intron", "splice_donor", "splice_acceptor"]
|
| 414 |
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
elements_to_plot=elements_to_plot,
|
| 420 |
-
)</code></pre></div>
|
| 421 |
-
<img src="assets/output_tracks.png" alt="Output tracks visualization" style="max-width: 100%; margin-top: 20px;" />
|
| 422 |
-
</div>
|
| 423 |
-
</div>
|
| 424 |
-
|
| 425 |
-
<!-- <div class="paper-summary">
|
| 426 |
-
<h2>📄 A foundational model for joint sequence-function multi-species modeling at scale for long-range genomic prediction</h2>
|
| 427 |
-
<img src="assets/paper_summary.png" alt="NTv3 Paper Summary" />
|
| 428 |
-
</div> -->
|
| 429 |
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
</body>
|
| 437 |
</html>
|
|
|
|
| 199 |
border-radius: 12px;
|
| 200 |
}
|
| 201 |
.footer { margin-top: 22px; color: var(--muted); font-size: 13px; }
|
| 202 |
+
|
| 203 |
+
/* Tab navigation styles */
|
| 204 |
+
.tabs {
|
| 205 |
+
margin-top: 24px;
|
| 206 |
+
display: flex;
|
| 207 |
+
gap: 8px;
|
| 208 |
+
border-bottom: 2px solid var(--border);
|
| 209 |
+
overflow-x: auto;
|
| 210 |
+
}
|
| 211 |
+
.tab-button {
|
| 212 |
+
padding: 12px 20px;
|
| 213 |
+
background: transparent;
|
| 214 |
+
border: none;
|
| 215 |
+
border-bottom: 2px solid transparent;
|
| 216 |
+
color: var(--muted);
|
| 217 |
+
font-family: var(--sans);
|
| 218 |
+
font-size: 14px;
|
| 219 |
+
font-weight: 500;
|
| 220 |
+
cursor: pointer;
|
| 221 |
+
transition: all 0.2s ease;
|
| 222 |
+
white-space: nowrap;
|
| 223 |
+
margin-bottom: -2px;
|
| 224 |
+
}
|
| 225 |
+
.tab-button:hover {
|
| 226 |
+
color: var(--text);
|
| 227 |
+
background: rgba(255, 255, 255, 0.03);
|
| 228 |
+
}
|
| 229 |
+
.tab-button.active {
|
| 230 |
+
color: var(--link);
|
| 231 |
+
border-bottom-color: var(--link);
|
| 232 |
+
}
|
| 233 |
+
.tab-content {
|
| 234 |
+
display: none;
|
| 235 |
+
animation: fadeIn 0.3s ease;
|
| 236 |
+
}
|
| 237 |
+
.tab-content.active {
|
| 238 |
+
display: block;
|
| 239 |
+
}
|
| 240 |
+
@keyframes fadeIn {
|
| 241 |
+
from { opacity: 0; transform: translateY(8px); }
|
| 242 |
+
to { opacity: 1; transform: translateY(0); }
|
| 243 |
+
}
|
| 244 |
+
|
| 245 |
@media (max-width: 860px) {
|
| 246 |
.card { grid-column: span 12; }
|
| 247 |
h1 { font-size: 28px; }
|
| 248 |
+
.tab-button {
|
| 249 |
+
padding: 10px 16px;
|
| 250 |
+
font-size: 13px;
|
| 251 |
+
}
|
| 252 |
}
|
| 253 |
</style>
|
| 254 |
</head>
|
|
|
|
| 270 |
</div>
|
| 271 |
</div>
|
| 272 |
|
| 273 |
+
<!-- Tab Navigation -->
|
| 274 |
+
<div class="tabs">
|
| 275 |
+
<button class="tab-button active" data-tab="home">🏠 Home</button>
|
| 276 |
+
<button class="tab-button" data-tab="demo">💻 Code Demo</button>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
</div>
|
| 278 |
|
| 279 |
+
<!-- Home Tab (Content loaded from tabs/home.html) -->
|
| 280 |
+
<div id="home" class="tab-content active">
|
| 281 |
+
<!-- Content will be loaded dynamically -->
|
| 282 |
</div>
|
| 283 |
|
| 284 |
+
<!-- Code Demo Tab (Content loaded from tabs/demo.html) -->
|
| 285 |
+
<div id="demo" class="tab-content">
|
| 286 |
+
<!-- Content will be loaded dynamically -->
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 287 |
</div>
|
| 288 |
+
|
| 289 |
+
<!-- <div class="paper-summary">
|
| 290 |
+
<h2>📄 A foundational model for joint sequence-function multi-species modeling at scale for long-range genomic prediction</h2>
|
| 291 |
+
<img src="assets/paper_summary.png" alt="NTv3 Paper Summary" />
|
| 292 |
+
</div> -->
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
|
| 294 |
+
<p class="footer">
|
| 295 |
+
© instadeep-ai — NTv3 companion Space.
|
| 296 |
+
</p>
|
| 297 |
+
</div>
|
| 298 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/prism/1.29.0/components/prism-core.min.js"></script>
|
| 299 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/prism/1.29.0/plugins/autoloader/prism-autoloader.min.js"></script>
|
| 300 |
+
<script>
|
| 301 |
+
// Tab content mapping
|
| 302 |
+
const tabFiles = {
|
| 303 |
+
'home': 'tabs/home.html',
|
| 304 |
+
'demo': 'tabs/demo.html'
|
| 305 |
+
};
|
| 306 |
|
| 307 |
+
// Cache for loaded tab content
|
| 308 |
+
const tabCache = {};
|
| 309 |
|
| 310 |
+
// Function to load tab content
|
| 311 |
+
// Resolve the HTML for a tab: serve it from tabCache when present, otherwise
// fetch the file listed in tabFiles and remember the result. Failed loads are
// not cached; a fallback message is returned instead.
async function loadTabContent(tabId) {
  if (tabCache[tabId]) {
    return tabCache[tabId];
  }

  const filePath = tabFiles[tabId];
  if (!filePath) {
    console.error(`No file path defined for tab: ${tabId}`);
    return '';
  }

  try {
    const response = await fetch(filePath);
    if (response.ok) {
      const html = await response.text();
      tabCache[tabId] = html;
      return html;
    }
    // Non-2xx responses are routed through the same handler as network errors.
    throw new Error(`Failed to load ${filePath}: ${response.statusText}`);
  } catch (error) {
    console.error(`Error loading tab content for ${tabId}:`, error);
    return `<div class="summary"><p>Error loading content. Please refresh the page.</p></div>`;
  }
}
|
| 337 |
|
| 338 |
+
// Function to show a tab
|
| 339 |
+
// Fill a tab panel with its content the first time it is shown, then run
// Prism highlighting on any code blocks in the inserted markup.
async function showTab(tabId) {
  const panel = document.getElementById(tabId);
  if (!panel) {
    console.error(`Tab element not found: ${tabId}`);
    return;
  }

  // Panels are populated once; data-loaded marks the ones already filled.
  if (panel.dataset.loaded) {
    return;
  }

  panel.innerHTML = await loadTabContent(tabId);
  panel.dataset.loaded = 'true';

  // Prism is loaded from a CDN and may be unavailable; skip highlighting then.
  if (typeof Prism === 'undefined') {
    return;
  }
  for (const block of panel.querySelectorAll('code[class*="language-"]')) {
    Prism.highlightElement(block);
  }
}
|
| 361 |
|
| 362 |
+
// Tab switching functionality
|
| 363 |
+
document.addEventListener('DOMContentLoaded', () => {
  const buttons = document.querySelectorAll('.tab-button');
  const panels = document.querySelectorAll('.tab-content');

  // Populate whichever panel is marked active in the markup.
  const initialPanel = document.querySelector('.tab-content.active');
  if (initialPanel) {
    showTab(initialPanel.id);
  }

  // Move the "active" class to the clicked button and its panel, then make
  // sure that panel's content has been loaded.
  const activate = async (button) => {
    const targetId = button.getAttribute('data-tab');

    buttons.forEach((other) => other.classList.remove('active'));
    panels.forEach((panel) => panel.classList.remove('active'));

    button.classList.add('active');
    document.getElementById(targetId).classList.add('active');

    await showTab(targetId);
  };

  buttons.forEach((button) => {
    button.addEventListener('click', () => activate(button));
  });
});
|
| 391 |
+
</script>
|
| 392 |
</body>
|
| 393 |
</html>
|
notebooks_pipelines/02_genome_annotation.ipynb
CHANGED
|
@@ -29,16 +29,7 @@
|
|
| 29 |
"execution_count": 1,
|
| 30 |
"id": "2e2f5963",
|
| 31 |
"metadata": {},
|
| 32 |
-
"outputs": [
|
| 33 |
-
{
|
| 34 |
-
"name": "stdout",
|
| 35 |
-
"output_type": "stream",
|
| 36 |
-
"text": [
|
| 37 |
-
"\u001b[33mWARNING: 401 Error, Credentials not correct for https://gitlab.com/api/v4/projects/36813343/packages/pypi/simple/igv-notebook/\u001b[0m\u001b[33m\n",
|
| 38 |
-
"\u001b[0m"
|
| 39 |
-
]
|
| 40 |
-
}
|
| 41 |
-
],
|
| 42 |
"source": [
|
| 43 |
"# Install dependencies\n",
|
| 44 |
"!pip -q install \"transformers>=4.55\" \"huggingface_hub>=0.23\" safetensors torch pyfaidx requests seaborn matplotlib igv_notebook"
|
|
@@ -127,28 +118,14 @@
|
|
| 127 |
},
|
| 128 |
{
|
| 129 |
"cell_type": "code",
|
| 130 |
-
"execution_count":
|
| 131 |
"id": "4857d15c",
|
| 132 |
"metadata": {},
|
| 133 |
"outputs": [
|
| 134 |
{
|
| 135 |
"data": {
|
| 136 |
"application/vnd.jupyter.widget-view+json": {
|
| 137 |
-
"model_id": "
|
| 138 |
-
"version_major": 2,
|
| 139 |
-
"version_minor": 0
|
| 140 |
-
},
|
| 141 |
-
"text/plain": [
|
| 142 |
-
"config.json: 0%| | 0.00/338k [00:00<?, ?B/s]"
|
| 143 |
-
]
|
| 144 |
-
},
|
| 145 |
-
"metadata": {},
|
| 146 |
-
"output_type": "display_data"
|
| 147 |
-
},
|
| 148 |
-
{
|
| 149 |
-
"data": {
|
| 150 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 151 |
-
"model_id": "2468d781d0b7409791c5079ee9860a81",
|
| 152 |
"version_major": 2,
|
| 153 |
"version_minor": 0
|
| 154 |
},
|
|
@@ -165,105 +142,15 @@
|
|
| 165 |
"text": [
|
| 166 |
"A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/NTv3_650M_pos:\n",
|
| 167 |
"- ntv3_gff_pipeline.py\n",
|
| 168 |
-
". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n"
|
| 169 |
-
]
|
| 170 |
-
},
|
| 171 |
-
{
|
| 172 |
-
"data": {
|
| 173 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 174 |
-
"model_id": "fabadaa764ba4da799c0d43b12ac42b1",
|
| 175 |
-
"version_major": 2,
|
| 176 |
-
"version_minor": 0
|
| 177 |
-
},
|
| 178 |
-
"text/plain": [
|
| 179 |
-
"model.safetensors: 0%| | 0.00/2.72G [00:00<?, ?B/s]"
|
| 180 |
-
]
|
| 181 |
-
},
|
| 182 |
-
"metadata": {},
|
| 183 |
-
"output_type": "display_data"
|
| 184 |
-
},
|
| 185 |
-
{
|
| 186 |
-
"data": {
|
| 187 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 188 |
-
"model_id": "9188715aa52d48a2b54b6b89a015f1da",
|
| 189 |
-
"version_major": 2,
|
| 190 |
-
"version_minor": 0
|
| 191 |
-
},
|
| 192 |
-
"text/plain": [
|
| 193 |
-
"tokenizer_config.json: 0%| | 0.00/1.47k [00:00<?, ?B/s]"
|
| 194 |
-
]
|
| 195 |
-
},
|
| 196 |
-
"metadata": {},
|
| 197 |
-
"output_type": "display_data"
|
| 198 |
-
},
|
| 199 |
-
{
|
| 200 |
-
"data": {
|
| 201 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 202 |
-
"model_id": "f457beef2cdf4ecca076b589a95edf2b",
|
| 203 |
-
"version_major": 2,
|
| 204 |
-
"version_minor": 0
|
| 205 |
-
},
|
| 206 |
-
"text/plain": [
|
| 207 |
-
"vocab.json: 0%| | 0.00/138 [00:00<?, ?B/s]"
|
| 208 |
-
]
|
| 209 |
-
},
|
| 210 |
-
"metadata": {},
|
| 211 |
-
"output_type": "display_data"
|
| 212 |
-
},
|
| 213 |
-
{
|
| 214 |
-
"data": {
|
| 215 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 216 |
-
"model_id": "b900fc305af84983b820e385d239dc29",
|
| 217 |
-
"version_major": 2,
|
| 218 |
-
"version_minor": 0
|
| 219 |
-
},
|
| 220 |
-
"text/plain": [
|
| 221 |
-
"special_tokens_map.json: 0%| | 0.00/149 [00:00<?, ?B/s]"
|
| 222 |
-
]
|
| 223 |
-
},
|
| 224 |
-
"metadata": {},
|
| 225 |
-
"output_type": "display_data"
|
| 226 |
-
},
|
| 227 |
-
{
|
| 228 |
-
"name": "stderr",
|
| 229 |
-
"output_type": "stream",
|
| 230 |
-
"text": [
|
| 231 |
"Device set to use cpu\n"
|
| 232 |
]
|
| 233 |
},
|
| 234 |
-
{
|
| 235 |
-
"data": {
|
| 236 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 237 |
-
"model_id": "7fbd5f15218142e1b4a14474e96189b8",
|
| 238 |
-
"version_major": 2,
|
| 239 |
-
"version_minor": 0
|
| 240 |
-
},
|
| 241 |
-
"text/plain": [
|
| 242 |
-
"tokenizer_config.json: 0%| | 0.00/1.49k [00:00<?, ?B/s]"
|
| 243 |
-
]
|
| 244 |
-
},
|
| 245 |
-
"metadata": {},
|
| 246 |
-
"output_type": "display_data"
|
| 247 |
-
},
|
| 248 |
-
{
|
| 249 |
-
"data": {
|
| 250 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 251 |
-
"model_id": "c3ce29bcfd6b4f0681b0ac94809ef9ab",
|
| 252 |
-
"version_major": 2,
|
| 253 |
-
"version_minor": 0
|
| 254 |
-
},
|
| 255 |
-
"text/plain": [
|
| 256 |
-
"vocab.json: 0%| | 0.00/693 [00:00<?, ?B/s]"
|
| 257 |
-
]
|
| 258 |
-
},
|
| 259 |
-
"metadata": {},
|
| 260 |
-
"output_type": "display_data"
|
| 261 |
-
},
|
| 262 |
{
|
| 263 |
"name": "stdout",
|
| 264 |
"output_type": "stream",
|
| 265 |
"text": [
|
| 266 |
-
"Inference + decoding time:
|
| 267 |
]
|
| 268 |
}
|
| 269 |
],
|
|
@@ -302,7 +189,7 @@
|
|
| 302 |
},
|
| 303 |
{
|
| 304 |
"cell_type": "code",
|
| 305 |
-
"execution_count":
|
| 306 |
"id": "959cf79f",
|
| 307 |
"metadata": {},
|
| 308 |
"outputs": [
|
|
@@ -336,7 +223,7 @@
|
|
| 336 |
},
|
| 337 |
{
|
| 338 |
"cell_type": "code",
|
| 339 |
-
"execution_count":
|
| 340 |
"id": "84f013f6",
|
| 341 |
"metadata": {},
|
| 342 |
"outputs": [
|
|
@@ -379,14 +266,14 @@
|
|
| 379 |
},
|
| 380 |
{
|
| 381 |
"cell_type": "code",
|
| 382 |
-
"execution_count":
|
| 383 |
"id": "0904a5cb",
|
| 384 |
"metadata": {},
|
| 385 |
"outputs": [
|
| 386 |
{
|
| 387 |
"data": {
|
| 388 |
"text/html": [
|
| 389 |
-
"<div id=\"
|
| 390 |
],
|
| 391 |
"text/plain": [
|
| 392 |
"<IPython.core.display.HTML object>"
|
|
@@ -397,7 +284,7 @@
|
|
| 397 |
},
|
| 398 |
{
|
| 399 |
"data": {
|
| 400 |
-
"application/javascript": "window.igv.MessageHandler.on({\"id\": \"
|
| 401 |
"text/plain": [
|
| 402 |
"<IPython.core.display.Javascript object>"
|
| 403 |
]
|
|
@@ -407,7 +294,7 @@
|
|
| 407 |
},
|
| 408 |
{
|
| 409 |
"data": {
|
| 410 |
-
"application/javascript": "window.igv.MessageHandler.on({\"id\": \"
|
| 411 |
"text/plain": [
|
| 412 |
"<IPython.core.display.Javascript object>"
|
| 413 |
]
|
|
@@ -417,7 +304,7 @@
|
|
| 417 |
},
|
| 418 |
{
|
| 419 |
"data": {
|
| 420 |
-
"application/javascript": "window.igv.MessageHandler.on({\"id\": \"
|
| 421 |
"text/plain": [
|
| 422 |
"<IPython.core.display.Javascript object>"
|
| 423 |
]
|
|
@@ -428,10 +315,10 @@
|
|
| 428 |
{
|
| 429 |
"data": {
|
| 430 |
"text/plain": [
|
| 431 |
-
"<igv_notebook.browser.Browser at
|
| 432 |
]
|
| 433 |
},
|
| 434 |
-
"execution_count":
|
| 435 |
"metadata": {},
|
| 436 |
"output_type": "execute_result"
|
| 437 |
}
|
|
|
|
| 29 |
"execution_count": 1,
|
| 30 |
"id": "2e2f5963",
|
| 31 |
"metadata": {},
|
| 32 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
"source": [
|
| 34 |
"# Install dependencies\n",
|
| 35 |
"!pip -q install \"transformers>=4.55\" \"huggingface_hub>=0.23\" safetensors torch pyfaidx requests seaborn matplotlib igv_notebook"
|
|
|
|
| 118 |
},
|
| 119 |
{
|
| 120 |
"cell_type": "code",
|
| 121 |
+
"execution_count": 5,
|
| 122 |
"id": "4857d15c",
|
| 123 |
"metadata": {},
|
| 124 |
"outputs": [
|
| 125 |
{
|
| 126 |
"data": {
|
| 127 |
"application/vnd.jupyter.widget-view+json": {
|
| 128 |
+
"model_id": "cead875ae8c34250b6929e22283652e1",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
"version_major": 2,
|
| 130 |
"version_minor": 0
|
| 131 |
},
|
|
|
|
| 142 |
"text": [
|
| 143 |
"A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/NTv3_650M_pos:\n",
|
| 144 |
"- ntv3_gff_pipeline.py\n",
|
| 145 |
+
". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
"Device set to use cpu\n"
|
| 147 |
]
|
| 148 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
{
|
| 150 |
"name": "stdout",
|
| 151 |
"output_type": "stream",
|
| 152 |
"text": [
|
| 153 |
+
"Inference + decoding time: 53.09 seconds\n"
|
| 154 |
]
|
| 155 |
}
|
| 156 |
],
|
|
|
|
| 189 |
},
|
| 190 |
{
|
| 191 |
"cell_type": "code",
|
| 192 |
+
"execution_count": 6,
|
| 193 |
"id": "959cf79f",
|
| 194 |
"metadata": {},
|
| 195 |
"outputs": [
|
|
|
|
| 223 |
},
|
| 224 |
{
|
| 225 |
"cell_type": "code",
|
| 226 |
+
"execution_count": 7,
|
| 227 |
"id": "84f013f6",
|
| 228 |
"metadata": {},
|
| 229 |
"outputs": [
|
|
|
|
| 266 |
},
|
| 267 |
{
|
| 268 |
"cell_type": "code",
|
| 269 |
+
"execution_count": 8,
|
| 270 |
"id": "0904a5cb",
|
| 271 |
"metadata": {},
|
| 272 |
"outputs": [
|
| 273 |
{
|
| 274 |
"data": {
|
| 275 |
"text/html": [
|
| 276 |
+
"<div id=\"jb_471625_buttons\"></div><div id=\"jb_471625_igvcontainer\"></div>"
|
| 277 |
],
|
| 278 |
"text/plain": [
|
| 279 |
"<IPython.core.display.HTML object>"
|
|
|
|
| 284 |
},
|
| 285 |
{
|
| 286 |
"data": {
|
| 287 |
+
"application/javascript": "window.igv.MessageHandler.on({\"id\": \"jb_471625\", \"command\": \"createBrowser\", \"data\": {\"genome\": \"hg38\", \"locus\": \"chr19:6700000-6831072\", \"id\": \"jb_471625\"}})",
|
| 288 |
"text/plain": [
|
| 289 |
"<IPython.core.display.Javascript object>"
|
| 290 |
]
|
|
|
|
| 294 |
},
|
| 295 |
{
|
| 296 |
"data": {
|
| 297 |
+
"application/javascript": "window.igv.MessageHandler.on({\"id\": \"jb_471625\", \"command\": \"loadTrack\", \"data\": {\"name\": \"NTv3 annotations\", \"format\": \"gff3\", \"type\": \"annotation\", \"url\": \"NTv3_650M_pos_hg38_chr19_6700000_6831072.gff3\"}})",
|
| 298 |
"text/plain": [
|
| 299 |
"<IPython.core.display.Javascript object>"
|
| 300 |
]
|
|
|
|
| 304 |
},
|
| 305 |
{
|
| 306 |
"data": {
|
| 307 |
+
"application/javascript": "window.igv.MessageHandler.on({\"id\": \"jb_471625\", \"command\": \"search\", \"data\": \"chr19:6700000-6831072\"})",
|
| 308 |
"text/plain": [
|
| 309 |
"<IPython.core.display.Javascript object>"
|
| 310 |
]
|
|
|
|
| 315 |
{
|
| 316 |
"data": {
|
| 317 |
"text/plain": [
|
| 318 |
+
"<igv_notebook.browser.Browser at 0x30d4e3e50>"
|
| 319 |
]
|
| 320 |
},
|
| 321 |
+
"execution_count": 8,
|
| 322 |
"metadata": {},
|
| 323 |
"output_type": "execute_result"
|
| 324 |
}
|
notebooks_pipelines/NTv3_650M_pos_hg38_chr19_6700000_6831072.gff3
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
##gff-version 3
|
| 2 |
+
# model: InstaDeepAI/NTv3_650M_pos
|
| 3 |
+
# window: chr19:6700000-6831072 (hg38); predictions on central 37.5%: chr19:6740960-6790112
|
| 4 |
+
chr19 NTv3_HMM intron 6740961 6740995 0.975 . . ID=INTRON_1;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
|
| 5 |
+
chr19 NTv3_HMM start_codon 6740996 6741013 0.355 . . ID=START_CODON_2;Name=START_CODON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,191,255
|
| 6 |
+
chr19 NTv3_HMM exon 6741014 6741124 0.673 . . ID=EXON_3;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
|
| 7 |
+
chr19 NTv3_HMM splice_donor_site 6741125 6741125 0.857 . . ID=SPLICE_DONOR_4;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 8 |
+
chr19 NTv3_HMM intron 6741126 6741224 0.974 . . ID=INTRON_5;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
|
| 9 |
+
chr19 NTv3_HMM splice_acceptor_site 6741225 6741225 0.930 . . ID=SPLICE_ACCEPTOR_6;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 10 |
+
chr19 NTv3_HMM exon 6741226 6741280 0.693 . . ID=EXON_7;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
|
| 11 |
+
chr19 NTv3_HMM splice_donor_site 6741281 6741281 0.837 . . ID=SPLICE_DONOR_8;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 12 |
+
chr19 NTv3_HMM intron 6741282 6742966 0.959 . . ID=INTRON_9;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
|
| 13 |
+
chr19 NTv3_HMM splice_acceptor_site 6742967 6742967 0.958 . . ID=SPLICE_ACCEPTOR_10;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 14 |
+
chr19 NTv3_HMM exon 6742968 6743113 0.841 . . ID=EXON_11;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
|
| 15 |
+
chr19 NTv3_HMM splice_donor_site 6743114 6743114 0.779 . . ID=SPLICE_DONOR_12;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 16 |
+
chr19 NTv3_HMM intron 6743115 6743193 0.963 . . ID=INTRON_13;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
|
| 17 |
+
chr19 NTv3_HMM splice_acceptor_site 6743194 6743194 0.910 . . ID=SPLICE_ACCEPTOR_14;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 18 |
+
chr19 NTv3_HMM exon 6743195 6743255 0.845 . . ID=EXON_15;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
|
| 19 |
+
chr19 NTv3_HMM splice_donor_site 6743256 6743256 0.782 . . ID=SPLICE_DONOR_16;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 20 |
+
chr19 NTv3_HMM intron 6743257 6743493 0.970 . . ID=INTRON_17;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
|
| 21 |
+
chr19 NTv3_HMM splice_acceptor_site 6743494 6743494 0.780 . . ID=SPLICE_ACCEPTOR_18;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 22 |
+
chr19 NTv3_HMM exon 6743495 6743597 0.876 . . ID=EXON_19;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
|
| 23 |
+
chr19 NTv3_HMM splice_donor_site 6743598 6743598 0.856 . . ID=SPLICE_DONOR_20;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 24 |
+
chr19 NTv3_HMM intron 6743599 6743707 0.951 . . ID=INTRON_21;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
|
| 25 |
+
chr19 NTv3_HMM splice_acceptor_site 6743708 6743708 0.856 . . ID=SPLICE_ACCEPTOR_22;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 26 |
+
chr19 NTv3_HMM exon 6743709 6743835 0.812 . . ID=EXON_23;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
|
| 27 |
+
chr19 NTv3_HMM splice_donor_site 6743836 6743836 0.887 . . ID=SPLICE_DONOR_24;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 28 |
+
chr19 NTv3_HMM intron 6743837 6744553 0.989 . . ID=INTRON_25;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
|
| 29 |
+
chr19 NTv3_HMM splice_acceptor_site 6744554 6744554 0.972 . . ID=SPLICE_ACCEPTOR_26;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 30 |
+
chr19 NTv3_HMM exon 6744555 6744700 0.977 . . ID=EXON_27;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
|
| 31 |
+
chr19 NTv3_HMM intron 6744701 6744799 0.972 . . ID=INTRON_28;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
|
| 32 |
+
chr19 NTv3_HMM splice_acceptor_site 6744800 6744800 0.954 . . ID=SPLICE_ACCEPTOR_29;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 33 |
+
chr19 NTv3_HMM exon 6744801 6744993 0.977 . . ID=EXON_30;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
|
| 34 |
+
chr19 NTv3_HMM splice_donor_site 6744994 6744994 0.886 . . ID=SPLICE_DONOR_31;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 35 |
+
chr19 NTv3_HMM intron 6744995 6746451 0.979 . . ID=INTRON_32;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
|
| 36 |
+
chr19 NTv3_HMM splice_acceptor_site 6746452 6746452 0.938 . . ID=SPLICE_ACCEPTOR_33;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 37 |
+
chr19 NTv3_HMM exon 6746453 6746560 0.840 . . ID=EXON_34;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
|
| 38 |
+
chr19 NTv3_HMM splice_donor_site 6746561 6746561 0.947 . . ID=SPLICE_DONOR_35;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 39 |
+
chr19 NTv3_HMM intron 6746562 6749933 0.973 . . ID=INTRON_36;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
|
| 40 |
+
chr19 NTv3_HMM splice_acceptor_site 6749934 6749934 0.693 . . ID=SPLICE_ACCEPTOR_37;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 41 |
+
chr19 NTv3_HMM exon 6749935 6750065 0.918 . . ID=EXON_38;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
|
| 42 |
+
chr19 NTv3_HMM splice_donor_site 6750066 6750066 0.783 . . ID=SPLICE_DONOR_39;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 43 |
+
chr19 NTv3_HMM intron 6750067 6750291 0.955 . . ID=INTRON_40;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
|
| 44 |
+
chr19 NTv3_HMM splice_acceptor_site 6750292 6750292 0.960 . . ID=SPLICE_ACCEPTOR_41;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 45 |
+
chr19 NTv3_HMM exon 6750293 6750430 0.959 . . ID=EXON_42;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
|
| 46 |
+
chr19 NTv3_HMM splice_donor_site 6750431 6750431 0.723 . . ID=SPLICE_DONOR_43;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 47 |
+
chr19 NTv3_HMM intron 6750432 6750511 0.939 . . ID=INTRON_44;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
|
| 48 |
+
chr19 NTv3_HMM splice_acceptor_site 6750512 6750512 0.750 . . ID=SPLICE_ACCEPTOR_45;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 49 |
+
chr19 NTv3_HMM exon 6750513 6750632 0.902 . . ID=EXON_46;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
|
| 50 |
+
chr19 NTv3_HMM splice_donor_site 6750633 6750633 0.917 . . ID=SPLICE_DONOR_47;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 51 |
+
chr19 NTv3_HMM intron 6750634 6751062 0.961 . . ID=INTRON_48;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
|
| 52 |
+
chr19 NTv3_HMM splice_acceptor_site 6751063 6751063 0.694 . . ID=SPLICE_ACCEPTOR_49;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 53 |
+
chr19 NTv3_HMM exon 6751064 6751199 0.558 . . ID=EXON_50;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
|
| 54 |
+
chr19 NTv3_HMM stop_codon 6751200 6751212 0.332 . . ID=STOP_CODON_51;Name=STOP_CODON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=220,20,60
|
| 55 |
+
chr19 NTv3_HMM three_prime_UTR 6751213 6751488 0.965 + . ID=UTR3_PLUS_52;Name=UTR3_PLUS;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=34,139,34
|
| 56 |
+
chr19 NTv3_HMM polyA_signal 6751489 6751507 0.355 . . ID=POLYA_SIGNAL_53;Name=POLYA_SIGNAL;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=139,69,19
|
| 57 |
+
chr19 NTv3_HMM start_codon 6751508 6752169 0.002 . . ID=START_CODON_54;Name=START_CODON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,191,255
|
| 58 |
+
chr19 NTv3_HMM polyA_signal 6752170 6752187 0.432 . . ID=POLYA_SIGNAL_55;Name=POLYA_SIGNAL;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=139,69,19
|
| 59 |
+
chr19 NTv3_HMM three_prime_UTR 6752188 6752571 0.839 - . ID=UTR3_MINUS_56;Name=UTR3_MINUS;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=34,139,34
|
| 60 |
+
chr19 NTv3_HMM stop_codon 6752572 6752752 0.136 . . ID=STOP_CODON_57;Name=STOP_CODON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=220,20,60
|
| 61 |
+
chr19 NTv3_HMM splice_acceptor_site 6752753 6752753 0.798 . . ID=SPLICE_ACCEPTOR_58;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 62 |
+
chr19 NTv3_HMM intron 6752754 6753455 0.910 . . ID=INTRON_59;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
|
| 63 |
+
chr19 NTv3_HMM splice_donor_site 6753456 6753456 0.766 . . ID=SPLICE_DONOR_60;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 64 |
+
chr19 NTv3_HMM exon 6753457 6753640 0.953 . . ID=EXON_61;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
|
| 65 |
+
chr19 NTv3_HMM splice_acceptor_site 6753641 6753641 0.939 . . ID=SPLICE_ACCEPTOR_62;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 66 |
+
chr19 NTv3_HMM intron 6753642 6754051 0.985 . . ID=INTRON_63;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
|
| 67 |
+
chr19 NTv3_HMM splice_donor_site 6754052 6754052 0.844 . . ID=SPLICE_DONOR_64;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 68 |
+
chr19 NTv3_HMM exon 6754053 6754161 0.908 . . ID=EXON_65;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
|
| 69 |
+
chr19 NTv3_HMM splice_acceptor_site 6754162 6754163 0.633 . . ID=SPLICE_ACCEPTOR_66;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 70 |
+
chr19 NTv3_HMM intron 6754164 6754250 0.962 . . ID=INTRON_67;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
|
| 71 |
+
chr19 NTv3_HMM splice_donor_site 6754251 6754251 0.875 . . ID=SPLICE_DONOR_68;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 72 |
+
chr19 NTv3_HMM exon 6754252 6754424 0.965 . . ID=EXON_69;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
|
| 73 |
+
chr19 NTv3_HMM splice_acceptor_site 6754425 6754425 0.791 . . ID=SPLICE_ACCEPTOR_70;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 74 |
+
chr19 NTv3_HMM intron 6754426 6754615 0.975 . . ID=INTRON_71;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
|
| 75 |
+
chr19 NTv3_HMM splice_donor_site 6754616 6754616 0.953 . . ID=SPLICE_DONOR_72;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 76 |
+
chr19 NTv3_HMM exon 6754617 6754730 0.731 . . ID=EXON_73;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
|
| 77 |
+
chr19 NTv3_HMM splice_acceptor_site 6754731 6754731 0.822 . . ID=SPLICE_ACCEPTOR_74;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 78 |
+
chr19 NTv3_HMM intron 6754732 6754830 0.975 . . ID=INTRON_75;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
|
| 79 |
+
chr19 NTv3_HMM splice_donor_site 6754831 6754831 0.944 . . ID=SPLICE_DONOR_76;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 80 |
+
chr19 NTv3_HMM exon 6754832 6755314 0.757 . . ID=EXON_77;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
|
| 81 |
+
chr19 NTv3_HMM splice_acceptor_site 6755315 6755315 0.713 . . ID=SPLICE_ACCEPTOR_78;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 82 |
+
chr19 NTv3_HMM intron 6755316 6759593 0.988 . . ID=INTRON_79;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
|
| 83 |
+
chr19 NTv3_HMM splice_donor_site 6759594 6759594 0.928 . . ID=SPLICE_DONOR_80;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 84 |
+
chr19 NTv3_HMM exon 6759595 6759669 0.840 . . ID=EXON_81;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
|
| 85 |
+
chr19 NTv3_HMM splice_acceptor_site 6759670 6759670 0.901 . . ID=SPLICE_ACCEPTOR_82;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 86 |
+
chr19 NTv3_HMM intron 6759671 6760637 0.985 . . ID=INTRON_83;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
|
| 87 |
+
chr19 NTv3_HMM splice_donor_site 6760638 6760638 0.928 . . ID=SPLICE_DONOR_84;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 88 |
+
chr19 NTv3_HMM exon 6760639 6760985 0.748 . . ID=EXON_85;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
|
| 89 |
+
chr19 NTv3_HMM splice_acceptor_site 6760986 6760987 0.603 . . ID=SPLICE_ACCEPTOR_86;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 90 |
+
chr19 NTv3_HMM intron 6760988 6763679 0.984 . . ID=INTRON_87;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
|
| 91 |
+
chr19 NTv3_HMM splice_donor_site 6763680 6763680 0.759 . . ID=SPLICE_DONOR_88;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 92 |
+
chr19 NTv3_HMM exon 6763681 6763732 0.663 . . ID=EXON_89;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
|
| 93 |
+
chr19 NTv3_HMM five_prime_UTR 6763733 6763815 0.840 - . ID=UTR5_MINUS_90;Name=UTR5_MINUS;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,128,0
|
| 94 |
+
chr19 NTv3_HMM splice_acceptor_site 6763816 6763816 0.869 . . ID=SPLICE_ACCEPTOR_91;Name=SPLICE_ACCEPTOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 95 |
+
chr19 NTv3_HMM intron 6763817 6767386 0.976 . . ID=INTRON_92;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
|
| 96 |
+
chr19 NTv3_HMM splice_donor_site 6767387 6767387 0.902 . . ID=SPLICE_DONOR_93;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 97 |
+
chr19 NTv3_HMM start_codon 6767388 6767411 0.051 . . ID=START_CODON_94;Name=START_CODON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,191,255
|
| 98 |
+
chr19 NTv3_HMM five_prime_UTR 6767412 6767514 0.578 - . ID=UTR5_MINUS_95;Name=UTR5_MINUS;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,128,0
|
| 99 |
+
chr19 NTv3_HMM start_codon 6767515 6769347 0.009 . . ID=START_CODON_96;Name=START_CODON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,191,255
|
| 100 |
+
chr19 NTv3_HMM TF_binding_site 6769348 6769521 0.506 . . ID=CTCF_97;Name=CTCF;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=148,0,211
|
| 101 |
+
chr19 NTv3_HMM start_codon 6769522 6772696 0.002 . . ID=START_CODON_98;Name=START_CODON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,191,255
|
| 102 |
+
chr19 NTv3_HMM five_prime_UTR 6772697 6772806 0.885 + . ID=UTR5_PLUS_99;Name=UTR5_PLUS;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,128,0
|
| 103 |
+
chr19 NTv3_HMM start_codon 6772807 6772810 0.694 . . ID=START_CODON_100;Name=START_CODON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,191,255
|
| 104 |
+
chr19 NTv3_HMM five_prime_UTR 6772811 6772922 0.748 + . ID=UTR5_PLUS_101;Name=UTR5_PLUS;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,128,0
|
| 105 |
+
chr19 NTv3_HMM exon 6772923 6773010 0.635 . . ID=EXON_102;Name=EXON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,255
|
| 106 |
+
chr19 NTv3_HMM splice_donor_site 6773011 6773011 0.884 . . ID=SPLICE_DONOR_103;Name=SPLICE_DONOR;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072;color=0,0,0
|
| 107 |
+
chr19 NTv3_HMM intron 6773012 6790112 0.972 . . ID=INTRON_104;Name=INTRON;model=InstaDeepAI/NTv3_650M_pos;assembly=hg38;window=chr19:6700000-6831072
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=4.0.0
|
| 2 |
+
torch>=2.0.0
|
| 3 |
+
transformers>=4.55.0
|
| 4 |
+
accelerate>=0.20.0
|
| 5 |
+
safetensors>=0.3.0
|
| 6 |
+
huggingface_hub>=0.23.0
|
| 7 |
+
|
tabs/demo.html
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<div class="summary">
|
| 2 |
+
<h2>💻 Interactive Code Demo</h2>
|
| 3 |
+
<p>Run the NTv3 650M post-trained model interactively to predict functional tracks and genomic elements for any genomic region.</p>
|
| 4 |
+
<p><strong>Model:</strong> <code>InstaDeepAI/NTv3_650M_pos</code></p>
|
| 5 |
+
</div>
|
| 6 |
+
|
| 7 |
+
<div class="grid">
|
| 8 |
+
<div class="card" style="grid-column: span 12;">
|
| 9 |
+
<h2>🚀 NTv3 Track Prediction Pipeline</h2>
|
| 10 |
+
<p>Enter a genomic region to get predictions for functional tracks and genomic elements. The model will predict ~7k functional tracks and 21 genomic elements over the center 37.5% of your input region.</p>
|
| 11 |
+
|
| 12 |
+
<!-- Gradio app embedded here -->
|
| 13 |
+
<!-- Note: With Gradio SDK, the app.py serves as the main interface -->
|
| 14 |
+
<!-- The HTML interface can still be accessed, but the Gradio demo is the primary interface -->
|
| 15 |
+
<div id="gradio-container" style="margin-top: 20px; min-height: 600px;">
|
| 16 |
+
<p style="color: var(--muted); margin-bottom: 15px;">
|
| 17 |
+
<strong>Note:</strong> With Gradio SDK enabled, the interactive demo is now the main interface of this Space.
|
| 18 |
+
You can interact with it directly, or use the code example below to run predictions programmatically.
|
| 19 |
+
</p>
|
| 20 |
+
<div style="background: rgba(0,0,0,0.3); padding: 20px; border-radius: 12px; border: 1px solid var(--border);">
|
| 21 |
+
<p style="color: var(--link); margin: 0;">
|
| 22 |
+
💡 The Gradio interactive demo is now available as the main interface of this Space.
|
| 23 |
+
Refresh the page to see it, or use the code example below.
|
| 24 |
+
</p>
|
| 25 |
+
</div>
|
| 26 |
+
</div>
|
| 27 |
+
|
| 28 |
+
<p style="margin-top: 20px; color: var(--muted); font-size: 13px;">
|
| 29 |
+
<strong>Note:</strong> The first run may take longer as the model loads. Maximum region size: 1 Mb (1,000,000 base pairs).
|
| 30 |
+
</p>
|
| 31 |
+
</div>
|
| 32 |
+
|
| 33 |
+
<div class="card" style="grid-column: span 12;">
|
| 34 |
+
<h2>📝 Code Example</h2>
|
| 35 |
+
<p>Here's the Python code that powers the demo above. You can run this in a notebook or Python script:</p>
|
| 36 |
+
<div class="code"><pre><code class="language-python">from transformers import pipeline
|
| 37 |
+
import torch
|
| 38 |
+
|
| 39 |
+
model_name = "InstaDeepAI/NTv3_650M_pos"
|
| 40 |
+
|
| 41 |
+
ntv3_tracks = pipeline(
|
| 42 |
+
"ntv3-tracks",
|
| 43 |
+
model=model_name,
|
| 44 |
+
trust_remote_code=True,
|
| 45 |
+
device=0 if torch.cuda.is_available() else -1,
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
# Run track prediction
|
| 49 |
+
out = ntv3_tracks(
|
| 50 |
+
{
|
| 51 |
+
"chrom": "chr19",
|
| 52 |
+
"start": 6_700_000,
|
| 53 |
+
"end": 6_831_072,
|
| 54 |
+
"species": "human"
|
| 55 |
+
}
|
| 56 |
+
)
|
| 57 |
+
|
| 58 |
+
# Print output shapes
|
| 59 |
+
# ~7k human functional tracks over the central 37.5% of the input sequence
|
| 60 |
+
print("bigwig_tracks_logits:", tuple(out.bigwig_tracks_logits.shape))
|
| 61 |
+
# Locations of 21 genomic elements over the central 37.5% of the input sequence
|
| 62 |
+
print("bed_tracks_logits:", tuple(out.bed_tracks_logits.shape))
|
| 63 |
+
# Language-model logits over the vocabulary for the whole input sequence
|
| 64 |
+
print("language model logits:", tuple(out.mlm_logits.shape))</code></pre></div>
|
| 65 |
+
<p style="margin-top: 15px;">To run the interactive Gradio app locally:</p>
|
| 66 |
+
<div class="code"><pre><code class="language-bash">pip install -r requirements.txt
|
| 67 |
+
python app.py</code></pre></div>
|
| 68 |
+
</div>
|
| 69 |
+
</div>
|
| 70 |
+
|
| 71 |
+
<script>
|
| 72 |
+
// Try to detect if Gradio app is available
|
| 73 |
+
window.addEventListener('load', function() {
  // getElementById returns null when the id is absent from the page, so
  // both lookups are treated as optional instead of being dereferenced
  // unconditionally (which would throw a TypeError during page load).
  const iframe = document.getElementById('gradio-iframe');
  const loading = document.getElementById('gradio-loading');

  // Reveal the fallback instructions, if that placeholder exists.
  function showInstructions() {
    if (loading) {
      loading.style.display = 'block';
    }
  }

  if (!iframe) {
    // No embedded Gradio iframe to watch: fall back to the instructions.
    showInstructions();
    return;
  }

  iframe.onerror = function() {
    // If iframe fails to load, keep showing the instructions
    showInstructions();
    iframe.style.display = 'none';
  };

  // Set a timeout to show instructions if iframe doesn't load
  setTimeout(function() {
    if (iframe.style.display === 'none') {
      showInstructions();
    }
  }, 2000);
});
|
| 87 |
+
</script>
|
| 88 |
+
|
tabs/home.html
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<div class="summary">
|
| 2 |
+
<h2>📖 About NTv3</h2>
|
| 3 |
+
<p>
|
| 4 |
+
NTv3 is a multi-species genomic foundation model family that unifies representation learning, functional-track prediction, genome annotation, and controllable sequence generation within a single U-Net-style backbone. It models up to 1 Mb of DNA at single-base resolution, using a conv–Transformer–deconv architecture that efficiently captures both local motifs and long-range regulatory dependencies. NTv3 is first pretrained on ~9T base pairs from the OpenGenome2 corpus spanning >128k species using masked language modeling, and then post-trained with a joint objective on ~16k functional tracks and annotation labels across 24 animal and plant species, enabling state-of-the-art cross-species functional prediction and base-resolution genome annotation.
|
| 5 |
+
</p>
|
| 6 |
+
<p>
|
| 7 |
+
Beyond prediction, NTv3 can be fine-tuned into a controllable generative model via masked-diffusion language modeling, allowing targeted design of regulatory sequences (for example, enhancers with specified activity and promoter selectivity) that have been validated experimentally.
|
| 8 |
+
</p>
|
| 9 |
+
</div>
|
| 10 |
+
|
| 11 |
+
<div class="paper-summary">
|
| 12 |
+
<!-- <h2>📄 A foundational model for joint sequence-function multi-species modeling at scale for long-range genomic prediction</h2> -->
|
| 13 |
+
<img src="assets/paper_summary.png" alt="NTv3 Paper Summary" />
|
| 14 |
+
</div>
|
| 15 |
+
|
| 16 |
+
<div class="why-ntv3">
|
| 17 |
+
<h2>✨ Why NTv3?</h2>
|
| 18 |
+
<ul>
|
| 19 |
+
<li>📏 <strong>1 Mb long context at nucleotide resolution</strong> — ~100× longer than typical genomics models.</li>
|
| 20 |
+
<li>🏗️ <strong>Unified architecture</strong> for masked language modeling, functional-track prediction, genome annotation, and sequence generation.</li>
|
| 21 |
+
<li>🌍 <strong>Cross-species generalization</strong> across 24 animal and plant species with a shared conditioned representation space.</li>
|
| 22 |
+
<li>⚡ <strong>U-Net–style architecture</strong> improves stability and GPU efficiency on very long sequences.</li>
|
| 23 |
+
<li>🎯 <strong>Controllable generative modeling</strong>, enabling targeted enhancer/promoter engineering validated by experimental assays.</li>
|
| 24 |
+
</ul>
|
| 25 |
+
</div>
|
| 26 |
+
|
| 27 |
+
<div class="grid">
|
| 28 |
+
<div class="card">
|
| 29 |
+
<h2>🤖 Models (see <a href="https://huggingface.co/collections/InstaDeepAI/nucleotide-transformer-v3" target="_blank" rel="noopener">collection</a>)</h2>
|
| 30 |
+
<ul>
|
| 31 |
+
<li>📦 Pretrained checkpoints:
|
| 32 |
+
<div style="margin-top: 8px; margin-left: 0;">
|
| 33 |
+
<div><a href="https://huggingface.co/InstaDeepAI/NTv3_8M_pre"><code>InstaDeepAI/NTv3_8M_pre</code></a></div>
|
| 34 |
+
<div><a href="https://huggingface.co/InstaDeepAI/NTv3_100M_pre"><code>InstaDeepAI/NTv3_100M_pre</code></a></div>
|
| 35 |
+
<div><a href="https://huggingface.co/InstaDeepAI/NTv3_650M_pre"><code>InstaDeepAI/NTv3_650M_pre</code></a></div>
|
| 36 |
+
</div>
|
| 37 |
+
</li>
|
| 38 |
+
<li>🎯 Post-trained checkpoints:
|
| 39 |
+
<div style="margin-top: 8px; margin-left: 0;">
|
| 40 |
+
<div><a href="https://huggingface.co/InstaDeepAI/NTv3_100M_pos"><code>InstaDeepAI/NTv3_100M_pos</code></a></div>
|
| 41 |
+
<div><a href="https://huggingface.co/InstaDeepAI/NTv3_650M_pos"><code>InstaDeepAI/NTv3_650M_pos</code></a></div>
|
| 42 |
+
</div>
|
| 43 |
+
</li>
|
| 44 |
+
</ul>
|
| 45 |
+
<table>
|
| 46 |
+
<thead>
|
| 47 |
+
<tr>
|
| 48 |
+
<th>Model</th>
|
| 49 |
+
<th>Size</th>
|
| 50 |
+
<th>Pre-training</th>
|
| 51 |
+
<th>Post-training</th>
|
| 52 |
+
<th>Tasks</th>
|
| 53 |
+
</tr>
|
| 54 |
+
</thead>
|
| 55 |
+
<tbody>
|
| 56 |
+
<tr>
|
| 57 |
+
<td><strong>NTv3-8M</strong></td>
|
| 58 |
+
<td>8M params</td>
|
| 59 |
+
<td>MLM</td>
|
| 60 |
+
<td>❌</td>
|
| 61 |
+
<td>Embeddings, light inference</td>
|
| 62 |
+
</tr>
|
| 63 |
+
<tr>
|
| 64 |
+
<td><strong>NTv3-100M</strong></td>
|
| 65 |
+
<td>100M params</td>
|
| 66 |
+
<td>MLM</td>
|
| 67 |
+
<td><span class="checkmark">✅</span></td>
|
| 68 |
+
<td>Tracks, annotation</td>
|
| 69 |
+
</tr>
|
| 70 |
+
<tr>
|
| 71 |
+
<td><strong>NTv3-650M</strong></td>
|
| 72 |
+
<td>650M params</td>
|
| 73 |
+
<td>MLM</td>
|
| 74 |
+
<td><span class="checkmark">✅</span></td>
|
| 75 |
+
<td>Tracks, annotation, best accuracy</td>
|
| 76 |
+
</tr>
|
| 77 |
+
</tbody>
|
| 78 |
+
</table>
|
| 79 |
+
</div>
|
| 80 |
+
|
| 81 |
+
<div class="card-stack">
|
| 82 |
+
<div class="card">
|
| 83 |
+
<h2>📓 Tutorial notebooks (browse <a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/tree/main/notebooks_tutorials" target="_blank" rel="noopener">folder</a>)</h2>
|
| 84 |
+
<ul>
|
| 85 |
+
<li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_tutorials/00_quickstart_inference.ipynb" target="_blank" rel="noopener">🚀 00 — Quickstart inference</a></li>
|
| 86 |
+
<li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_tutorials/01_tracks_prediction.ipynb" target="_blank" rel="noopener">📊 01 — Tracks prediction</a></li>
|
| 87 |
+
<li>🎯 02 — Fine-tune on bigwig tracks</li>
|
| 88 |
+
<li>🔍 03 — Model interpretation</li>
|
| 89 |
+
<li>🧪 04 — Training the NTv3 generative model</li>
|
| 90 |
+
</ul>
|
| 91 |
+
</div>
|
| 92 |
+
<div class="card">
|
| 93 |
+
<h2>📓 Pipeline notebooks (browse <a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/tree/main/notebooks_pipelines" target="_blank" rel="noopener">folder</a>)</h2>
|
| 94 |
+
<ul>
|
| 95 |
+
<li>🎯 01 — Generate bigwig predictions for selected tracks</li>
|
| 96 |
+
<li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_pipelines/02_genome_annotation.ipynb" target="_blank" rel="noopener">🏷️ 02 — Genome annotation / segmentation</a></li>
|
| 97 |
+
<li>🎯 03 — Fine-tune on bigwig tracks</li>
|
| 98 |
+
<li>🔍 04 — Interpret a given genomic region</li>
|
| 99 |
+
<li>🧪 05 — Sequence generation</li>
|
| 100 |
+
</ul>
|
| 101 |
+
</div>
|
| 102 |
+
<div class="card">
|
| 103 |
+
<h2>🔗 Links</h2>
|
| 104 |
+
<ul>
|
| 105 |
+
<li>📄 Paper: (add link)</li>
|
| 106 |
+
<li><a href="https://github.com/instadeepai/nucleotide-transformer">💻 JAX model code (GitHub)</a></li>
|
| 107 |
+
<li><a href="https://huggingface.co/collections/InstaDeepAI/nucleotide-transformer-v3" target="_blank" rel="noopener">🎯 HF Model Collection (all NTv3 models)</a></li>
|
| 108 |
+
<li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/tree/main" target="_blank" rel="noopener">📓 All notebooks</a></li>
|
| 109 |
+
<li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3_benchmark" target="_blank" rel="noopener">🏆 NTv3 benchmark leaderboard</a></li>
|
| 110 |
+
</ul>
|
| 111 |
+
</div>
|
| 112 |
+
</div>
|
| 113 |
+
|
| 114 |
+
<div class="card">
|
| 115 |
+
<h2>🤖 Load a pre-trained model</h2>
|
| 116 |
+
<p>Here is an example of how to load and use a pre-trained NTv3 model.</p>
|
| 117 |
+
<div class="code"><pre><code class="language-python">from transformers import AutoTokenizer, AutoModelForMaskedLM
|
| 118 |
+
|
| 119 |
+
model_name = "InstaDeepAI/NTv3_650M_pre"
|
| 120 |
+
|
| 121 |
+
# Load model and tokenizer
|
| 122 |
+
model = AutoModelForMaskedLM.from_pretrained(model_name, trust_remote_code=True)
|
| 123 |
+
tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
| 124 |
+
|
| 125 |
+
# Tokenize input sequences
|
| 126 |
+
batch = tok(["ATCGNATCG", "ACGT"], add_special_tokens=False, padding=True, pad_to_multiple_of=128, return_tensors="pt")
|
| 127 |
+
|
| 128 |
+
# Run model
|
| 129 |
+
out = model(
|
| 130 |
+
**batch,
|
| 131 |
+
output_hidden_states=True,
|
| 132 |
+
output_attentions=True
|
| 133 |
+
)
|
| 134 |
+
|
| 135 |
+
# Print output shapes
|
| 136 |
+
print(out.logits.shape) # (B, L, V = 11)
|
| 137 |
+
print(len(out.hidden_states)) # convs + transformers + deconvs
|
| 138 |
+
print(len(out.attentions)) # one entry per transformer layer (12)
|
| 139 |
+
</code></pre></div>
|
| 140 |
+
<p>Model embeddings can be used for fine-tuning on downstream tasks.</p>
|
| 141 |
+
|
| 142 |
+
<p style="margin-top: 40px;">TODO: add a pipeline for fine-tuning on functional tracks or genome annotation.</p>
|
| 143 |
+
</div>
|
| 144 |
+
|
| 145 |
+
<div class="card">
|
| 146 |
+
<h2>💻 Use a post-trained model</h2>
|
| 147 |
+
<p>Here is a quick example of how to use the post-trained NTv3 650M model to predict tracks for a human genomic window.</p>
|
| 148 |
+
<div class="code"><pre><code class="language-python">from transformers import pipeline
|
| 149 |
+
import torch
|
| 150 |
+
|
| 151 |
+
model_name = "InstaDeepAI/NTv3_650M_pos"
|
| 152 |
+
|
| 153 |
+
ntv3_tracks = pipeline(
|
| 154 |
+
"ntv3-tracks",
|
| 155 |
+
model=model_name,
|
| 156 |
+
trust_remote_code=True,
|
| 157 |
+
device=0 if torch.cuda.is_available() else -1,
|
| 158 |
+
)
|
| 159 |
+
|
| 160 |
+
# Run track prediction
|
| 161 |
+
out = ntv3_tracks(
|
| 162 |
+
{
|
| 163 |
+
"chrom": "chr19",
|
| 164 |
+
"start": 6_700_000,
|
| 165 |
+
"end": 6_831_072,
|
| 166 |
+
"species": "human"
|
| 167 |
+
}
|
| 168 |
+
)
|
| 169 |
+
|
| 170 |
+
# Print output shapes
|
| 171 |
+
# 7k human tracks over the central 37.5% of the input sequence
|
| 172 |
+
print("bigwig_tracks_logits:", tuple(out.bigwig_tracks_logits.shape))
|
| 173 |
+
# Locations of 21 genomic elements over the central 37.5% of the input sequence
|
| 174 |
+
print("bed_tracks_logits:", tuple(out.bed_tracks_logits.shape))
|
| 175 |
+
# Language model logits for whole sequence over vocabulary
|
| 176 |
+
print("language model logits:", tuple(out.mlm_logits.shape))</code></pre></div>
|
| 177 |
+
<p>Predictions can also be plotted for a subset of functional tracks and genomic elements:</p>
|
| 178 |
+
<div class="code"><pre><code class="language-python">tracks_to_plot = {
|
| 179 |
+
"K562 RNA-seq": "ENCSR056HPM",
|
| 180 |
+
"K562 DNAse": "ENCSR921NMD",
|
| 181 |
+
"K562 H3k4me3": "ENCSR000DWD",
|
| 182 |
+
"K562 CTCF": "ENCSR000AKO",
|
| 183 |
+
"HepG2 RNA-seq": "ENCSR561FEE_P",
|
| 184 |
+
"HepG2 DNAse": "ENCSR000EJV",
|
| 185 |
+
"HepG2 H3k4me3": "ENCSR000AMP",
|
| 186 |
+
"HepG2 CTCF": "ENCSR000BIE",
|
| 187 |
+
}
|
| 188 |
+
elements_to_plot = ["protein_coding_gene", "exon", "intron", "splice_donor", "splice_acceptor"]
|
| 189 |
+
|
| 190 |
+
out = ntv3_tracks(
|
| 191 |
+
{"chrom": "chr19", "start": 6_700_000, "end": 6_831_072, "species": "human"},
|
| 192 |
+
plot=True,
|
| 193 |
+
tracks_to_plot=tracks_to_plot,
|
| 194 |
+
elements_to_plot=elements_to_plot,
|
| 195 |
+
)</code></pre></div>
|
| 196 |
+
<img src="assets/output_tracks.png" alt="Output tracks visualization" style="max-width: 100%; margin-top: 20px;" />
|
| 197 |
+
</div>
|
| 198 |
+
</div>
|
| 199 |
+
|