Spaces:
Running
Running
<!doctype html>
<html lang="en">
<head>
<!-- charset first so the parser fixes the encoding within the first 1024 bytes -->
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>NTv3 — Foundation Models for Long-Range Genomics</title>
<meta name="description" content="NTv3 companion hub: PyTorch notebooks for inference, fine-tuning, interpretation, and sequence generation on NTv3 models hosted on Hugging Face.">
<link href="https://cdnjs.cloudflare.com/ajax/libs/prism/1.29.0/themes/prism-tomorrow.min.css" rel="stylesheet">
<style>
/* Design tokens shared across the page */
:root {
--bg: #0b1020;
--card: rgba(255, 255, 255, 0.06);
--text: rgba(255, 255, 255, 0.92);
--muted: rgba(255, 255, 255, 0.65);
--link: #7dd3fc;
--border: rgba(255, 255, 255, 0.12);
--shadow: 0 10px 30px rgba(0,0,0,0.35);
--radius: 18px;
--mono: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
--sans: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, "Apple Color Emoji","Segoe UI Emoji";
}
/* Page scaffold: dark background with two soft radial glows */
body {
margin: 0;
font-family: var(--sans);
color: var(--text);
background:
radial-gradient(1200px 800px at 10% 10%, rgba(125, 211, 252, 0.12), transparent 60%),
radial-gradient(1200px 800px at 90% 30%, rgba(167, 139, 250, 0.12), transparent 55%),
var(--bg);
min-height: 100vh;
}
.wrap { max-width: 980px; margin: 0 auto; padding: 44px 18px 56px; }
/* Hero banner */
.hero {
display: grid; gap: 14px;
padding: 26px 24px;
border: 1px solid var(--border);
background: var(--card);
box-shadow: var(--shadow);
border-radius: var(--radius);
}
h1 { font-size: 34px; margin: 0; letter-spacing: -0.02em; }
p { margin: 0; color: var(--muted); line-height: 1.5; }
/* 12-column card grid; cards span half the row on wide screens */
.grid {
margin-top: 18px;
display: grid;
grid-template-columns: repeat(12, 1fr);
gap: 14px;
}
.card {
grid-column: span 6;
padding: 18px 18px;
border: 1px solid var(--border);
background: var(--card);
border-radius: var(--radius);
box-shadow: 0 6px 18px rgba(0,0,0,0.22);
}
/* Flex column of cards that together occupy one half of the grid */
.card-stack {
grid-column: span 6;
display: flex;
flex-direction: column;
gap: 14px;
}
.card-stack .card {
grid-column: span 1;
margin: 0;
}
.card h2 { margin: 0 0 10px 0; font-size: 16px; letter-spacing: 0.01em; }
.card ul { margin: 0; padding-left: 18px; color: var(--muted); }
.card li { margin: 8px 0; }
/* Model comparison table inside a card */
.card table {
width: 100%;
margin-top: 12px;
border-collapse: collapse;
font-size: 13px;
}
.card table th {
text-align: left;
padding: 10px 12px;
border-bottom: 2px solid var(--border);
color: var(--text);
font-weight: 600;
font-size: 12px;
text-transform: uppercase;
letter-spacing: 0.05em;
}
.card table td {
padding: 10px 12px;
border-bottom: 1px solid var(--border);
color: var(--muted);
}
.card table tr:last-child td {
border-bottom: none;
}
.card table tr:hover {
background: rgba(255, 255, 255, 0.02);
}
.card table td .checkmark {
color: #4ade80;
}
a { color: var(--link); text-decoration: none; }
a:hover { text-decoration: underline; }
/* Feature pills under the hero title */
.pillrow { display: flex; gap: 8px; flex-wrap: wrap; margin-top: 8px; }
.pill {
font-size: 12px;
padding: 6px 10px;
border-radius: 999px;
border: 1px solid var(--border);
background: rgba(255,255,255,0.04);
color: var(--muted);
}
/* Code sample container (white-space: pre — snippet lines must not be reflowed) */
.code {
margin-top: 12px;
padding: 16px 18px;
border-radius: 14px;
border: 1px solid var(--border);
background: rgba(0,0,0,0.3);
font-family: var(--mono);
font-size: 13px;
line-height: 1.6;
overflow-x: auto;
color: rgba(255,255,255,0.9);
white-space: pre;
}
.code code {
font-family: inherit;
font-size: inherit;
color: inherit;
}
/* Prism.js theme overrides to match dark theme */
.code pre[class*="language-"] {
background: transparent;
margin: 0;
padding: 0;
}
.code code[class*="language-"] {
background: transparent;
}
/* Wide prose panels */
.summary {
margin-top: 18px;
padding: 24px;
border: 1px solid var(--border);
background: var(--card);
border-radius: var(--radius);
box-shadow: var(--shadow);
}
.summary h2 {
margin: 0 0 16px 0;
font-size: 18px;
letter-spacing: 0.01em;
}
.summary p {
margin: 0 0 14px 0;
color: var(--muted);
line-height: 1.7;
}
.summary p:last-child {
margin-bottom: 0;
}
.why-ntv3 {
margin-top: 18px;
padding: 24px;
border: 1px solid var(--border);
background: var(--card);
border-radius: var(--radius);
box-shadow: var(--shadow);
}
.why-ntv3 h2 {
margin: 0 0 16px 0;
font-size: 18px;
letter-spacing: 0.01em;
}
.why-ntv3 ul {
margin: 0;
padding-left: 0;
list-style: none;
color: var(--muted);
}
.why-ntv3 li {
margin: 12px 0;
padding-left: 0;
line-height: 1.7;
}
.paper-summary {
margin-top: 12px;
padding: 24px;
border: 1px solid var(--border);
background: var(--card);
border-radius: var(--radius);
box-shadow: var(--shadow);
}
.paper-summary h2 {
text-align: center;
margin: 0 0 20px 0;
}
.paper-summary img {
width: 100%;
height: auto;
display: block;
border-radius: 12px;
}
.footer { margin-top: 22px; color: var(--muted); font-size: 13px; }
/* Single-column layout on narrow screens */
@media (max-width: 860px) {
.card { grid-column: span 12; }
/* FIX: .card-stack previously kept span 6 here, leaving the notebook/links
stack at half width while plain cards went full width on mobile */
.card-stack { grid-column: span 12; }
h1 { font-size: 28px; }
}
</style>
</head>
<body>
<div class="wrap">
<!-- Page masthead: title, tagline, and feature pills -->
<header class="hero">
<h1>🧬 NTv3 — Foundation Models for Long-Range Genomics</h1>
<p>
This Space is the companion hub for <strong>NTv3</strong> models: runnable notebooks for inference, fine-tuning, interpretation, and sequence generation.
</p>
<div class="pillrow">
<span class="pill">🤖 Foundation Models</span>
<span class="pill">🧬 Long-context genomics</span>
<span class="pill">🌍 Multi-species</span>
<span class="pill">⚡ Inference • Fine-tune • Interpret • Generate</span>
<span class="pill">📓 Torch notebooks</span>
</div>
</header>
<!-- Model overview prose -->
<section class="summary">
<h2>📖 About NTv3</h2>
<p>
NTv3 is a multi-species genomic foundation model family that unifies representation learning, functional-track prediction, genome annotation, and controllable sequence generation within a single U-Net-style backbone. It models up to 1 Mb of DNA at single-base resolution, using a conv–Transformer–deconv architecture that efficiently captures both local motifs and long-range regulatory dependencies. NTv3 is first pretrained on ~9T base pairs from the OpenGenome2 corpus spanning >128k species using masked language modeling, and then post-trained with a joint objective on ~16k functional tracks and annotation labels across 24 animal and plant species, enabling state-of-the-art cross-species functional prediction and base-resolution genome annotation.
</p>
<p>
Beyond prediction, NTv3 can be fine-tuned into a controllable generative model via masked-diffusion language modeling, allowing targeted design of regulatory sequences (for example, enhancers with specified activity and promoter selectivity) that have been validated experimentally.
</p>
</section>
<!-- Paper summary figure (heading kept commented out until the paper link is public) -->
<div class="paper-summary">
<!-- <h2>📄 A foundational model for joint sequence-function multi-species modeling at scale for long-range genomic prediction</h2> -->
<img src="assets/paper_summary.png" alt="NTv3 Paper Summary">
</div>
<section class="why-ntv3">
<h2>✨ Why NTv3?</h2>
<ul>
<li>📏 <strong>1 Mb long context at nucleotide resolution</strong> — ~100× longer than typical genomics models.</li>
<li>🏗️ <strong>Unified architecture</strong> for: masked language modeling, functional-track prediction, genome annotation, and sequence generation.</li>
<li>🌍 <strong>Cross-species generalization</strong> across 24 animals + plants with a shared conditioned representation space.</li>
<li>⚡ <strong>U-Net–style architecture</strong> improves stability and GPU efficiency on very long sequences.</li>
<li>🎯 <strong>Controllable generative modeling</strong>, enabling targeted enhancer/promoter engineering validated by experimental assays.</li>
</ul>
</section>
<div class="grid">
<!-- Card: checkpoint links plus a capability comparison table -->
<div class="card">
<h2>🤖 Models (see <a href="https://huggingface.co/collections/InstaDeepAI/nucleotide-transformer-v3" target="_blank" rel="noopener">collection</a>)</h2>
<ul>
<li>📦 Pretrained checkpoints:
<div style="margin-top: 8px; margin-left: 0;">
<div><a href="https://huggingface.co/InstaDeepAI/NTv3_8M_pre"><code>InstaDeepAI/NTv3_8M_pre</code></a></div>
<div><a href="https://huggingface.co/InstaDeepAI/NTv3_100M_pre"><code>InstaDeepAI/NTv3_100M_pre</code></a></div>
<div><a href="https://huggingface.co/InstaDeepAI/NTv3_650M_pre"><code>InstaDeepAI/NTv3_650M_pre</code></a></div>
</div>
</li>
<li>🎯 Post-trained checkpoints:
<div style="margin-top: 8px; margin-left: 0;">
<div><a href="https://huggingface.co/InstaDeepAI/NTv3_100M_pos"><code>InstaDeepAI/NTv3_100M_pos</code></a></div>
<div><a href="https://huggingface.co/InstaDeepAI/NTv3_650M_pos"><code>InstaDeepAI/NTv3_650M_pos</code></a></div>
</div>
</li>
</ul>
<table>
<thead>
<tr>
<!-- scope="col" ties each data cell to its column header for screen readers -->
<th scope="col">Model</th>
<th scope="col">Size</th>
<th scope="col">Pre-training</th>
<th scope="col">Post-training</th>
<th scope="col">Tasks</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>NTv3-8M</strong></td>
<td>8M params</td>
<td>MLM</td>
<td>❌</td>
<td>Embeddings, light inference</td>
</tr>
<tr>
<td><strong>NTv3-100M</strong></td>
<td>100M params</td>
<td>MLM</td>
<td><span class="checkmark">✅</span></td>
<td>Tracks, annotation</td>
</tr>
<tr>
<td><strong>NTv3-650M</strong></td>
<td>650M params</td>
<td>MLM</td>
<td><span class="checkmark">✅</span></td>
<td>Tracks, annotation, best accuracy</td>
</tr>
</tbody>
</table>
</div>
<!-- Right column: notebooks card stacked above the links card -->
<div class="card-stack">
<div class="card">
<h2>📓 Notebooks (browse <a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/tree/main/notebooks" target="_blank" rel="noopener">folder</a>)</h2>
<ul>
<li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks/00_quickstart_inference.ipynb" target="_blank" rel="noopener">🚀 00 — Quickstart inference</a></li>
<li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks/01_tracks_prediction.ipynb" target="_blank" rel="noopener">📊 01 — Tracks prediction</a></li>
<li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks/02_genome_annotation.ipynb" target="_blank" rel="noopener">🏷️ 02 — Genome annotation / segmentation</a></li>
<!-- Notebooks 03–05 are not published yet, hence no links -->
<li>🎯 03 — Fine-tune on bigwig tracks</li>
<li>🔍 04 — Model interpretation</li>
<li>🧪 05 — Sequence generation</li>
</ul>
</div>
<div class="card">
<h2>🔗 Links</h2>
<ul>
<!-- TODO: add the paper link once public -->
<li>📄 Paper: (add link)</li>
<!-- target/rel added for consistency with the other external links in this card -->
<li><a href="https://github.com/instadeepai/nucleotide-transformer" target="_blank" rel="noopener">💻 JAX model code (GitHub)</a></li>
<li><a href="https://huggingface.co/collections/InstaDeepAI/nucleotide-transformer-v3" target="_blank" rel="noopener">🎯 HF Model Collection (all NTv3 models)</a></li>
<li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/tree/main/notebooks" target="_blank" rel="noopener">📓 All notebooks</a></li>
<li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3_benchmark" target="_blank" rel="noopener">🏆 NTv3 benchmark leaderboard</a></li>
</ul>
</div>
</div>
<!-- Card: Python example for loading a pretrained checkpoint.
The snippet is highlighted client-side by Prism via the language-python class.
Do not insert lines inside the <pre>: white-space is preserved, so any added
line (even an HTML comment) would render as a blank line in the snippet. -->
<div class="card">
<h2>🤖 Load a pre-trained model</h2>
<p>Here is an example of how to load and use a pre-trained NTv3 model.</p>
<!-- NOTE(review): the example passes trust_remote_code=True, which runs Python
from the model repo on the reader's machine — expected for NTv3, but worth
flagging to readers who copy-paste. -->
<div class="code"><pre><code class="language-python">from transformers import AutoTokenizer, AutoModelForMaskedLM
model_name = "InstaDeepAI/NTv3_650M_pre"
# Load model and tokenizer
model = AutoModelForMaskedLM.from_pretrained(model_name, trust_remote_code=True)
tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Tokenize input sequences
batch = tok(["ATCGNATCG", "ACGT"], add_special_tokens=False, padding=True, pad_to_multiple_of=128, return_tensors="pt")
# Run model
out = model(
**batch,
output_hidden_states=True,
output_attentions=True
)
# Print output shapes
print(out.logits.shape) # (B, L, V = 11)
print(len(out.hidden_states)) # convs + transformers + deconvs
print(len(out.attentions)) # equals transformer layers = 12
</code></pre></div>
<p>Model embeddings can be used for fine-tuning on downstream tasks.</p>
<p style="margin-top: 40px;">TO DO: add pipeline for fine-tuning on functional tracks or genome annotation.</p>
</div>
<!-- Card: Python example for the post-trained track-prediction pipeline.
As above, never insert lines inside the <pre> blocks (white-space: pre). -->
<div class="card">
<h2>💻 Use a post-trained model</h2>
<p>Here is a quick example of how to use the post-trained NTv3 650M model to predict tracks for a human genomic window.</p>
<!-- NOTE(review): the snippet loads "InstaDeepAI/NTv3_650M", but the post-trained
checkpoint listed in the Models card is "InstaDeepAI/NTv3_650M_pos" — confirm
which repo id is correct and make the two consistent. -->
<!-- NOTE(review): force_download=True re-fetches the config on every run;
presumably a stale-cache workaround — verify it is still needed. -->
<div class="code"><pre><code class="language-python">from transformers import AutoConfig
model_name = "InstaDeepAI/NTv3_650M"
# Load track prediction pipeline
cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True, force_download=True)
pipe = cfg.load_tracks_pipeline(model_name, device="auto") # or "cpu"/"cuda"/"mps"
# Run track prediction
out = pipe(
{
"chrom": "chr19",
"start": 6_700_000,
"end": 6_831_072,
"species": "human"
}
)
# Print output shapes
# 7k human tracks over 37.5 % center region of the input sequence
print("bigwig_tracks_logits:", tuple(out.bigwig_tracks_logits.shape))
# Location of 21 genomic elements over 37.5 % center region of the input sequence
print("bed_tracks_logits:", tuple(out.bed_tracks_logits.shape))
# Language model logits for whole sequence over vocabulary
print("language model logits:", tuple(out.mlm_logits.shape))</code></pre></div>
<p>Predictions can also be plotted for a subset of functional tracks and genomic elements:</p>
<!-- Second snippet: same pipeline call with plotting enabled for selected
ENCODE tracks and genomic element classes. -->
<div class="code"><pre><code class="language-python">tracks_to_plot = {
"K562 RNA-seq": "ENCSR056HPM",
"K562 DNAse": "ENCSR921NMD",
"K562 H3k4me3": "ENCSR000DWD",
"K562 CTCF": "ENCSR000AKO",
"HepG2 RNA-seq": "ENCSR561FEE_P",
"HepG2 DNAse": "ENCSR000EJV",
"HepG2 H3k4me3": "ENCSR000AMP",
"HepG2 CTCF": "ENCSR000BIE",
}
elements_to_plot = ["protein_coding_gene", "exon", "intron", "splice_donor", "splice_acceptor"]
out = pipe(
{"chrom": "chr19", "start": 6_700_000, "end": 6_831_072, "species": "human"},
plot=True,
tracks_to_plot=tracks_to_plot,
elements_to_plot=elements_to_plot,
)</code></pre></div>
<img src="assets/output_tracks.png" alt="Output tracks visualization" style="max-width: 100%; margin-top: 20px;" />
</div>
</div>
<p class="footer">
© instadeep-ai — NTv3 companion Space.
</p>
</div>
<!-- Prism core must load before the autoloader plugin, which then fetches
language grammars (python here) on demand. Placed at end of body so they
never block first paint. -->
<!-- NOTE(review): consider pinning these CDN scripts with Subresource Integrity
(integrity + crossorigin attributes). -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/prism/1.29.0/components/prism-core.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/prism/1.29.0/plugins/autoloader/prism-autoloader.min.js"></script>
</body>
</html>