Commit df00fce · committed by bernardo-de-almeida
1 Parent(s): a3047bf

fix model links
README.md CHANGED
@@ -18,7 +18,7 @@ Notebooks live in `./notebooks/`:
 - `00_quickstart_inference.ipynb` — load a checkpoint + run inference
 - `01_tracks_prediction.ipynb` — sequence → functional tracks (+ plotting)
 - `02_genome_annotation_segmentation.ipynb` — sequence → annotation
-- `03_finetune_head.ipynb` — fine-tune on a bigwig track
+- `03_finetune_head.ipynb` — fine-tune on bigwig tracks
 - `04_model_interpretation.ipynb` — interpretation of post-trained model
 - `05_sequence_generation.ipynb` — fine-tune NTv3 to generate enhancer sequences

@@ -43,7 +43,7 @@ import torch

 pipe = pipeline(
     task="ntv3-tracks",
-    model="InstaDeepAI/ntv3_106M_7downsample_post_trained_1mb",
+    model="InstaDeepAI/NTv3_650M",
     trust_remote_code=True,
     device="cuda",
     torch_dtype=torch.bfloat16,
@@ -54,9 +54,9 @@ out = pipe("ACGT...")

 ## Checkpoints

-**Pre-trained:** `InstaDeepAI/ntv3_8M_pre`, `InstaDeepAI/ntv3_100M_pre`, `InstaDeepAI/ntv3_650M_pre`
+**Pre-trained:** `InstaDeepAI/NTv3_8M_pre`, `InstaDeepAI/NTv3_100M_pre`, `InstaDeepAI/NTv3_650M_pre`

-**Post-trained:** `InstaDeepAI/ntv3_650M_7downsample_post_trained_1mb`, `InstaDeepAI/ntv3_106M_7downsample_post_trained_1mb`
+**Post-trained:** `InstaDeepAI/NTv3_100M`, `InstaDeepAI/NTv3_650M`

 ## Links

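For reference, the README quickstart reads as follows after this commit. This is a sketch assembled from the diff: the task string, model ID, and all keyword arguments are taken verbatim from the hunk, while the `import torch` line and the `"ACGT..."` placeholder input are inferred from the `@@` context lines.

```python
# Quickstart as it reads after this commit (assembled from the diff context).
# "ntv3-tracks" is a custom pipeline task loaded via trust_remote_code.
import torch
from transformers import pipeline

pipe = pipeline(
    task="ntv3-tracks",
    model="InstaDeepAI/NTv3_650M",
    trust_remote_code=True,
    device="cuda",
    torch_dtype=torch.bfloat16,
)

out = pipe("ACGT...")  # placeholder sequence, as in the README
```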
index.html CHANGED
@@ -134,9 +134,9 @@
 <ul>
   <li>Pretrained checkpoints (see <a href="https://huggingface.co/collections/InstaDeepAI/nucleotide-transformer-v3" target="_blank" rel="noopener">collection</a>):
     <div style="margin-top: 8px; margin-left: 0;">
-      <div><a href="https://huggingface.co/InstaDeepAI/ntv3_8M_pre"><code>InstaDeepAI/ntv3_8M_pre</code></a></div>
-      <div><a href="https://huggingface.co/InstaDeepAI/ntv3_100M_pre"><code>InstaDeepAI/ntv3_100M_pre</code></a></div>
-      <div><a href="https://huggingface.co/InstaDeepAI/ntv3_650M_pre"><code>InstaDeepAI/ntv3_650M_pre</code></a></div>
+      <div><a href="https://huggingface.co/InstaDeepAI/NTv3_8M_pre"><code>InstaDeepAI/NTv3_8M_pre</code></a></div>
+      <div><a href="https://huggingface.co/InstaDeepAI/NTv3_100M_pre"><code>InstaDeepAI/NTv3_100M_pre</code></a></div>
+      <div><a href="https://huggingface.co/InstaDeepAI/NTv3_650M_pre"><code>InstaDeepAI/NTv3_650M_pre</code></a></div>
     </div>
   </li>
   <li>Post-trained checkpoints:
@@ -179,7 +179,8 @@ pipe = pipeline(
 <h2>Links</h2>
 <ul>
   <li>Paper: (add link)</li>
-  <li><a href="https://github.com/instadeepai/nucleotide-transformer">JAX training code</a></li>
+  <li><a href="https://github.com/instadeepai/nucleotide-transformer">JAX model code (GitHub)</a></li>
+  <li>NTv3 benchmark leaderboard: (add link)</li>
 </ul>
 </div>
 </div>
notebooks/00_quickstart_inference.ipynb CHANGED
@@ -9,8 +9,8 @@
    "\n",
    "This notebook demonstrates how to run **quick inference** with both the pre- and post-trained NTv3 checkpoints:\n",
    "\n",
-   "- **Pre-trained (MLM-focused):** `InstaDeepAI/ntv3_8M_pre`, `InstaDeepAI/ntv3_100M_pre`, `InstaDeepAI/ntv3_650M_pre`\n",
-   "- **Post-trained (task heads):** `InstaDeepAI/ntv3_106M_7downsample_post_trained_1mb`, `InstaDeepAI/ntv3_650M_7downsample_post_trained_1mb`\n",
+   "- **Pre-trained (MLM-focused):** `InstaDeepAI/NTv3_8M_pre`, `InstaDeepAI/NTv3_100M_pre`, `InstaDeepAI/NTv3_650M_pre`\n",
+   "- **Post-trained (task heads):** `InstaDeepAI/NTv3_100M`, `InstaDeepAI/NTv3_650M`\n",
    "\n",
    "We show how to:\n",
    "\n",
@@ -105,7 +105,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "id": "336bb40c",
    "metadata": {},
    "outputs": [
@@ -260,7 +260,7 @@
    }
   ],
   "source": [
-   "pretrained_model_name = \"InstaDeepAI/ntv3_8M_pre\"\n",
+   "pretrained_model_name = \"InstaDeepAI/NTv3_8M_pre\"\n",
    "\n",
    "# Load tokenizer/model\n",
    "tok_pre = AutoTokenizer.from_pretrained(pretrained_model_name, trust_remote_code=True)\n",
@@ -318,7 +318,7 @@
    }
   ],
   "source": [
-   "posttrained_model_name = \"InstaDeepAI/ntv3_106M_7downsample_post_trained_1mb\"\n",
+   "posttrained_model_name = \"InstaDeepAI/NTv3_100M\"\n",
    "\n",
    "# Load config/tokenizers/model\n",
    "cfg_pos = AutoConfig.from_pretrained(posttrained_model_name, trust_remote_code=True)\n",
@@ -345,10 +345,12 @@
    "    output_attentions=True,\n",
    ")\n",
    "\n",
-   "# Access model outputs\n",
-   "print(out[\"bigwig_tracks_logits\"].shape)  # per-assembly functional track predictions\n",
-   "print(out[\"bed_tracks_logits\"].shape)  # genomic element classifications\n",
-   "print(out[\"logits\"].shape)  # masked LM logits"
+   "# 7k human tracks over 37.5 % center region of the input sequence\n",
+   "print(\"bigwig_tracks_logits:\", out[\"bigwig_tracks_logits\"].shape)\n",
+   "# Location of 21 genomic elements over 37.5 % center region of the input sequence\n",
+   "print(\"bed_tracks_logits:\", out[\"bed_tracks_logits\"].shape)\n",
+   "# Language model logits for whole sequence over vocabulary\n",
+   "print(\"language model logits:\", out[\"logits\"].shape)"
   ]
  }
 ],
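Taken together, the updated cells in this notebook correspond to roughly the following flow. This is a minimal sketch: the checkpoint name, the `AutoConfig` call, the `output_attentions=True` keyword, and the three output keys come from the diff, while the assumption that `AutoModel`/`AutoTokenizer` resolve the custom NTv3 classes via `trust_remote_code`, the tokenizer call shape, and the toy sequence are not confirmed by it.

```python
# Minimal sketch of the post-trained inference flow shown in this notebook.
# Assumption: AutoModel/AutoTokenizer load the custom NTv3 code via
# trust_remote_code; the tokenizer call and toy sequence are illustrative.
import torch
from transformers import AutoConfig, AutoModel, AutoTokenizer

posttrained_model_name = "InstaDeepAI/NTv3_100M"

cfg_pos = AutoConfig.from_pretrained(posttrained_model_name, trust_remote_code=True)
tok_pos = AutoTokenizer.from_pretrained(posttrained_model_name, trust_remote_code=True)
model_pos = AutoModel.from_pretrained(posttrained_model_name, trust_remote_code=True)
model_pos.eval()

inputs = tok_pos("ACGT" * 256, return_tensors="pt")  # toy sequence
with torch.no_grad():
    out = model_pos(**inputs, output_attentions=True)

# The three output heads printed in the updated cell:
print("bigwig_tracks_logits:", out["bigwig_tracks_logits"].shape)
print("bed_tracks_logits:", out["bed_tracks_logits"].shape)
print("language model logits:", out["logits"].shape)
```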
notebooks/01_tracks_prediction.ipynb CHANGED
@@ -112,7 +112,7 @@
    "# -----------------------------\n",
    "# User inputs\n",
    "# -----------------------------\n",
-   "model_name = \"InstaDeepAI/ntv3_106M_7downsample_post_trained_1mb\"  # options: \"InstaDeepAI/ntv3_106M_7downsample_post_trained_1mb\" or \"InstaDeepAI/ntv3_650M_7downsample_post_trained_1mb_v2\"\n",
+   "model_name = \"InstaDeepAI/NTv3_100M\"  # options: \"InstaDeepAI/NTv3_100M\" or \"InstaDeepAI/NTv3_650M\"\n",
    "\n",
    "# Example window from a given species (edit these) - needs to be a multiple of 128 due to the model downsampling\n",
    "assembly = \"hg38\"\n",