bernardo-de-almeida committed on
Commit
66b0297
·
1 Parent(s): cb9b9a5

feat: improve main page and notebooks

Browse files
README.md CHANGED
@@ -54,7 +54,7 @@ out = pipe("ACGT...")
54
 
55
  ## Checkpoints
56
 
57
- **Pre-trained:** `InstaDeepAI/ntv3_8M_7downsample_pretrained_le_1mb`, `InstaDeepAI/ntv3_106M_7downsample_pretrained_le_1mb`, `InstaDeepAI/ntv3_650M_7downsample_pretrained_le_1mb`
58
 
59
  **Post-trained:** `InstaDeepAI/ntv3_650M_7downsample_post_trained_1mb`, `InstaDeepAI/ntv3_106M_7downsample_post_trained_1mb`
60
 
 
54
 
55
  ## Checkpoints
56
 
57
+ **Pre-trained:** `InstaDeepAI/ntv3_8M_pre`, `InstaDeepAI/ntv3_100M_pre`, `InstaDeepAI/ntv3_650M_pre`
58
 
59
  **Post-trained:** `InstaDeepAI/ntv3_650M_7downsample_post_trained_1mb`, `InstaDeepAI/ntv3_106M_7downsample_post_trained_1mb`
60
 
index.html CHANGED
@@ -85,6 +85,24 @@
85
  font-size: inherit;
86
  color: inherit;
87
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  .footer { margin-top: 22px; color: var(--muted); font-size: 13px; }
89
  @media (max-width: 860px) {
90
  .card { grid-column: span 12; }
@@ -102,34 +120,23 @@
102
  </p>
103
 
104
  <div class="pillrow">
105
- <span class="pill">Long-context genomics</span>
106
- <span class="pill">Torch notebooks</span>
 
107
  <span class="pill">Inference • Fine-tune • Interpret • Generate</span>
 
108
  </div>
109
  </div>
110
 
111
  <div class="grid">
112
- <div class="card">
113
- <h2>Notebooks</h2>
114
- <ul>
115
- <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/tree/main/notebooks" target="_blank" rel="noopener">Browse notebooks folder</a></li>
116
- <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks/00_quickstart_inference.ipynb" target="_blank" rel="noopener">00 — Quickstart inference</a></li>
117
- <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks/01_tracks_prediction.ipynb" target="_blank" rel="noopener">01 — Tracks prediction</a></li>
118
- <li>02 — Genome annotation / segmentation</li>
119
- <li>03 — Fine-tune a head</li>
120
- <li>04 — Model interpretation</li>
121
- <li>05 — Sequence generation</li>
122
- </ul>
123
- </div>
124
-
125
  <div class="card">
126
  <h2>Models</h2>
127
  <ul>
128
- <li>Pretrained checkpoints:
129
  <div style="margin-top: 8px; margin-left: 0;">
130
- <div><a href="https://huggingface.co/InstaDeepAI/ntv3_8M_7downsample_pretrained_le_1mb"><code>InstaDeepAI/ntv3_8M_7downsample_pretrained_le_1mb</code></a></div>
131
- <div><a href="https://huggingface.co/InstaDeepAI/ntv3_106M_7downsample_pretrained_le_1mb"><code>InstaDeepAI/ntv3_106M_7downsample_pretrained_le_1mb</code></a></div>
132
- <div><a href="https://huggingface.co/InstaDeepAI/ntv3_650M_7downsample_pretrained_le_1mb"><code>InstaDeepAI/ntv3_650M_7downsample_pretrained_le_1mb</code></a></div>
133
  </div>
134
  </li>
135
  <li>Post-trained checkpoints:
@@ -141,6 +148,19 @@
141
  </ul>
142
  </div>
143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  <div class="card">
145
  <h2>Model usage (to update)</h2>
146
  <p>Here is a quick example of how to use NTv3 models.</p>
@@ -164,6 +184,11 @@ pipe = pipeline(
164
  </div>
165
  </div>
166
 
 
 
 
 
 
167
  <p class="footer">
168
  © instadeep-ai — NTv3 companion Space.
169
  </p>
 
85
  font-size: inherit;
86
  color: inherit;
87
  }
88
+ .paper-summary {
89
+ margin-top: 12px;
90
+ padding: 24px;
91
+ border: 1px solid var(--border);
92
+ background: var(--card);
93
+ border-radius: var(--radius);
94
+ box-shadow: var(--shadow);
95
+ }
96
+ .paper-summary h2 {
97
+ text-align: center;
98
+ margin: 0 0 20px 0;
99
+ }
100
+ .paper-summary img {
101
+ width: 100%;
102
+ height: auto;
103
+ display: block;
104
+ border-radius: 12px;
105
+ }
106
  .footer { margin-top: 22px; color: var(--muted); font-size: 13px; }
107
  @media (max-width: 860px) {
108
  .card { grid-column: span 12; }
 
120
  </p>
121
 
122
  <div class="pillrow">
123
+ <span class="pill">Foundation Models</span>
124
+ <span class="pill">Long-context genomics</span>
125
+ <span class="pill">Multi-species</span>
126
  <span class="pill">Inference • Fine-tune • Interpret • Generate</span>
127
+ <span class="pill">Torch notebooks</span>
128
  </div>
129
  </div>
130
 
131
  <div class="grid">
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  <div class="card">
133
  <h2>Models</h2>
134
  <ul>
135
+ <li>Pretrained checkpoints (see <a href="https://huggingface.co/collections/InstaDeepAI/nucleotide-transformer-v3" target="_blank" rel="noopener">collection</a>):
136
  <div style="margin-top: 8px; margin-left: 0;">
137
+ <div><a href="https://huggingface.co/InstaDeepAI/ntv3_8M_pre"><code>InstaDeepAI/ntv3_8M_pre</code></a></div>
138
+ <div><a href="https://huggingface.co/InstaDeepAI/ntv3_100M_pre"><code>InstaDeepAI/ntv3_100M_pre</code></a></div>
139
+ <div><a href="https://huggingface.co/InstaDeepAI/ntv3_650M_pre"><code>InstaDeepAI/ntv3_650M_pre</code></a></div>
140
  </div>
141
  </li>
142
  <li>Post-trained checkpoints:
 
148
  </ul>
149
  </div>
150
 
151
+ <div class="card">
152
+ <h2>Notebooks</h2>
153
+ <ul>
154
+ <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/tree/main/notebooks" target="_blank" rel="noopener">Browse notebooks folder</a></li>
155
+ <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks/00_quickstart_inference.ipynb" target="_blank" rel="noopener">00 — Quickstart inference</a></li>
156
+ <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks/01_tracks_prediction.ipynb" target="_blank" rel="noopener">01 — Tracks prediction</a></li>
157
+ <li>02 — Genome annotation / segmentation</li>
158
+ <li>03 — Fine-tune a head</li>
159
+ <li>04 — Model interpretation</li>
160
+ <li>05 — Sequence generation</li>
161
+ </ul>
162
+ </div>
163
+
164
  <div class="card">
165
  <h2>Model usage (to update)</h2>
166
  <p>Here is a quick example of how to use NTv3 models.</p>
 
184
  </div>
185
  </div>
186
 
187
+ <div class="paper-summary">
188
+ <h2>A foundational model for joint sequence-function multi-species modeling at scale for long-range genomic prediction</h2>
189
+ <img src="assets/paper_summary.png" alt="NTv3 Paper Summary" />
190
+ </div>
191
+
192
  <p class="footer">
193
  © instadeep-ai — NTv3 companion Space.
194
  </p>
notebooks/00_quickstart_inference.ipynb CHANGED
@@ -7,9 +7,9 @@
7
  "source": [
8
  "# NTv3 Quickstart — Pre-trained and Post-trained models\n",
9
  "\n",
10
- "This notebook demonstrates how to run **quick inference** with bothe pre- and post-trained NTv3 checkpoints:\n",
11
  "\n",
12
- "- **Pre-trained (MLM-focused):** `InstaDeepAI/ntv3_8M_7downsample_pretrained_le_1mb`, `InstaDeepAI/ntv3_106M_7downsample_pretrained_le_1mb`, `InstaDeepAI/ntv3_650M_ntv3_650M_7downsample_pretrained_le_1mb7downsample_pre_trained_1mb`\n",
13
  "- **Post-trained (task heads):** `InstaDeepAI/ntv3_106M_7downsample_post_trained_1mb`, `InstaDeepAI/ntv3_650M_7downsample_post_trained_1mb`\n",
14
  "\n",
15
  "We show how to:\n",
@@ -103,32 +103,162 @@
103
  },
104
  {
105
  "cell_type": "code",
106
- "execution_count": null,
107
  "id": "336bb40c",
108
  "metadata": {},
109
  "outputs": [
110
  {
111
- "name": "stdout",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  "output_type": "stream",
113
  "text": [
114
- "torch.Size([2, 128, 11])\n",
115
- "16\n",
116
- "2\n",
117
- "MLM logits shape: (2, 128, 11)\n"
118
  ]
119
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  {
121
  "name": "stderr",
122
  "output_type": "stream",
123
  "text": [
124
- "/opt/anaconda3/envs/hf-finetune/lib/python3.10/site-packages/torch/amp/autocast_mode.py:283: UserWarning: In CPU autocast, but the target dtype is not supported. Disabling autocast.\n",
125
- "CPU Autocast only supports dtype of torch.bfloat16, torch.float16 currently.\n",
126
- " warnings.warn(error_message)\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  ]
128
  }
129
  ],
130
  "source": [
131
- "pretrained_model_name = \"InstaDeepAI/ntv3_8M_7downsample_pretrained_le_1mb\"\n",
132
  "\n",
133
  "# Load tokenizer/model\n",
134
  "tok_pre = AutoTokenizer.from_pretrained(pretrained_model_name, trust_remote_code=True)\n",
 
7
  "source": [
8
  "# NTv3 Quickstart — Pre-trained and Post-trained models\n",
9
  "\n",
10
+ "This notebook demonstrates how to run **quick inference** with both the pre- and post-trained NTv3 checkpoints:\n",
11
  "\n",
12
+ "- **Pre-trained (MLM-focused):** `InstaDeepAI/ntv3_8M_pre`, `InstaDeepAI/ntv3_100M_pre`, `InstaDeepAI/ntv3_650M_pre`\n",
13
  "- **Post-trained (task heads):** `InstaDeepAI/ntv3_106M_7downsample_post_trained_1mb`, `InstaDeepAI/ntv3_650M_7downsample_post_trained_1mb`\n",
14
  "\n",
15
  "We show how to:\n",
 
103
  },
104
  {
105
  "cell_type": "code",
106
+ "execution_count": 14,
107
  "id": "336bb40c",
108
  "metadata": {},
109
  "outputs": [
110
  {
111
+ "data": {
112
+ "application/vnd.jupyter.widget-view+json": {
113
+ "model_id": "411ee47e94ae467f9685c35b65e3e52d",
114
+ "version_major": 2,
115
+ "version_minor": 0
116
+ },
117
+ "text/plain": [
118
+ "tokenizer_config.json: 0%| | 0.00/1.48k [00:00<?, ?B/s]"
119
+ ]
120
+ },
121
+ "metadata": {},
122
+ "output_type": "display_data"
123
+ },
124
+ {
125
+ "data": {
126
+ "application/vnd.jupyter.widget-view+json": {
127
+ "model_id": "30447edb44b849bd936290f3a6b1b863",
128
+ "version_major": 2,
129
+ "version_minor": 0
130
+ },
131
+ "text/plain": [
132
+ "tokenization_ntv3.py: 0%| | 0.00/12.0k [00:00<?, ?B/s]"
133
+ ]
134
+ },
135
+ "metadata": {},
136
+ "output_type": "display_data"
137
+ },
138
+ {
139
+ "name": "stderr",
140
  "output_type": "stream",
141
  "text": [
142
+ "A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/ntv3_base_model:\n",
143
+ "- tokenization_ntv3.py\n",
144
+ ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n"
 
145
  ]
146
  },
147
+ {
148
+ "data": {
149
+ "application/vnd.jupyter.widget-view+json": {
150
+ "model_id": "766f183dcc84421588e5cf0241d3efe7",
151
+ "version_major": 2,
152
+ "version_minor": 0
153
+ },
154
+ "text/plain": [
155
+ "vocab.json: 0%| | 0.00/138 [00:00<?, ?B/s]"
156
+ ]
157
+ },
158
+ "metadata": {},
159
+ "output_type": "display_data"
160
+ },
161
+ {
162
+ "data": {
163
+ "application/vnd.jupyter.widget-view+json": {
164
+ "model_id": "b0db83f7cb824d3288a30bebf7891a63",
165
+ "version_major": 2,
166
+ "version_minor": 0
167
+ },
168
+ "text/plain": [
169
+ "special_tokens_map.json: 0%| | 0.00/149 [00:00<?, ?B/s]"
170
+ ]
171
+ },
172
+ "metadata": {},
173
+ "output_type": "display_data"
174
+ },
175
+ {
176
+ "data": {
177
+ "application/vnd.jupyter.widget-view+json": {
178
+ "model_id": "33cf5391dcc549f088e4e927651d1cdb",
179
+ "version_major": 2,
180
+ "version_minor": 0
181
+ },
182
+ "text/plain": [
183
+ "config.json: 0%| | 0.00/1.70k [00:00<?, ?B/s]"
184
+ ]
185
+ },
186
+ "metadata": {},
187
+ "output_type": "display_data"
188
+ },
189
+ {
190
+ "data": {
191
+ "application/vnd.jupyter.widget-view+json": {
192
+ "model_id": "85772d5369234ca286cfa518e1725b12",
193
+ "version_major": 2,
194
+ "version_minor": 0
195
+ },
196
+ "text/plain": [
197
+ "configuration_ntv3.py: 0%| | 0.00/5.90k [00:00<?, ?B/s]"
198
+ ]
199
+ },
200
+ "metadata": {},
201
+ "output_type": "display_data"
202
+ },
203
  {
204
  "name": "stderr",
205
  "output_type": "stream",
206
  "text": [
207
+ "A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/ntv3_base_model:\n",
208
+ "- configuration_ntv3.py\n",
209
+ ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n"
210
+ ]
211
+ },
212
+ {
213
+ "data": {
214
+ "application/vnd.jupyter.widget-view+json": {
215
+ "model_id": "ec1153d073e444c5b255ee5adea6ba68",
216
+ "version_major": 2,
217
+ "version_minor": 0
218
+ },
219
+ "text/plain": [
220
+ "modeling_ntv3_base.py: 0%| | 0.00/33.9k [00:00<?, ?B/s]"
221
+ ]
222
+ },
223
+ "metadata": {},
224
+ "output_type": "display_data"
225
+ },
226
+ {
227
+ "name": "stderr",
228
+ "output_type": "stream",
229
+ "text": [
230
+ "A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/ntv3_base_model:\n",
231
+ "- modeling_ntv3_base.py\n",
232
+ ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n"
233
+ ]
234
+ },
235
+ {
236
+ "data": {
237
+ "application/vnd.jupyter.widget-view+json": {
238
+ "model_id": "94b9bb7fe0da4f4994adb9127d9af7e6",
239
+ "version_major": 2,
240
+ "version_minor": 0
241
+ },
242
+ "text/plain": [
243
+ "model.safetensors: 0%| | 0.00/30.8M [00:00<?, ?B/s]"
244
+ ]
245
+ },
246
+ "metadata": {},
247
+ "output_type": "display_data"
248
+ },
249
+ {
250
+ "name": "stdout",
251
+ "output_type": "stream",
252
+ "text": [
253
+ "torch.Size([2, 128, 11])\n",
254
+ "16\n",
255
+ "2\n",
256
+ "MLM logits shape: (2, 128, 11)\n"
257
  ]
258
  }
259
  ],
260
  "source": [
261
+ "pretrained_model_name = \"InstaDeepAI/ntv3_8M_pre\"\n",
262
  "\n",
263
  "# Load tokenizer/model\n",
264
  "tok_pre = AutoTokenizer.from_pretrained(pretrained_model_name, trust_remote_code=True)\n",
notebooks/01_tracks_prediction.ipynb CHANGED
The diff for this file is too large to render. See raw diff