bernardo-de-almeida committed on
Commit
421ebe7
·
1 Parent(s): f44f397

fix: main page links

Browse files
notebooks_tutorials/01_tracks_prediction.ipynb CHANGED
@@ -108,7 +108,233 @@
108
  },
109
  {
110
  "cell_type": "code",
111
- "execution_count": 4,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  "id": "795a576f",
113
  "metadata": {},
114
  "outputs": [],
@@ -122,9 +348,9 @@
122
  "species = \"human\" # will use for condition the model on species\n",
123
  "assembly = \"hg38\" # will use for fetching the chromosome sequence\n",
124
  "chrom = \"chr19\"\n",
125
- "start = 6_700_000\n",
126
- "end = 6_765_536\n",
127
- "# Limiting to 65kb to work on Google Colab T4 GPU -> increase up to 1 million nucleotides if you have a better GPU\n",
128
  "\n",
129
  "# Optional\n",
130
  "HF_TOKEN = os.getenv(\"HF_TOKEN\", None)"
 
108
  },
109
  {
110
  "cell_type": "code",
111
+ "execution_count": null,
112
+ "id": "6193fd37",
113
+ "metadata": {},
114
+ "outputs": [
115
+ {
116
+ "data": {
117
+ "text/html": [
118
+ "<div>\n",
119
+ "<style scoped>\n",
120
+ " .dataframe tbody tr th:only-of-type {\n",
121
+ " vertical-align: middle;\n",
122
+ " }\n",
123
+ "\n",
124
+ " .dataframe tbody tr th {\n",
125
+ " vertical-align: top;\n",
126
+ " }\n",
127
+ "\n",
128
+ " .dataframe thead th {\n",
129
+ " text-align: right;\n",
130
+ " }\n",
131
+ "</style>\n",
132
+ "<table border=\"1\" class=\"dataframe\">\n",
133
+ " <thead>\n",
134
+ " <tr style=\"text-align: right;\">\n",
135
+ " <th></th>\n",
136
+ " <th>file_id</th>\n",
137
+ " <th>biosample_type</th>\n",
138
+ " <th>tissue</th>\n",
139
+ " <th>assay</th>\n",
140
+ " <th>strand</th>\n",
141
+ " <th>experiment_target</th>\n",
142
+ " <th>specie</th>\n",
143
+ " <th>dataset</th>\n",
144
+ " </tr>\n",
145
+ " </thead>\n",
146
+ " <tbody>\n",
147
+ " <tr>\n",
148
+ " <th>0</th>\n",
149
+ " <td>SRX20249461</td>\n",
150
+ " <td>tissue</td>\n",
151
+ " <td>Leaf (17 days)</td>\n",
152
+ " <td>TF ChIP-seq</td>\n",
153
+ " <td>NaN</td>\n",
154
+ " <td>JMJ20</td>\n",
155
+ " <td>glycine_max</td>\n",
156
+ " <td>ncbi_chrom_acc</td>\n",
157
+ " </tr>\n",
158
+ " <tr>\n",
159
+ " <th>1</th>\n",
160
+ " <td>SRX20249462</td>\n",
161
+ " <td>tissue</td>\n",
162
+ " <td>Leaf (17 days)</td>\n",
163
+ " <td>TF ChIP-seq</td>\n",
164
+ " <td>NaN</td>\n",
165
+ " <td>FLAG</td>\n",
166
+ " <td>glycine_max</td>\n",
167
+ " <td>ncbi_chrom_acc</td>\n",
168
+ " </tr>\n",
169
+ " <tr>\n",
170
+ " <th>2</th>\n",
171
+ " <td>SRX21859141</td>\n",
172
+ " <td>tissue</td>\n",
173
+ " <td>Seed (60 days)</td>\n",
174
+ " <td>Histone ChIP-seq</td>\n",
175
+ " <td>NaN</td>\n",
176
+ " <td>H3K27me3</td>\n",
177
+ " <td>glycine_max</td>\n",
178
+ " <td>ncbi_chrom_acc</td>\n",
179
+ " </tr>\n",
180
+ " <tr>\n",
181
+ " <th>3</th>\n",
182
+ " <td>SRX21859142</td>\n",
183
+ " <td>tissue</td>\n",
184
+ " <td>Seed (60 days)</td>\n",
185
+ " <td>Histone ChIP-seq</td>\n",
186
+ " <td>NaN</td>\n",
187
+ " <td>H3K27me3</td>\n",
188
+ " <td>glycine_max</td>\n",
189
+ " <td>ncbi_chrom_acc</td>\n",
190
+ " </tr>\n",
191
+ " <tr>\n",
192
+ " <th>4</th>\n",
193
+ " <td>SRX21859143</td>\n",
194
+ " <td>tissue</td>\n",
195
+ " <td>Seed (60 days)</td>\n",
196
+ " <td>Histone ChIP-seq</td>\n",
197
+ " <td>NaN</td>\n",
198
+ " <td>H3K4me3</td>\n",
199
+ " <td>glycine_max</td>\n",
200
+ " <td>ncbi_chrom_acc</td>\n",
201
+ " </tr>\n",
202
+ " <tr>\n",
203
+ " <th>...</th>\n",
204
+ " <td>...</td>\n",
205
+ " <td>...</td>\n",
206
+ " <td>...</td>\n",
207
+ " <td>...</td>\n",
208
+ " <td>...</td>\n",
209
+ " <td>...</td>\n",
210
+ " <td>...</td>\n",
211
+ " <td>...</td>\n",
212
+ " </tr>\n",
213
+ " <tr>\n",
214
+ " <th>15884</th>\n",
215
+ " <td>GSM874952</td>\n",
216
+ " <td>Unknown</td>\n",
217
+ " <td>NaN</td>\n",
218
+ " <td>TF ChIP-seq</td>\n",
219
+ " <td>NaN</td>\n",
220
+ " <td>RPB2</td>\n",
221
+ " <td>mouse</td>\n",
222
+ " <td>geo</td>\n",
223
+ " </tr>\n",
224
+ " <tr>\n",
225
+ " <th>15885</th>\n",
226
+ " <td>GSM874953</td>\n",
227
+ " <td>Unknown</td>\n",
228
+ " <td>NaN</td>\n",
229
+ " <td>TF ChIP-seq</td>\n",
230
+ " <td>NaN</td>\n",
231
+ " <td>RPB2</td>\n",
232
+ " <td>mouse</td>\n",
233
+ " <td>geo</td>\n",
234
+ " </tr>\n",
235
+ " <tr>\n",
236
+ " <th>15886</th>\n",
237
+ " <td>GSM874954</td>\n",
238
+ " <td>Unknown</td>\n",
239
+ " <td>NaN</td>\n",
240
+ " <td>TF ChIP-seq</td>\n",
241
+ " <td>NaN</td>\n",
242
+ " <td>RPB2</td>\n",
243
+ " <td>mouse</td>\n",
244
+ " <td>geo</td>\n",
245
+ " </tr>\n",
246
+ " <tr>\n",
247
+ " <th>15887</th>\n",
248
+ " <td>GSM874955</td>\n",
249
+ " <td>Unknown</td>\n",
250
+ " <td>NaN</td>\n",
251
+ " <td>TF ChIP-seq</td>\n",
252
+ " <td>NaN</td>\n",
253
+ " <td>RPB2</td>\n",
254
+ " <td>mouse</td>\n",
255
+ " <td>geo</td>\n",
256
+ " </tr>\n",
257
+ " <tr>\n",
258
+ " <th>15888</th>\n",
259
+ " <td>GSM874956</td>\n",
260
+ " <td>Unknown</td>\n",
261
+ " <td>NaN</td>\n",
262
+ " <td>TF ChIP-seq</td>\n",
263
+ " <td>NaN</td>\n",
264
+ " <td>RPB2</td>\n",
265
+ " <td>mouse</td>\n",
266
+ " <td>geo</td>\n",
267
+ " </tr>\n",
268
+ " </tbody>\n",
269
+ "</table>\n",
270
+ "<p>15889 rows × 8 columns</p>\n",
271
+ "</div>"
272
+ ],
273
+ "text/plain": [
274
+ " file_id biosample_type tissue assay strand \\\n",
275
+ "0 SRX20249461 tissue Leaf (17 days) TF ChIP-seq NaN \n",
276
+ "1 SRX20249462 tissue Leaf (17 days) TF ChIP-seq NaN \n",
277
+ "2 SRX21859141 tissue Seed (60 days) Histone ChIP-seq NaN \n",
278
+ "3 SRX21859142 tissue Seed (60 days) Histone ChIP-seq NaN \n",
279
+ "4 SRX21859143 tissue Seed (60 days) Histone ChIP-seq NaN \n",
280
+ "... ... ... ... ... ... \n",
281
+ "15884 GSM874952 Unknown NaN TF ChIP-seq NaN \n",
282
+ "15885 GSM874953 Unknown NaN TF ChIP-seq NaN \n",
283
+ "15886 GSM874954 Unknown NaN TF ChIP-seq NaN \n",
284
+ "15887 GSM874955 Unknown NaN TF ChIP-seq NaN \n",
285
+ "15888 GSM874956 Unknown NaN TF ChIP-seq NaN \n",
286
+ "\n",
287
+ " experiment_target specie dataset \n",
288
+ "0 JMJ20 glycine_max ncbi_chrom_acc \n",
289
+ "1 FLAG glycine_max ncbi_chrom_acc \n",
290
+ "2 H3K27me3 glycine_max ncbi_chrom_acc \n",
291
+ "3 H3K27me3 glycine_max ncbi_chrom_acc \n",
292
+ "4 H3K4me3 glycine_max ncbi_chrom_acc \n",
293
+ "... ... ... ... \n",
294
+ "15884 RPB2 mouse geo \n",
295
+ "15885 RPB2 mouse geo \n",
296
+ "15886 RPB2 mouse geo \n",
297
+ "15887 RPB2 mouse geo \n",
298
+ "15888 RPB2 mouse geo \n",
299
+ "\n",
300
+ "[15889 rows x 8 columns]"
301
+ ]
302
+ },
303
+ "execution_count": 15,
304
+ "metadata": {},
305
+ "output_type": "execute_result"
306
+ }
307
+ ],
308
+ "source": [
309
+ "import pandas as pd\n",
310
+ "\n",
311
+ "df = pd.read_csv(\"/Users/b.dealmeida/Downloads/Supplementary_tables - Post-training functional tracks.tsv\", sep=\"\\t\")"
312
+ ]
313
+ },
314
+ {
315
+ "cell_type": "code",
316
+ "execution_count": 17,
317
+ "id": "5f686ba9",
318
+ "metadata": {},
319
+ "outputs": [
320
+ {
321
+ "data": {
322
+ "text/plain": [
323
+ "2765"
324
+ ]
325
+ },
326
+ "execution_count": 17,
327
+ "metadata": {},
328
+ "output_type": "execute_result"
329
+ }
330
+ ],
331
+ "source": [
332
+ "len(df.tissue.unique())"
333
+ ]
334
+ },
335
+ {
336
+ "cell_type": "code",
337
+ "execution_count": null,
338
  "id": "795a576f",
339
  "metadata": {},
340
  "outputs": [],
 
348
  "species = \"human\" # will use for condition the model on species\n",
349
  "assembly = \"hg38\" # will use for fetching the chromosome sequence\n",
350
  "chrom = \"chr19\"\n",
351
+ "start = 6_749_152\n",
352
+ "end = 6_781_920\n",
353
+ "# Using center 32kb window (32,768 bp) for faster inference\n",
354
  "\n",
355
  "# Optional\n",
356
  "HF_TOKEN = os.getenv(\"HF_TOKEN\", None)"
tabs/home.html CHANGED
@@ -49,30 +49,30 @@
49
  <th>Size</th>
50
  <th>Pre-training</th>
51
  <th>Post-training</th>
52
- <th>Tasks</th>
53
  </tr>
54
  </thead>
55
  <tbody>
56
  <tr>
57
  <td><strong>NTv3-8M</strong></td>
58
  <td>8M params</td>
59
- <td>MLM</td>
60
  <td>❌</td>
61
  <td>Embeddings, light inference</td>
62
  </tr>
63
  <tr>
64
  <td><strong>NTv3-100M</strong></td>
65
  <td>100M params</td>
66
- <td>MLM</td>
67
  <td><span class="checkmark">✅</span></td>
68
- <td>Tracks, annotation</td>
 
69
  </tr>
70
  <tr>
71
  <td><strong>NTv3-650M</strong></td>
72
  <td>650M params</td>
73
- <td>MLM</td>
74
  <td><span class="checkmark">✅</span></td>
75
- <td>Tracks, annotation, best accuracy</td>
 
76
  </tr>
77
  </tbody>
78
  </table>
@@ -86,7 +86,8 @@
86
  <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_tutorials/01_tracks_prediction.ipynb" target="_blank" rel="noopener noreferrer">📊 01 — Tracks prediction</a></li>
87
  <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_tutorials/02_fine_tuning.ipynb" target="_blank" rel="noopener noreferrer">🎯 02 — Fine-tune on bigwig tracks</a></li>
88
  <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_tutorials/03_model_interpretation.ipynb" target="_blank" rel="noopener noreferrer">🔍 03 — Model interpretation</a></li>
89
- <li>🧪 04 — Training NTv3 generative </li>
 
90
  </ul>
91
  </div>
92
  <div class="card">
@@ -95,7 +96,7 @@
95
  <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_pipelines/01_functional_track_prediction.ipynb" target="_blank" rel="noopener noreferrer">🎯 01 — Generate bigwig predictions for certain tracks</a></li>
96
  <li>🎯 02 — Fine-tune on bigwig tracks</li>
97
  <li>🔍 03 — Interpret a given genomic region</li>
98
- <li>🧪 04 — Sequence generation</li>
99
  </ul>
100
  </div>
101
  <div class="card">
 
49
  <th>Size</th>
50
  <th>Pre-training</th>
51
  <th>Post-training</th>
52
+ <th>Usage</th>
53
  </tr>
54
  </thead>
55
  <tbody>
56
  <tr>
57
  <td><strong>NTv3-8M</strong></td>
58
  <td>8M params</td>
59
+ <td><span class="checkmark">✅</span></td>
60
  <td>❌</td>
61
  <td>Embeddings, light inference</td>
62
  </tr>
63
  <tr>
64
  <td><strong>NTv3-100M</strong></td>
65
  <td>100M params</td>
 
66
  <td><span class="checkmark">✅</span></td>
67
+ <td><span class="checkmark">✅</span></td>
68
+ <td>Embeddings, Tracks, annotation</td>
69
  </tr>
70
  <tr>
71
  <td><strong>NTv3-650M</strong></td>
72
  <td>650M params</td>
 
73
  <td><span class="checkmark">✅</span></td>
74
+ <td><span class="checkmark">✅</span></td>
75
+ <td>Embeddings, Tracks, annotation, best accuracy</td>
76
  </tr>
77
  </tbody>
78
  </table>
 
86
  <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_tutorials/01_tracks_prediction.ipynb" target="_blank" rel="noopener noreferrer">📊 01 — Tracks prediction</a></li>
87
  <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_tutorials/02_fine_tuning.ipynb" target="_blank" rel="noopener noreferrer">🎯 02 — Fine-tune on bigwig tracks</a></li>
88
  <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_tutorials/03_model_interpretation.ipynb" target="_blank" rel="noopener noreferrer">🔍 03 — Model interpretation</a></li>
89
+ <li>🧪 04 — Training NTv3-generative <em>(coming soon)</em></li>
90
+ <li>🧪 05 — Generating enhancer sequences <em>(coming soon)</em></li>
91
  </ul>
92
  </div>
93
  <div class="card">
 
96
  <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_pipelines/01_functional_track_prediction.ipynb" target="_blank" rel="noopener noreferrer">🎯 01 — Generate bigwig predictions for certain tracks</a></li>
97
  <li>🎯 02 — Fine-tune on bigwig tracks</li>
98
  <li>🔍 03 — Interpret a given genomic region</li>
99
+ <li>🧪 04 — Sequence generation <em>(coming soon)</em></li>
100
  </ul>
101
  </div>
102
  <div class="card">