bernardo-de-almeida committed on
Commit
d297f70
·
1 Parent(s): 2dae583

small fixes and add interpretability pipeline notebook

Browse files
notebooks_pipelines/01_functional_track_prediction.ipynb CHANGED
@@ -297,7 +297,7 @@
297
  },
298
  {
299
  "cell_type": "code",
300
- "execution_count": null,
301
  "id": "84f013f6",
302
  "metadata": {},
303
  "outputs": [
@@ -340,14 +340,14 @@
340
  },
341
  {
342
  "cell_type": "code",
343
- "execution_count": null,
344
  "id": "0e373749",
345
  "metadata": {},
346
  "outputs": [
347
  {
348
  "data": {
349
  "text/html": [
350
- "<div id=\"jb_4813434_buttons\"></div><div id=\"jb_4813434_igvcontainer\"></div>"
351
  ],
352
  "text/plain": [
353
  "<IPython.core.display.HTML object>"
@@ -358,7 +358,7 @@
358
  },
359
  {
360
  "data": {
361
- "application/javascript": "window.igv.MessageHandler.on({\"id\": \"jb_4813434\", \"command\": \"createBrowser\", \"data\": {\"genome\": \"hg38\", \"locus\": \"chr19:6740960-6790112\", \"tracks\": [{\"name\": \"K562 RNA-seq\", \"format\": \"bigwig\", \"url\": \"bigwig_outputs/K562_RNA_seq.bw\", \"height\": 70, \"autoscale\": true, \"displayMode\": \"EXPANDED\"}, {\"name\": \"K562 DNAse\", \"format\": \"bigwig\", \"url\": \"bigwig_outputs/K562_DNAse.bw\", \"height\": 70, \"autoscale\": true, \"displayMode\": \"EXPANDED\"}, {\"name\": \"K562 H3k4me3\", \"format\": \"bigwig\", \"url\": \"bigwig_outputs/K562_H3k4me3.bw\", \"height\": 70, \"autoscale\": true, \"displayMode\": \"EXPANDED\"}, {\"name\": \"K562 CTCF\", \"format\": \"bigwig\", \"url\": \"bigwig_outputs/K562_CTCF.bw\", \"height\": 70, \"autoscale\": true, \"displayMode\": \"EXPANDED\"}, {\"name\": \"HepG2 RNA-seq\", \"format\": \"bigwig\", \"url\": \"bigwig_outputs/HepG2_RNA_seq.bw\", \"height\": 70, \"autoscale\": true, \"displayMode\": \"EXPANDED\"}, {\"name\": \"HepG2 DNAse\", \"format\": \"bigwig\", \"url\": \"bigwig_outputs/HepG2_DNAse.bw\", \"height\": 70, \"autoscale\": true, \"displayMode\": \"EXPANDED\"}, {\"name\": \"HepG2 H3k4me3\", \"format\": \"bigwig\", \"url\": \"bigwig_outputs/HepG2_H3k4me3.bw\", \"height\": 70, \"autoscale\": true, \"displayMode\": \"EXPANDED\"}, {\"name\": \"HepG2 CTCF\", \"format\": \"bigwig\", \"url\": \"bigwig_outputs/HepG2_CTCF.bw\", \"height\": 70, \"autoscale\": true, \"displayMode\": \"EXPANDED\"}], \"id\": \"jb_4813434\"}})",
362
  "text/plain": [
363
  "<IPython.core.display.Javascript object>"
364
  ]
@@ -369,10 +369,10 @@
369
  {
370
  "data": {
371
  "text/plain": [
372
- "<igv_notebook.browser.Browser at 0x33ddd15a0>"
373
  ]
374
  },
375
- "execution_count": 39,
376
  "metadata": {},
377
  "output_type": "execute_result"
378
  }
 
297
  },
298
  {
299
  "cell_type": "code",
300
+ "execution_count": 7,
301
  "id": "84f013f6",
302
  "metadata": {},
303
  "outputs": [
 
340
  },
341
  {
342
  "cell_type": "code",
343
+ "execution_count": 8,
344
  "id": "0e373749",
345
  "metadata": {},
346
  "outputs": [
347
  {
348
  "data": {
349
  "text/html": [
350
+ "<div id=\"jb_8720993_buttons\"></div><div id=\"jb_8720993_igvcontainer\"></div>"
351
  ],
352
  "text/plain": [
353
  "<IPython.core.display.HTML object>"
 
358
  },
359
  {
360
  "data": {
361
+ "application/javascript": "window.igv.MessageHandler.on({\"id\": \"jb_8720993\", \"command\": \"createBrowser\", \"data\": {\"genome\": \"hg38\", \"locus\": \"chr19:6740960-6790112\", \"tracks\": [{\"name\": \"K562 RNA-seq\", \"format\": \"bigwig\", \"url\": \"bigwig_outputs/K562_RNA_seq.bw\", \"height\": 70, \"autoscale\": true, \"displayMode\": \"EXPANDED\"}, {\"name\": \"K562 DNAse\", \"format\": \"bigwig\", \"url\": \"bigwig_outputs/K562_DNAse.bw\", \"height\": 70, \"autoscale\": true, \"displayMode\": \"EXPANDED\"}, {\"name\": \"K562 H3k4me3\", \"format\": \"bigwig\", \"url\": \"bigwig_outputs/K562_H3k4me3.bw\", \"height\": 70, \"autoscale\": true, \"displayMode\": \"EXPANDED\"}, {\"name\": \"K562 CTCF\", \"format\": \"bigwig\", \"url\": \"bigwig_outputs/K562_CTCF.bw\", \"height\": 70, \"autoscale\": true, \"displayMode\": \"EXPANDED\"}, {\"name\": \"HepG2 RNA-seq\", \"format\": \"bigwig\", \"url\": \"bigwig_outputs/HepG2_RNA_seq.bw\", \"height\": 70, \"autoscale\": true, \"displayMode\": \"EXPANDED\"}, {\"name\": \"HepG2 DNAse\", \"format\": \"bigwig\", \"url\": \"bigwig_outputs/HepG2_DNAse.bw\", \"height\": 70, \"autoscale\": true, \"displayMode\": \"EXPANDED\"}, {\"name\": \"HepG2 H3k4me3\", \"format\": \"bigwig\", \"url\": \"bigwig_outputs/HepG2_H3k4me3.bw\", \"height\": 70, \"autoscale\": true, \"displayMode\": \"EXPANDED\"}, {\"name\": \"HepG2 CTCF\", \"format\": \"bigwig\", \"url\": \"bigwig_outputs/HepG2_CTCF.bw\", \"height\": 70, \"autoscale\": true, \"displayMode\": \"EXPANDED\"}], \"id\": \"jb_8720993\"}})",
362
  "text/plain": [
363
  "<IPython.core.display.Javascript object>"
364
  ]
 
369
  {
370
  "data": {
371
  "text/plain": [
372
+ "<igv_notebook.browser.Browser at 0x33bd035b0>"
373
  ]
374
  },
375
+ "execution_count": 8,
376
  "metadata": {},
377
  "output_type": "execute_result"
378
  }
notebooks_pipelines/02_functional_interpretation.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks_tutorials/00_quickstart_inference.ipynb CHANGED
@@ -80,6 +80,30 @@
80
  "print(\"torch_dtype:\", torch_dtype)"
81
  ]
82
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  {
84
  "cell_type": "markdown",
85
  "id": "82146876",
@@ -95,7 +119,7 @@
95
  },
96
  {
97
  "cell_type": "code",
98
- "execution_count": 3,
99
  "id": "336bb40c",
100
  "metadata": {},
101
  "outputs": [
@@ -103,8 +127,7 @@
103
  "name": "stdout",
104
  "output_type": "stream",
105
  "text": [
106
- "torch.Size([2, 128, 11])\n",
107
- "MLM logits shape: (2, 128, 11)\n"
108
  ]
109
  }
110
  ],
@@ -115,13 +138,11 @@
115
  "tok_pre = AutoTokenizer.from_pretrained(pretrained_model_name, trust_remote_code=True)\n",
116
  "model_pre = AutoModelForMaskedLM.from_pretrained(pretrained_model_name, trust_remote_code=True)\n",
117
  "\n",
118
- "# Example: human sequence\n",
119
- "seqs = [\"ATCGNATCG\", \"ACGT\"]\n",
120
  "batch = tok_pre(seqs, add_special_tokens=False, padding=True, pad_to_multiple_of=128, return_tensors=\"pt\")\n",
121
  "out = model_pre(**batch)\n",
122
  "\n",
123
- "print(out.logits.shape) # (B, L, V = 11)\n",
124
- "\n",
125
  "# Access MLM logits\n",
126
  "mlm_logits = out[\"logits\"]\n",
127
  "print(\"MLM logits shape:\", tuple(mlm_logits.shape))"
@@ -144,7 +165,7 @@
144
  },
145
  {
146
  "cell_type": "code",
147
- "execution_count": 4,
148
  "id": "6cc5f2df",
149
  "metadata": {},
150
  "outputs": [
@@ -153,9 +174,9 @@
153
  "output_type": "stream",
154
  "text": [
155
  "Supported species: dict_keys(['<bos>', '<cls>', '<eos>', '<mask>', '<pad>', '<unk>', 'amphiprion_ocellaris', 'arabidopsis_thaliana', 'bison_bison_bison', 'caenorhabditis_elegans', 'canis_lupus_familiaris', 'chinchilla_lanigera', 'ciona_intestinalis', 'danio_rerio', 'drosophila_melanogaster', 'felis_catus', 'gallus_gallus', 'glycine_max', 'gorilla_gorilla', 'gossypium_hirsutum', 'human', 'macaca_nemestrina', 'mouse', 'oryza_sativa', 'rattus_norvegicus', 'salmo_trutta', 'serinus_canaria', 'tetraodon_nigroviridis', 'triticum_aestivum', 'zea_mays'])\n",
156
- "bigwig_tracks_logits: (2, 48, 7362)\n",
157
- "bed_tracks_logits: (2, 48, 21, 2)\n",
158
- "language model logits: (2, 128, 11)\n"
159
  ]
160
  }
161
  ],
@@ -166,12 +187,12 @@
166
  "tok_post = AutoTokenizer.from_pretrained(post_trained_model_name, trust_remote_code=True)\n",
167
  "model_post = AutoModel.from_pretrained(post_trained_model_name, trust_remote_code=True)\n",
168
  "\n",
169
- "# Prepare inputs\n",
170
- "batch = tok_post([\"ATCGNATCG\", \"ACGT\"], add_special_tokens=False, padding=True, pad_to_multiple_of=128, return_tensors=\"pt\")\n",
171
  "\n",
172
  "# To show all supported species: \n",
173
  "print(\"Supported species:\", model_post.config.species_to_token_id.keys())\n",
174
- "# Species tokens\n",
175
  "species = ['human', 'mouse']\n",
176
  "species_ids = model_post.encode_species(species)\n",
177
  "\n",
@@ -188,14 +209,6 @@
188
  "# Language model logits for whole sequence over vocabulary\n",
189
  "print(\"language model logits:\", tuple(out[\"logits\"].shape))\n"
190
  ]
191
- },
192
- {
193
- "cell_type": "code",
194
- "execution_count": null,
195
- "id": "037076cd",
196
- "metadata": {},
197
- "outputs": [],
198
- "source": []
199
  }
200
  ],
201
  "metadata": {
 
80
  "print(\"torch_dtype:\", torch_dtype)"
81
  ]
82
  },
83
+ {
84
+ "cell_type": "code",
85
+ "execution_count": 3,
86
+ "id": "ef0e6d69",
87
+ "metadata": {},
88
+ "outputs": [
89
+ {
90
+ "name": "stdout",
91
+ "output_type": "stream",
92
+ "text": [
93
+ " Sequence lengths: [128, 512]\n"
94
+ ]
95
+ }
96
+ ],
97
+ "source": [
98
+ "# Dummy DNA sequences\n",
99
+ "seqs = [\n",
100
+ " \"ACGT\" * 32,\n",
101
+ " \"ACGT\" * 128\n",
102
+ "]\n",
103
+ "\n",
104
+ "print(\" Sequence lengths:\", [len(s) for s in seqs])"
105
+ ]
106
+ },
107
  {
108
  "cell_type": "markdown",
109
  "id": "82146876",
 
119
  },
120
  {
121
  "cell_type": "code",
122
+ "execution_count": null,
123
  "id": "336bb40c",
124
  "metadata": {},
125
  "outputs": [
 
127
  "name": "stdout",
128
  "output_type": "stream",
129
  "text": [
130
+ "MLM logits shape: (2, 512, 11)\n"
 
131
  ]
132
  }
133
  ],
 
138
  "tok_pre = AutoTokenizer.from_pretrained(pretrained_model_name, trust_remote_code=True)\n",
139
  "model_pre = AutoModelForMaskedLM.from_pretrained(pretrained_model_name, trust_remote_code=True)\n",
140
  "\n",
141
+ "# Example inference\n",
142
+ "# Tokenization will pad all sequences to multiple of 128\n",
143
  "batch = tok_pre(seqs, add_special_tokens=False, padding=True, pad_to_multiple_of=128, return_tensors=\"pt\")\n",
144
  "out = model_pre(**batch)\n",
145
  "\n",
 
 
146
  "# Access MLM logits\n",
147
  "mlm_logits = out[\"logits\"]\n",
148
  "print(\"MLM logits shape:\", tuple(mlm_logits.shape))"
 
165
  },
166
  {
167
  "cell_type": "code",
168
+ "execution_count": null,
169
  "id": "6cc5f2df",
170
  "metadata": {},
171
  "outputs": [
 
174
  "output_type": "stream",
175
  "text": [
176
  "Supported species: dict_keys(['<bos>', '<cls>', '<eos>', '<mask>', '<pad>', '<unk>', 'amphiprion_ocellaris', 'arabidopsis_thaliana', 'bison_bison_bison', 'caenorhabditis_elegans', 'canis_lupus_familiaris', 'chinchilla_lanigera', 'ciona_intestinalis', 'danio_rerio', 'drosophila_melanogaster', 'felis_catus', 'gallus_gallus', 'glycine_max', 'gorilla_gorilla', 'gossypium_hirsutum', 'human', 'macaca_nemestrina', 'mouse', 'oryza_sativa', 'rattus_norvegicus', 'salmo_trutta', 'serinus_canaria', 'tetraodon_nigroviridis', 'triticum_aestivum', 'zea_mays'])\n",
177
+ "bigwig_tracks_logits: (2, 192, 7362)\n",
178
+ "bed_tracks_logits: (2, 192, 21, 2)\n",
179
+ "language model logits: (2, 512, 11)\n"
180
  ]
181
  }
182
  ],
 
187
  "tok_post = AutoTokenizer.from_pretrained(post_trained_model_name, trust_remote_code=True)\n",
188
  "model_post = AutoModel.from_pretrained(post_trained_model_name, trust_remote_code=True)\n",
189
  "\n",
190
+ "# Prepare inputs - tokenization will pad all sequences to multiple of 128\n",
191
+ "batch = tok_post(seqs, add_special_tokens=False, padding=True, pad_to_multiple_of=128, return_tensors=\"pt\")\n",
192
  "\n",
193
  "# To show all supported species: \n",
194
  "print(\"Supported species:\", model_post.config.species_to_token_id.keys())\n",
195
+ "# Species tokens (one per sequence)\n",
196
  "species = ['human', 'mouse']\n",
197
  "species_ids = model_post.encode_species(species)\n",
198
  "\n",
 
209
  "# Language model logits for whole sequence over vocabulary\n",
210
  "print(\"language model logits:\", tuple(out[\"logits\"].shape))\n"
211
  ]
 
 
 
 
 
 
 
 
212
  }
213
  ],
214
  "metadata": {
notebooks_tutorials/01_tracks_prediction.ipynb CHANGED
@@ -106,232 +106,6 @@
106
  "Set your NTv3 model and genomic window here"
107
  ]
108
  },
109
- {
110
- "cell_type": "code",
111
- "execution_count": null,
112
- "id": "6193fd37",
113
- "metadata": {},
114
- "outputs": [
115
- {
116
- "data": {
117
- "text/html": [
118
- "<div>\n",
119
- "<style scoped>\n",
120
- " .dataframe tbody tr th:only-of-type {\n",
121
- " vertical-align: middle;\n",
122
- " }\n",
123
- "\n",
124
- " .dataframe tbody tr th {\n",
125
- " vertical-align: top;\n",
126
- " }\n",
127
- "\n",
128
- " .dataframe thead th {\n",
129
- " text-align: right;\n",
130
- " }\n",
131
- "</style>\n",
132
- "<table border=\"1\" class=\"dataframe\">\n",
133
- " <thead>\n",
134
- " <tr style=\"text-align: right;\">\n",
135
- " <th></th>\n",
136
- " <th>file_id</th>\n",
137
- " <th>biosample_type</th>\n",
138
- " <th>tissue</th>\n",
139
- " <th>assay</th>\n",
140
- " <th>strand</th>\n",
141
- " <th>experiment_target</th>\n",
142
- " <th>specie</th>\n",
143
- " <th>dataset</th>\n",
144
- " </tr>\n",
145
- " </thead>\n",
146
- " <tbody>\n",
147
- " <tr>\n",
148
- " <th>0</th>\n",
149
- " <td>SRX20249461</td>\n",
150
- " <td>tissue</td>\n",
151
- " <td>Leaf (17 days)</td>\n",
152
- " <td>TF ChIP-seq</td>\n",
153
- " <td>NaN</td>\n",
154
- " <td>JMJ20</td>\n",
155
- " <td>glycine_max</td>\n",
156
- " <td>ncbi_chrom_acc</td>\n",
157
- " </tr>\n",
158
- " <tr>\n",
159
- " <th>1</th>\n",
160
- " <td>SRX20249462</td>\n",
161
- " <td>tissue</td>\n",
162
- " <td>Leaf (17 days)</td>\n",
163
- " <td>TF ChIP-seq</td>\n",
164
- " <td>NaN</td>\n",
165
- " <td>FLAG</td>\n",
166
- " <td>glycine_max</td>\n",
167
- " <td>ncbi_chrom_acc</td>\n",
168
- " </tr>\n",
169
- " <tr>\n",
170
- " <th>2</th>\n",
171
- " <td>SRX21859141</td>\n",
172
- " <td>tissue</td>\n",
173
- " <td>Seed (60 days)</td>\n",
174
- " <td>Histone ChIP-seq</td>\n",
175
- " <td>NaN</td>\n",
176
- " <td>H3K27me3</td>\n",
177
- " <td>glycine_max</td>\n",
178
- " <td>ncbi_chrom_acc</td>\n",
179
- " </tr>\n",
180
- " <tr>\n",
181
- " <th>3</th>\n",
182
- " <td>SRX21859142</td>\n",
183
- " <td>tissue</td>\n",
184
- " <td>Seed (60 days)</td>\n",
185
- " <td>Histone ChIP-seq</td>\n",
186
- " <td>NaN</td>\n",
187
- " <td>H3K27me3</td>\n",
188
- " <td>glycine_max</td>\n",
189
- " <td>ncbi_chrom_acc</td>\n",
190
- " </tr>\n",
191
- " <tr>\n",
192
- " <th>4</th>\n",
193
- " <td>SRX21859143</td>\n",
194
- " <td>tissue</td>\n",
195
- " <td>Seed (60 days)</td>\n",
196
- " <td>Histone ChIP-seq</td>\n",
197
- " <td>NaN</td>\n",
198
- " <td>H3K4me3</td>\n",
199
- " <td>glycine_max</td>\n",
200
- " <td>ncbi_chrom_acc</td>\n",
201
- " </tr>\n",
202
- " <tr>\n",
203
- " <th>...</th>\n",
204
- " <td>...</td>\n",
205
- " <td>...</td>\n",
206
- " <td>...</td>\n",
207
- " <td>...</td>\n",
208
- " <td>...</td>\n",
209
- " <td>...</td>\n",
210
- " <td>...</td>\n",
211
- " <td>...</td>\n",
212
- " </tr>\n",
213
- " <tr>\n",
214
- " <th>15884</th>\n",
215
- " <td>GSM874952</td>\n",
216
- " <td>Unknown</td>\n",
217
- " <td>NaN</td>\n",
218
- " <td>TF ChIP-seq</td>\n",
219
- " <td>NaN</td>\n",
220
- " <td>RPB2</td>\n",
221
- " <td>mouse</td>\n",
222
- " <td>geo</td>\n",
223
- " </tr>\n",
224
- " <tr>\n",
225
- " <th>15885</th>\n",
226
- " <td>GSM874953</td>\n",
227
- " <td>Unknown</td>\n",
228
- " <td>NaN</td>\n",
229
- " <td>TF ChIP-seq</td>\n",
230
- " <td>NaN</td>\n",
231
- " <td>RPB2</td>\n",
232
- " <td>mouse</td>\n",
233
- " <td>geo</td>\n",
234
- " </tr>\n",
235
- " <tr>\n",
236
- " <th>15886</th>\n",
237
- " <td>GSM874954</td>\n",
238
- " <td>Unknown</td>\n",
239
- " <td>NaN</td>\n",
240
- " <td>TF ChIP-seq</td>\n",
241
- " <td>NaN</td>\n",
242
- " <td>RPB2</td>\n",
243
- " <td>mouse</td>\n",
244
- " <td>geo</td>\n",
245
- " </tr>\n",
246
- " <tr>\n",
247
- " <th>15887</th>\n",
248
- " <td>GSM874955</td>\n",
249
- " <td>Unknown</td>\n",
250
- " <td>NaN</td>\n",
251
- " <td>TF ChIP-seq</td>\n",
252
- " <td>NaN</td>\n",
253
- " <td>RPB2</td>\n",
254
- " <td>mouse</td>\n",
255
- " <td>geo</td>\n",
256
- " </tr>\n",
257
- " <tr>\n",
258
- " <th>15888</th>\n",
259
- " <td>GSM874956</td>\n",
260
- " <td>Unknown</td>\n",
261
- " <td>NaN</td>\n",
262
- " <td>TF ChIP-seq</td>\n",
263
- " <td>NaN</td>\n",
264
- " <td>RPB2</td>\n",
265
- " <td>mouse</td>\n",
266
- " <td>geo</td>\n",
267
- " </tr>\n",
268
- " </tbody>\n",
269
- "</table>\n",
270
- "<p>15889 rows × 8 columns</p>\n",
271
- "</div>"
272
- ],
273
- "text/plain": [
274
- " file_id biosample_type tissue assay strand \\\n",
275
- "0 SRX20249461 tissue Leaf (17 days) TF ChIP-seq NaN \n",
276
- "1 SRX20249462 tissue Leaf (17 days) TF ChIP-seq NaN \n",
277
- "2 SRX21859141 tissue Seed (60 days) Histone ChIP-seq NaN \n",
278
- "3 SRX21859142 tissue Seed (60 days) Histone ChIP-seq NaN \n",
279
- "4 SRX21859143 tissue Seed (60 days) Histone ChIP-seq NaN \n",
280
- "... ... ... ... ... ... \n",
281
- "15884 GSM874952 Unknown NaN TF ChIP-seq NaN \n",
282
- "15885 GSM874953 Unknown NaN TF ChIP-seq NaN \n",
283
- "15886 GSM874954 Unknown NaN TF ChIP-seq NaN \n",
284
- "15887 GSM874955 Unknown NaN TF ChIP-seq NaN \n",
285
- "15888 GSM874956 Unknown NaN TF ChIP-seq NaN \n",
286
- "\n",
287
- " experiment_target specie dataset \n",
288
- "0 JMJ20 glycine_max ncbi_chrom_acc \n",
289
- "1 FLAG glycine_max ncbi_chrom_acc \n",
290
- "2 H3K27me3 glycine_max ncbi_chrom_acc \n",
291
- "3 H3K27me3 glycine_max ncbi_chrom_acc \n",
292
- "4 H3K4me3 glycine_max ncbi_chrom_acc \n",
293
- "... ... ... ... \n",
294
- "15884 RPB2 mouse geo \n",
295
- "15885 RPB2 mouse geo \n",
296
- "15886 RPB2 mouse geo \n",
297
- "15887 RPB2 mouse geo \n",
298
- "15888 RPB2 mouse geo \n",
299
- "\n",
300
- "[15889 rows x 8 columns]"
301
- ]
302
- },
303
- "execution_count": 15,
304
- "metadata": {},
305
- "output_type": "execute_result"
306
- }
307
- ],
308
- "source": [
309
- "import pandas as pd\n",
310
- "\n",
311
- "df = pd.read_csv(\"/Users/b.dealmeida/Downloads/Supplementary_tables - Post-training functional tracks.tsv\", sep=\"\\t\")"
312
- ]
313
- },
314
- {
315
- "cell_type": "code",
316
- "execution_count": 17,
317
- "id": "5f686ba9",
318
- "metadata": {},
319
- "outputs": [
320
- {
321
- "data": {
322
- "text/plain": [
323
- "2765"
324
- ]
325
- },
326
- "execution_count": 17,
327
- "metadata": {},
328
- "output_type": "execute_result"
329
- }
330
- ],
331
- "source": [
332
- "len(df.tissue.unique())"
333
- ]
334
- },
335
  {
336
  "cell_type": "code",
337
  "execution_count": null,
@@ -348,9 +122,9 @@
348
  "species = \"human\" # will use for condition the model on species\n",
349
  "assembly = \"hg38\" # will use for fetching the chromosome sequence\n",
350
  "chrom = \"chr19\"\n",
351
- "start = 6_749_152\n",
352
- "end = 6_781_920\n",
353
- "# Using center 32kb window (32,768 bp) for faster inference\n",
354
  "\n",
355
  "# Optional\n",
356
  "HF_TOKEN = os.getenv(\"HF_TOKEN\", None)"
 
106
  "Set your NTv3 model and genomic window here"
107
  ]
108
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  {
110
  "cell_type": "code",
111
  "execution_count": null,
 
122
  "species = \"human\" # will use for condition the model on species\n",
123
  "assembly = \"hg38\" # will use for fetching the chromosome sequence\n",
124
  "chrom = \"chr19\"\n",
125
+ "start = 6_700_000\n",
126
+ "end = 6_765_536\n",
127
+ "# Limiting to 65kb to work on Google Colab T4 GPU -> increase up to 1 million nucleotides if you have a better GPU\n",
128
  "\n",
129
  "# Optional\n",
130
  "HF_TOKEN = os.getenv(\"HF_TOKEN\", None)"
tabs/home.html CHANGED
@@ -94,9 +94,8 @@
94
  <h2>📓 Pipeline notebooks (browse <a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/tree/main/notebooks_pipelines" target="_blank" rel="noopener noreferrer">folder</a>)</h2>
95
  <ul>
96
  <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_pipelines/01_functional_track_prediction.ipynb" target="_blank" rel="noopener noreferrer">🎯 01 — Generate bigwig predictions for certain tracks</a></li>
97
- <li>🎯 02 — Fine-tune on bigwig tracks</li>
98
- <li>🔍 03 — Interpret a given genomic region</li>
99
- <li>🧪 04 — Sequence generation <em>(coming soon)</em></li>
100
  </ul>
101
  </div>
102
  <div class="card">
 
94
  <h2>📓 Pipeline notebooks (browse <a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/tree/main/notebooks_pipelines" target="_blank" rel="noopener noreferrer">folder</a>)</h2>
95
  <ul>
96
  <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_pipelines/01_functional_track_prediction.ipynb" target="_blank" rel="noopener noreferrer">🎯 01 — Generate bigwig predictions for certain tracks</a></li>
97
+ <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_pipelines/02_functional_interpretation.ipynb" target="_blank" rel="noopener noreferrer">🔍 02 — Interpret a given genomic region</a></li>
98
+ <li>🧪 03 — Sequence generation <em>(coming soon)</em></li>
 
99
  </ul>
100
  </div>
101
  <div class="card">