Spaces:

InstaDeepAI
/

ntv3

Running

App Files Files Community

bernardo-de-almeida committed on Dec 19, 2025

Commit

9759882

1 Parent(s): 1dc15bb

fix: notebooks

Browse files

Files changed (4) hide show

notebooks_pipelines/01_functional_track_prediction.ipynb +2 -2
notebooks_tutorials/00_quickstart_inference.ipynb +27 -186
notebooks_tutorials/01_tracks_prediction.ipynb +26 -41
tabs/home.html +1 -7

notebooks_pipelines/01_functional_track_prediction.ipynb CHANGED Viewed

@@ -64,13 +64,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
    "id": "423af70a",
    "metadata": {},
    "outputs": [],
    "source": [
     "# Define the model and genomic window\n",
-    "model_name = \"InstaDeepAI/NTv3_650M_pos\"\n",
     "\n",
     "species = \"human\"  # used to condition the model on species\n",
     "assembly = \"hg38\"  # used to fetch the chromosome sequence\n",

   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "id": "423af70a",
    "metadata": {},
    "outputs": [],
    "source": [
     "# Define the model and genomic window\n",
+    "model_name = \"InstaDeepAI/NTv3_650M_post\"\n",
     "\n",
     "species = \"human\"  # used to condition the model on species\n",
     "assembly = \"hg38\"  # used to fetch the chromosome sequence\n",

notebooks_tutorials/00_quickstart_inference.ipynb CHANGED Viewed

@@ -10,7 +10,7 @@
         "This notebook demonstrates how to run **quick inference** with both the pre- and post-trained NTv3 checkpoints:\n",
         "\n",
         "- **Pre-trained (MLM-focused):** `InstaDeepAI/NTv3_8M_pre`, `InstaDeepAI/NTv3_100M_pre`, `InstaDeepAI/NTv3_650M_pre`\n",
-        "- **Post-trained (functional tracks and genome annotation):** `InstaDeepAI/NTv3_100M_pos`, `InstaDeepAI/NTv3_650M_pos`\n",
         "\n",
         "We show how to:\n",
         "\n",
@@ -31,7 +31,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "id": "38cc32a9",
       "metadata": {},
       "outputs": [],
@@ -41,7 +41,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 3,
       "id": "d56c105b",
       "metadata": {},
       "outputs": [
@@ -95,156 +95,15 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "id": "336bb40c",
       "metadata": {},
       "outputs": [
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "411ee47e94ae467f9685c35b65e3e52d",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]"
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "30447edb44b849bd936290f3a6b1b863",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "tokenization_ntv3.py:   0%|          | 0.00/12.0k [00:00<?, ?B/s]"
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/ntv3_base_model:\n",
-            "- tokenization_ntv3.py\n",
-            ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n"
-          ]
-        },
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "766f183dcc84421588e5cf0241d3efe7",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "vocab.json:   0%|          | 0.00/138 [00:00<?, ?B/s]"
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "b0db83f7cb824d3288a30bebf7891a63",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "special_tokens_map.json:   0%|          | 0.00/149 [00:00<?, ?B/s]"
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "33cf5391dcc549f088e4e927651d1cdb",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "config.json:   0%|          | 0.00/1.70k [00:00<?, ?B/s]"
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "85772d5369234ca286cfa518e1725b12",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "configuration_ntv3.py:   0%|          | 0.00/5.90k [00:00<?, ?B/s]"
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/ntv3_base_model:\n",
-            "- configuration_ntv3.py\n",
-            ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n"
-          ]
-        },
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "ec1153d073e444c5b255ee5adea6ba68",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "modeling_ntv3_base.py:   0%|          | 0.00/33.9k [00:00<?, ?B/s]"
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/ntv3_base_model:\n",
-            "- modeling_ntv3_base.py\n",
-            ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n"
-          ]
-        },
-        {
-          "data": {
-            "application/vnd.jupyter.widget-view+json": {
-              "model_id": "94b9bb7fe0da4f4994adb9127d9af7e6",
-              "version_major": 2,
-              "version_minor": 0
-            },
-            "text/plain": [
-              "model.safetensors:   0%|          | 0.00/30.8M [00:00<?, ?B/s]"
-            ]
-          },
-          "metadata": {},
-          "output_type": "display_data"
-        },
         {
           "name": "stdout",
           "output_type": "stream",
           "text": [
             "torch.Size([2, 128, 11])\n",
-            "16\n",
-            "2\n",
             "MLM logits shape: (2, 128, 11)\n"
           ]
         }
@@ -259,11 +118,9 @@
         "# Example: human sequence\n",
         "seqs = [\"ATCGNATCG\", \"ACGT\"]\n",
         "batch = tok_pre(seqs, add_special_tokens=False, padding=True, pad_to_multiple_of=128, return_tensors=\"pt\")\n",
-        "out = model_pre(**batch, output_hidden_states=True, output_attentions=True)\n",
         "\n",
         "print(out.logits.shape)       # (B, L, V = 11)\n",
-        "print(len(out.hidden_states)) # convs + transformers + deconvs\n",
-        "print(len(out.attentions))\n",
         "\n",
         "# Access MLM logits\n",
         "mlm_logits = out[\"logits\"]\n",
@@ -279,10 +136,6 @@
         "\n",
         "Post-trained checkpoints add task-specific heads for functional track prediction and genome annotation.\n",
         "\n",
-        "In particular:\n",
-        "- `species_tokenizer` is used to tokenize a species condition like `\"human\"`\n",
-        "- `species_ids` passes the species tokens to the model\n",
-        "\n",
         "Expected outputs:\n",
         "- `bigwig_tracks_logits`: functional track predictions\n",
         "- `bed_tracks_logits`: genome annotation predictions\n",
@@ -291,31 +144,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 9,
-      "id": "bdb8c4d1",
-      "metadata": {},
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Model supported species: TO BE DONE\n"
-          ]
-        }
-      ],
-      "source": [
-        "# Inspect config and supported species\n",
-        "post_trained_model_name = \"InstaDeepAI/NTv3_100M_pos\"\n",
-        "\n",
-        "cfg_post = AutoConfig.from_pretrained(post_trained_model_name, trust_remote_code=True)\n",
-        "\n",
-        "species = \"TO BE DONE\"\n",
-        "print(\"Model supported species:\", species)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
       "id": "6cc5f2df",
       "metadata": {},
       "outputs": [
@@ -323,29 +152,33 @@
           "name": "stdout",
           "output_type": "stream",
           "text": [
-            "torch.Size([1, 768, 7362])\n",
-            "torch.Size([1, 768, 21, 2])\n",
-            "torch.Size([1, 2048, 11])\n"
           ]
         }
       ],
       "source": [
         "tok_post = AutoTokenizer.from_pretrained(post_trained_model_name, trust_remote_code=True)\n",
-        "cond_tok_post = AutoTokenizer.from_pretrained(post_trained_model_name, subfolder='species_tokenizer', trust_remote_code=True)\n",
         "model_post = AutoModel.from_pretrained(post_trained_model_name, trust_remote_code=True)\n",
         "\n",
         "# Prepare inputs\n",
         "batch = tok_post([\"ATCGNATCG\", \"ACGT\"], add_special_tokens=False, padding=True, pad_to_multiple_of=128, return_tensors=\"pt\")\n",
         "\n",
-        "# Condition tokens (e.g., species)\n",
-        "species = 'human'\n",
-        "species_ids = cond_tok_post([species] * len(batch['input_ids']), add_special_tokens=False, return_tensors='pt')\n",
         "\n",
         "# Forward pass\n",
         "out = model_post(\n",
         "    input_ids=batch[\"input_ids\"],\n",
-        "    species_ids=species_ids['input_ids'],\n",
-        "    return_dict=True\n",
         ")\n",
         "\n",
         "# 7k human tracks over 37.5 % center region of the input sequence\n",
@@ -355,6 +188,14 @@
         "# Language model logits for whole sequence over vocabulary\n",
         "print(\"language model logits:\", tuple(out[\"logits\"].shape))\n"
       ]
     }
   ],
   "metadata": {

         "This notebook demonstrates how to run **quick inference** with both the pre- and post-trained NTv3 checkpoints:\n",
         "\n",
         "- **Pre-trained (MLM-focused):** `InstaDeepAI/NTv3_8M_pre`, `InstaDeepAI/NTv3_100M_pre`, `InstaDeepAI/NTv3_650M_pre`\n",
+        "- **Post-trained (functional tracks and genome annotation):** `InstaDeepAI/NTv3_100M_post`, `InstaDeepAI/NTv3_650M_post`\n",
         "\n",
         "We show how to:\n",
         "\n",
     },
     {
       "cell_type": "code",
+      "execution_count": 1,
       "id": "38cc32a9",
       "metadata": {},
       "outputs": [],
     },
     {
       "cell_type": "code",
+      "execution_count": 2,
       "id": "d56c105b",
       "metadata": {},
       "outputs": [
     },
     {
       "cell_type": "code",
+      "execution_count": 3,
       "id": "336bb40c",
       "metadata": {},
       "outputs": [
         {
           "name": "stdout",
           "output_type": "stream",
           "text": [
             "torch.Size([2, 128, 11])\n",
             "MLM logits shape: (2, 128, 11)\n"
           ]
         }
         "# Example: human sequence\n",
         "seqs = [\"ATCGNATCG\", \"ACGT\"]\n",
         "batch = tok_pre(seqs, add_special_tokens=False, padding=True, pad_to_multiple_of=128, return_tensors=\"pt\")\n",
+        "out = model_pre(**batch)\n",
         "\n",
         "print(out.logits.shape)       # (B, L, V = 11)\n",
         "\n",
         "# Access MLM logits\n",
         "mlm_logits = out[\"logits\"]\n",
         "\n",
         "Post-trained checkpoints add task-specific heads for functional track prediction and genome annotation.\n",
         "\n",
         "Expected outputs:\n",
         "- `bigwig_tracks_logits`: functional track predictions\n",
         "- `bed_tracks_logits`: genome annotation predictions\n",
     },
     {
       "cell_type": "code",
+      "execution_count": 4,
       "id": "6cc5f2df",
       "metadata": {},
       "outputs": [
           "name": "stdout",
           "output_type": "stream",
           "text": [
+            "Supported species: dict_keys(['<bos>', '<cls>', '<eos>', '<mask>', '<pad>', '<unk>', 'amphiprion_ocellaris', 'arabidopsis_thaliana', 'bison_bison_bison', 'caenorhabditis_elegans', 'canis_lupus_familiaris', 'chinchilla_lanigera', 'ciona_intestinalis', 'danio_rerio', 'drosophila_melanogaster', 'felis_catus', 'gallus_gallus', 'glycine_max', 'gorilla_gorilla', 'gossypium_hirsutum', 'human', 'macaca_nemestrina', 'mouse', 'oryza_sativa', 'rattus_norvegicus', 'salmo_trutta', 'serinus_canaria', 'tetraodon_nigroviridis', 'triticum_aestivum', 'zea_mays'])\n",
+            "bigwig_tracks_logits: (2, 48, 7362)\n",
+            "bed_tracks_logits: (2, 48, 21, 2)\n",
+            "language model logits: (2, 128, 11)\n"
           ]
         }
       ],
       "source": [
+        "# Load model\n",
+        "post_trained_model_name = \"InstaDeepAI/NTv3_100M_post\"\n",
+        "\n",
         "tok_post = AutoTokenizer.from_pretrained(post_trained_model_name, trust_remote_code=True)\n",
         "model_post = AutoModel.from_pretrained(post_trained_model_name, trust_remote_code=True)\n",
         "\n",
         "# Prepare inputs\n",
         "batch = tok_post([\"ATCGNATCG\", \"ACGT\"], add_special_tokens=False, padding=True, pad_to_multiple_of=128, return_tensors=\"pt\")\n",
         "\n",
+        "# Show all supported species\n",
+        "print(\"Supported species:\", model_post.config.species_to_token_id.keys())\n",
+        "# Species tokens\n",
+        "species = ['human', 'mouse']\n",
+        "species_ids = model_post.encode_species(species)\n",
         "\n",
         "# Forward pass\n",
         "out = model_post(\n",
         "    input_ids=batch[\"input_ids\"],\n",
+        "    species_ids=species_ids,\n",
         ")\n",
         "\n",
         "# 7k human tracks over 37.5 % center region of the input sequence\n",
         "# Language model logits for whole sequence over vocabulary\n",
         "print(\"language model logits:\", tuple(out[\"logits\"].shape))\n"
       ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "037076cd",
+      "metadata": {},
+      "outputs": [],
+      "source": []
     }
   ],
   "metadata": {

notebooks_tutorials/01_tracks_prediction.ipynb CHANGED Viewed

@@ -116,7 +116,7 @@
         "# -----------------------------\n",
         "# User inputs\n",
         "# -----------------------------\n",
-        "model_name = \"InstaDeepAI/NTv3_100M_pos\" # options: \"InstaDeepAI/NTv3_100M_pos\" or \"InstaDeepAI/NTv3_650M_pos\"\n",
         "\n",
         "# Example window from a given species (edit these) - length needs to be a multiple of 128 due to the model downsampling\n",
         "species = \"human\"  # used to condition the model on species\n",
@@ -173,22 +173,19 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 6,
       "id": "e09f0469",
       "metadata": {},
       "outputs": [
         {
           "data": {
             "text/plain": [
-              "NTv3Model(\n",
-              "  (core): Core(\n",
-              "    (embed_layer): Embedding(11, 16, padding_idx=1)\n",
               "    (stem): Stem(\n",
               "      (conv): Conv1d(16, 768, kernel_size=(15,), stride=(1,), padding=same)\n",
               "    )\n",
-              "    (cond_tables): ModuleList(\n",
-              "      (0): Embedding(30, 16)\n",
-              "    )\n",
               "    (conv_tower_blocks): ModuleList(\n",
               "      (0-6): 7 x ConditionedConvTowerBlock(\n",
               "        (conv): AdaptiveConvBlock(\n",
@@ -279,6 +276,16 @@
               "        )\n",
               "      )\n",
               "    )\n",
               "    (bigwig_head): MultiSpeciesHead(\n",
               "      (species_heads): ModuleList(\n",
               "        (0-4): 5 x ZeroHead()\n",
@@ -329,13 +336,6 @@
               "      (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
               "      (head): Linear(in_features=768, out_features=42, bias=True)\n",
               "    )\n",
-              "    (conditions_heads): ModuleList(\n",
-              "      (0): Linear(in_features=768, out_features=30, bias=True)\n",
-              "    )\n",
-              "    (lm_head): ModuleDict(\n",
-              "      (hidden_layers): ModuleList()\n",
-              "      (head): Linear(in_features=768, out_features=11, bias=True)\n",
-              "    )\n",
               "  )\n",
               ")"
             ]
@@ -347,24 +347,18 @@
       ],
       "source": [
         "# Load model\n",
-        "cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True)\n",
         "model = AutoModel.from_pretrained(model_name, trust_remote_code=True).to(device)\n",
         "\n",
         "# Load tokenizer\n",
         "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
         "\n",
-        "# Load condition tokenizer\n",
-        "species_tokenizer = AutoTokenizer.from_pretrained(\n",
-        "    model_name, subfolder=\"species_tokenizer\", trust_remote_code=True,\n",
-        ")\n",
-        "\n",
         "# Set model to evaluation mode\n",
         "model.eval()"
       ]
     },
     {
       "cell_type": "code",
-      "execution_count": 7,
       "id": "43154959",
       "metadata": {},
       "outputs": [
@@ -372,15 +366,16 @@
           "name": "stdout",
           "output_type": "stream",
           "text": [
-            "7362 functional tracks for hg38. First 10: ['kai1', 'kai2', 'kai3', 'kai4', 'kai5', 'kai6', 'kai7', 'kai8', 'kai10', 'kai9']\n",
             "Genomic elements predicted: ['protein_coding_gene', 'lncRNA', 'exon', 'intron', 'splice_donor', 'splice_acceptor', 'CTCF-bound', 'polyA_signal', 'enhancer_Tissue_specific', 'enhancer_Tissue_invariant', 'promoter_Tissue_specific', 'promoter_Tissue_invariant', '5UTR+', '5UTR-', '3UTR+', '3UTR-', 'skipped_exon', 'always_on_exon', 'start_codon', 'stop_codon', 'ORF']\n"
           ]
         }
       ],
       "source": [
         "# Inspect output functional tracks\n",
-        "bigwig_names = cfg.bigwigs_per_file_assembly[assembly]\n",
-        "print(f\"{len(bigwig_names)} functional tracks for {assembly}. First 10:\", bigwig_names[:10])\n",
         "\n",
         "# Inspect output genomic elements\n",
         "bed_element_names = cfg.bed_elements_names\n",
@@ -408,7 +403,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 8,
       "id": "6765a9b9",
       "metadata": {},
       "outputs": [
@@ -429,13 +424,12 @@
         "\n",
         "# Condition tokens (e.g., species)\n",
         "species = 'human'\n",
-        "species_ids = species_tokenizer([species] * len(batch['input_ids']), add_special_tokens=False, return_tensors='pt')\n",
         "\n",
         "# Run inference\n",
         "out = model(\n",
         "    input_ids=input_ids,\n",
-        "    species_ids=species_ids['input_ids'],\n",
-        "    return_dict=True\n",
         ")\n",
         "\n",
         "# 7k human tracks over 37.5 % center region of the input sequence\n",
@@ -465,7 +459,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 9,
       "id": "a26e9dcc",
       "metadata": {},
       "outputs": [],
@@ -482,7 +476,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 10,
       "id": "717539e2",
       "metadata": {},
       "outputs": [],
@@ -527,7 +521,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 12,
       "id": "7ba9a397",
       "metadata": {},
       "outputs": [
@@ -577,15 +571,6 @@
         "plot_tracks(all_tracks, prediction_start, prediction_end)\n",
         "plt.show()\n"
       ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "1ce34dc4",
-      "metadata": {},
-      "source": [
-        "# 💡 To improve\n",
-        "- Add gene annotation at top"
-      ]
     }
   ],
   "metadata": {

         "# -----------------------------\n",
         "# User inputs\n",
         "# -----------------------------\n",
+        "model_name = \"InstaDeepAI/NTv3_100M_post\" # options: \"InstaDeepAI/NTv3_100M_post\" or \"InstaDeepAI/NTv3_650M_post\"\n",
         "\n",
         "# Example window from a given species (edit these) - length needs to be a multiple of 128 due to the model downsampling\n",
         "species = \"human\"  # used to condition the model on species\n",
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "id": "e09f0469",
       "metadata": {},
       "outputs": [
         {
           "data": {
             "text/plain": [
+              "NTv3PostTrained(\n",
+              "  (core): NTv3PostTrainedCore(\n",
+              "    (embed_layer): Embedding(11, 16)\n",
               "    (stem): Stem(\n",
               "      (conv): Conv1d(16, 768, kernel_size=(15,), stride=(1,), padding=same)\n",
               "    )\n",
               "    (conv_tower_blocks): ModuleList(\n",
               "      (0-6): 7 x ConditionedConvTowerBlock(\n",
               "        (conv): AdaptiveConvBlock(\n",
               "        )\n",
               "      )\n",
               "    )\n",
+              "    (lm_head): ModuleDict(\n",
+              "      (hidden_layers): ModuleList()\n",
+              "      (head): Linear(in_features=768, out_features=11, bias=True)\n",
+              "    )\n",
+              "    (cond_tables): ModuleList(\n",
+              "      (0): Embedding(30, 16)\n",
+              "    )\n",
+              "    (conditions_heads): ModuleList(\n",
+              "      (0): Linear(in_features=768, out_features=30, bias=True)\n",
+              "    )\n",
               "    (bigwig_head): MultiSpeciesHead(\n",
               "      (species_heads): ModuleList(\n",
               "        (0-4): 5 x ZeroHead()\n",
               "      (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
               "      (head): Linear(in_features=768, out_features=42, bias=True)\n",
               "    )\n",
               "  )\n",
               ")"
             ]
       ],
       "source": [
         "# Load model\n",
         "model = AutoModel.from_pretrained(model_name, trust_remote_code=True).to(device)\n",
         "\n",
         "# Load tokenizer\n",
         "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
         "\n",
         "# Set model to evaluation mode\n",
         "model.eval()"
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": 10,
       "id": "43154959",
       "metadata": {},
       "outputs": [
           "name": "stdout",
           "output_type": "stream",
           "text": [
+            "7362 functional tracks for human. First 10: ['kai1', 'kai2', 'kai3', 'kai4', 'kai5', 'kai6', 'kai7', 'kai8', 'kai10', 'kai9']\n",
             "Genomic elements predicted: ['protein_coding_gene', 'lncRNA', 'exon', 'intron', 'splice_donor', 'splice_acceptor', 'CTCF-bound', 'polyA_signal', 'enhancer_Tissue_specific', 'enhancer_Tissue_invariant', 'promoter_Tissue_specific', 'promoter_Tissue_invariant', '5UTR+', '5UTR-', '3UTR+', '3UTR-', 'skipped_exon', 'always_on_exon', 'start_codon', 'stop_codon', 'ORF']\n"
           ]
         }
       ],
       "source": [
         "# Inspect output functional tracks\n",
+        "cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True)\n",
+        "bigwig_names = cfg.bigwigs_per_species[species]\n",
+        "print(f\"{len(bigwig_names)} functional tracks for {species}. First 10:\", bigwig_names[:10])\n",
         "\n",
         "# Inspect output genomic elements\n",
         "bed_element_names = cfg.bed_elements_names\n",
     },
     {
       "cell_type": "code",
+      "execution_count": 11,
       "id": "6765a9b9",
       "metadata": {},
       "outputs": [
         "\n",
         "# Condition tokens (e.g., species)\n",
         "species = 'human'\n",
+        "species_ids = model.encode_species(species)\n",
         "\n",
         "# Run inference\n",
         "out = model(\n",
         "    input_ids=input_ids,\n",
+        "    species_ids=species_ids,\n",
         ")\n",
         "\n",
         "# 7k human tracks over 37.5 % center region of the input sequence\n",
     },
     {
       "cell_type": "code",
+      "execution_count": 12,
       "id": "a26e9dcc",
       "metadata": {},
       "outputs": [],
     },
     {
       "cell_type": "code",
+      "execution_count": 13,
       "id": "717539e2",
       "metadata": {},
       "outputs": [],
     },
     {
       "cell_type": "code",
+      "execution_count": 14,
       "id": "7ba9a397",
       "metadata": {},
       "outputs": [
         "plot_tracks(all_tracks, prediction_start, prediction_end)\n",
         "plt.show()\n"
       ]
     }
   ],
   "metadata": {

tabs/home.html CHANGED Viewed

@@ -125,16 +125,10 @@ tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 batch = tok(["ATCGNATCG", "ACGT"], add_special_tokens=False, padding=True, pad_to_multiple_of=128, return_tensors="pt")
 # Run model
-out = model(
-  **batch,
-  output_hidden_states=True,
-  output_attentions=True
-)
 # Print output shapes
 print(out.logits.shape)       # (B, L, V = 11)
-print(len(out.hidden_states)) # convs + transformers + deconvs
-print(len(out.attentions))    # equals transformer layers = 12
 </code></pre></div>
     <p>Model embeddings can be used for fine-tuning on downstream tasks.</p>

 batch = tok(["ATCGNATCG", "ACGT"], add_special_tokens=False, padding=True, pad_to_multiple_of=128, return_tensors="pt")
 # Run model
+out = model(**batch)
 # Print output shapes
 print(out.logits.shape)       # (B, L, V = 11)
 </code></pre></div>
     <p>Model embeddings can be used for fine-tuning on downstream tasks.</p>