Spaces:

InstaDeepAI
/

ntv3

Running

App Files Files Community

ybornachot committed on Dec 10, 2025

Commit

6e05130

1 Parent(s): b04b4fa

fix: simplified data download + loading

Browse files

Files changed (1) hide show

notebooks/03_fine_tuning.ipynb +122 -251

notebooks/03_fine_tuning.ipynb CHANGED Viewed

@@ -21,30 +21,23 @@
       "outputs": [],
       "source": [
         "# Install useful dependencies\n",
-        "# !pip install -r requirements.txt"
       ]
     },
     {
       "cell_type": "code",
-      "execution_count": 1,
       "metadata": {},
-      "outputs": [
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "/home/y-bornachot/venvs/ntv3-env/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-            "  from .autonotebook import tqdm as notebook_tqdm\n"
-          ]
-        }
-      ],
       "source": [
         "# 0. Imports\n",
         "import random\n",
         "import functools\n",
         "from typing import List, Dict, Optional, Callable\n",
-        "import pyBigWig\n",
-        "from pyfaidx import Fasta\n",
         "\n",
         "import torch\n",
         "import torch.nn as nn\n",
@@ -54,6 +47,8 @@
         "from torch.optim.lr_scheduler import LambdaLR\n",
         "from transformers import AutoConfig, AutoModelForMaskedLM, AutoTokenizer\n",
         "import numpy as np\n",
         "from torchmetrics import PearsonCorrCoef"
       ]
     },
@@ -66,7 +61,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 2,
       "metadata": {},
       "outputs": [
         {
@@ -81,11 +76,12 @@
         "config = {\n",
         "    # Model\n",
         "    \"model_name\": \"InstaDeepAI/ntv3_8M_7downsample_pretrained_le_1mb\",  # NTv3 model\n",
-        "    \"pretrained\": True,\n",
         "    \n",
         "    # Data\n",
         "    \"sequence_length\": 1_024,\n",
-        "    \"bigwig_file_ids\": [\"ENCFF884LDL\"],  # Example track names\n",
         "    \"keep_target_center_fraction\": 0.375,\n",
         "    \n",
         "    # Training\n",
@@ -115,10 +111,34 @@
         "    \"num_workers\": 0,  # Number of worker processes for DataLoader\n",
         "}\n",
         "\n",
         "# Set random seed\n",
         "torch.manual_seed(config[\"seed\"])\n",
         "np.random.seed(config[\"seed\"])\n",
         "\n",
         "device = torch.device(config[\"device\"])\n",
         "print(f\"Using device: {device}\")"
       ]
@@ -132,73 +152,82 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "!wget -c https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.fna.gz \\\n",
-        "&& gunzip -f GCF_000001405.40_GRCh38.p14_genomic.fna.gz"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
-        "!wget -O ENCFF884LDL \"$(curl -s https://www.encodeproject.org/files/ENCFF884LDL/@@download/ENCFF884LDL | sed -n 's/.*href=\\\"\\([^\\\"]*ENCFF884LDL[^\\\"]*\\)\\\".*/\\1/p')\" \\\n",
-        "&& echo \"Downloaded ENCFF884LDL\""
       ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
-        "!wget -c https://www.encodeproject.org/files/ENCFF884LDL/@@download/ENCFF884LDL.bigWig"
       ]
     },
     {
       "cell_type": "code",
-      "execution_count": 3,
       "metadata": {},
       "outputs": [],
       "source": [
-        "chrom_mapping = {\n",
-        "    \"chr1\":  \"NC_000001.11\",\n",
-        "    \"chr2\":  \"NC_000002.12\",\n",
-        "    \"chr3\":  \"NC_000003.12\",\n",
-        "    \"chr4\":  \"NC_000004.12\",\n",
-        "    \"chr5\":  \"NC_000005.10\",\n",
-        "    \"chr6\":  \"NC_000006.12\",\n",
-        "    \"chr7\":  \"NC_000007.14\",\n",
-        "    \"chr8\":  \"NC_000008.11\",\n",
-        "    \"chr9\":  \"NC_000009.12\",\n",
-        "    \"chr10\": \"NC_000010.11\",\n",
-        "    \"chr11\": \"NC_000011.10\",\n",
-        "    \"chr12\": \"NC_000012.12\",\n",
-        "    \"chr13\": \"NC_000013.11\",\n",
-        "    \"chr14\": \"NC_000014.9\",\n",
-        "    \"chr15\": \"NC_000015.10\",\n",
-        "    \"chr16\": \"NC_000016.10\",\n",
-        "    \"chr17\": \"NC_000017.11\",\n",
-        "    \"chr18\": \"NC_000018.10\",\n",
-        "    \"chr19\": \"NC_000019.10\",\n",
-        "    \"chr20\": \"NC_000020.11\",\n",
-        "    \"chr21\": \"NC_000021.9\",\n",
-        "    \"chr22\": \"NC_000022.11\",\n",
-        "    \"chrX\":  \"NC_000023.11\",\n",
-        "    \"chrY\":  \"NC_000024.10\",\n",
-        "    # mitochondrial\n",
-        "    \"chrM\":  \"NC_012920.1\",\n",
-        "    \"chrMT\": \"NC_012920.1\",\n",
-        "}\n",
-        "\n",
         "chrom_splits = {\n",
-        "    \"train\": [f\"chr{i}\" for i in range(1, 19)],\n",
-        "    \"val\": [f\"chr{i}\" for i in range(19, 21)],\n",
-        "    \"test\": [f\"chr{i}\" for i in range(21, 23)],\n",
         "}"
       ]
     },
@@ -211,7 +240,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 4,
       "metadata": {},
       "outputs": [],
       "source": [
@@ -237,24 +266,16 @@
         "        model_name: str,\n",
         "        bigwig_track_names: List[str],\n",
         "        keep_target_center_fraction: float = 0.375,\n",
-        "        pretrained: bool = True,\n",
         "    ):\n",
         "        super().__init__()\n",
         "        \n",
         "        # Load config and model\n",
         "        self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)\n",
-        "\n",
-        "        if pretrained:\n",
-        "            self.backbone = AutoModelForMaskedLM.from_pretrained(\n",
-        "                model_name, \n",
-        "                trust_remote_code=True,\n",
-        "                config=self.config\n",
-        "            )\n",
-        "        else:\n",
-        "            self.backbone = AutoModelForMaskedLM.from_config(\n",
-        "                self.config, \n",
-        "                trust_remote_code=True\n",
-        "            )\n",
         "        \n",
         "        self.keep_target_center_fraction = keep_target_center_fraction\n",
         "\n",
@@ -287,7 +308,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 5,
       "metadata": {},
       "outputs": [
         {
@@ -314,7 +335,6 @@
         "    model_name=config[\"model_name\"],\n",
         "    bigwig_track_names=config[\"bigwig_file_ids\"],\n",
         "    keep_target_center_fraction=config[\"keep_target_center_fraction\"],\n",
-        "    pretrained=config[\"pretrained\"],\n",
         ")\n",
         "model = model.to(device)\n",
         "model.train()\n",
@@ -333,7 +353,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 34,
       "metadata": {},
       "outputs": [],
       "source": [
@@ -377,7 +397,6 @@
         "        sequence_length: int,\n",
         "        num_samples: int,\n",
         "        tokenizer: AutoTokenizer,\n",
-        "        chrom_mapping: Optional[Dict[str, str]] = None,\n",
         "        keep_target_center_fraction: float = 1.0,\n",
         "        num_tracks: int = 1,\n",
         "    ):\n",
@@ -393,9 +412,7 @@
         "        self.tokenizer = tokenizer\n",
         "        self.keep_target_center_fraction = keep_target_center_fraction\n",
         "        self.num_tracks = num_tracks\n",
-        "\n",
         "        self.chroms = chroms\n",
-        "        self.chrom_mapping = chrom_mapping or {c: c for c in chroms}\n",
         "\n",
         "        # Intersect lengths between FASTA and bigWig for safety\n",
         "        bw_chrom_lengths = self.bw_list[0].chroms()  # dict: chrom -> length\n",
@@ -404,13 +421,10 @@
         "        self.chrom_lengths = {}\n",
         "\n",
         "        for c in chroms:\n",
-        "            if c not in bw_chrom_lengths:\n",
-        "                continue\n",
-        "            fa_name = self.chrom_mapping.get(c, c)\n",
-        "            if fa_name not in self.fasta:\n",
         "                continue\n",
         "\n",
-        "            fa_len = len(self.fasta[fa_name])\n",
         "            bw_len = bw_chrom_lengths[c]\n",
         "            L = min(fa_len, bw_len)\n",
         "\n",
@@ -433,11 +447,8 @@
         "        start = random.randint(0, max_start)\n",
         "        end = start + self.sequence_length\n",
         "\n",
-        "        # FASTA chromosome name may differ\n",
-        "        fa_chrom = self.chrom_mapping.get(chrom, chrom)\n",
-        "\n",
         "        # Sequence\n",
-        "        seq = self.fasta[fa_chrom][start:end]  # string slice\n",
         "        tokens = self.tokenizer(\n",
         "            seq,\n",
         "            return_tensors=\"pt\",  # Returns a dict of PyTorch tensors\n",
@@ -475,7 +486,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 35,
       "metadata": {},
       "outputs": [
         {
@@ -489,16 +500,12 @@
         }
       ],
       "source": [
-        "fasta_path = \"./GCF_000001405.40_GRCh38.p14_genomic.fna\"\n",
-        "bigwig_path_list = [\"./ENCFF884LDL.bigWig\"]\n",
-        "\n",
         "create_dataset_fn = functools.partial(\n",
         "    GenomeBigWigDataset,\n",
         "    fasta_path=fasta_path,\n",
         "    bigwig_path_list=bigwig_path_list,\n",
         "    sequence_length=config[\"sequence_length\"],\n",
         "    tokenizer=tokenizer,\n",
-        "    chrom_mapping=chrom_mapping,\n",
         "    keep_target_center_fraction=config[\"keep_target_center_fraction\"],\n",
         "    num_tracks=len(config[\"bigwig_file_ids\"]),\n",
         ")\n",
@@ -554,7 +561,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 36,
       "metadata": {},
       "outputs": [],
       "source": [
@@ -582,29 +589,9 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 37,
       "metadata": {},
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Gradient accumulation steps: 2\n",
-            "Effective batch size: 4\n",
-            "Effective tokens per update: 4096\n",
-            "\n",
-            "Training constants:\n",
-            "  Total training steps: 32\n",
-            "  Log training metrics every: 2 steps\n",
-            "  Run validation every: 4 steps\n",
-            "  Warmup steps: 3\n",
-            "\n",
-            "Optimizer setup:\n",
-            "  Initial LR: 1e-05\n",
-            "  Peak LR: 5e-05\n"
-          ]
-        }
-      ],
       "source": [
         "# Calculate gradient accumulation steps and effective batch size\n",
         "num_devices = 1  # Single device for now\n",
@@ -676,7 +663,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 38,
       "metadata": {},
       "outputs": [],
       "source": [
@@ -766,7 +753,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 39,
       "metadata": {},
       "outputs": [],
       "source": [
@@ -784,17 +771,9 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 40,
       "metadata": {},
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Scaling functions created\n"
-          ]
-        }
-      ],
       "source": [
         "def get_track_means(bigwig_file_ids: List[str]) -> np.ndarray:\n",
         "    \"\"\"\n",
@@ -920,7 +899,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 41,
       "metadata": {},
       "outputs": [],
       "source": [
@@ -996,7 +975,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 42,
       "metadata": {},
       "outputs": [],
       "source": [
@@ -1082,77 +1061,9 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 43,
       "metadata": {},
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Starting training...\n",
-            "Training for 32 steps with 2 gradient accumulation steps\n",
-            "\n",
-            "Step 1/32 | Loss: 0.7569 | Mean Pearson: -0.1473 | LR: 1.17e-09 | Tokens: 4,096\n",
-            "\n",
-            "Running validation at step 0...\n",
-            "  Validation Loss: 1.0152\n",
-            "  Validation Mean Pearson: -0.0414\n",
-            "    ENCFF884LDL/pearson: -0.0414\n",
-            "Step 3/32 | Loss: 0.3793 | Mean Pearson: -0.0229 | LR: 2.50e-09 | Tokens: 12,288\n",
-            "Step 5/32 | Loss: 0.4111 | Mean Pearson: -0.1739 | LR: 2.41e-09 | Tokens: 20,480\n",
-            "\n",
-            "Running validation at step 4...\n",
-            "  Validation Loss: 0.4801\n",
-            "  Validation Mean Pearson: 0.0120\n",
-            "    ENCFF884LDL/pearson: 0.0120\n",
-            "Step 7/32 | Loss: 0.3404 | Mean Pearson: -0.0191 | LR: 2.32e-09 | Tokens: 28,672\n",
-            "Step 9/32 | Loss: 0.3950 | Mean Pearson: 0.0090 | LR: 2.23e-09 | Tokens: 36,864\n",
-            "\n",
-            "Running validation at step 8...\n",
-            "  Validation Loss: 0.5865\n",
-            "  Validation Mean Pearson: -0.0260\n",
-            "    ENCFF884LDL/pearson: -0.0260\n",
-            "Step 11/32 | Loss: 0.3750 | Mean Pearson: 0.0121 | LR: 2.13e-09 | Tokens: 45,056\n",
-            "Step 13/32 | Loss: 0.4380 | Mean Pearson: -0.0126 | LR: 2.02e-09 | Tokens: 53,248\n",
-            "\n",
-            "Running validation at step 12...\n",
-            "  Validation Loss: 0.3997\n",
-            "  Validation Mean Pearson: 0.0093\n",
-            "    ENCFF884LDL/pearson: 0.0093\n",
-            "Step 15/32 | Loss: 0.3469 | Mean Pearson: -0.0279 | LR: 1.91e-09 | Tokens: 61,440\n",
-            "Step 17/32 | Loss: 0.5098 | Mean Pearson: -0.2044 | LR: 1.80e-09 | Tokens: 69,632\n",
-            "\n",
-            "Running validation at step 16...\n",
-            "  Validation Loss: 0.3752\n",
-            "  Validation Mean Pearson: -0.0178\n",
-            "    ENCFF884LDL/pearson: -0.0178\n",
-            "Step 19/32 | Loss: 0.4899 | Mean Pearson: -0.0424 | LR: 1.67e-09 | Tokens: 77,824\n",
-            "Step 21/32 | Loss: 0.3889 | Mean Pearson: -0.0332 | LR: 1.54e-09 | Tokens: 86,016\n",
-            "\n",
-            "Running validation at step 20...\n",
-            "  Validation Loss: 0.4217\n",
-            "  Validation Mean Pearson: -0.0205\n",
-            "    ENCFF884LDL/pearson: -0.0205\n",
-            "Step 23/32 | Loss: 0.3392 | Mean Pearson: 0.0235 | LR: 1.39e-09 | Tokens: 94,208\n",
-            "Step 25/32 | Loss: 0.4165 | Mean Pearson: 0.0033 | LR: 1.23e-09 | Tokens: 102,400\n",
-            "\n",
-            "Running validation at step 24...\n",
-            "  Validation Loss: 0.4363\n",
-            "  Validation Mean Pearson: -0.0379\n",
-            "    ENCFF884LDL/pearson: -0.0379\n",
-            "Step 27/32 | Loss: 0.7630 | Mean Pearson: 0.0683 | LR: 1.04e-09 | Tokens: 110,592\n",
-            "Step 29/32 | Loss: 0.7357 | Mean Pearson: 0.0050 | LR: 8.04e-10 | Tokens: 118,784\n",
-            "\n",
-            "Running validation at step 28...\n",
-            "  Validation Loss: 0.6629\n",
-            "  Validation Mean Pearson: -0.0370\n",
-            "    ENCFF884LDL/pearson: -0.0370\n",
-            "Step 31/32 | Loss: 0.3690 | Mean Pearson: -0.0808 | LR: 4.64e-10 | Tokens: 126,976\n",
-            "\n",
-            "Training completed after 32 steps!\n"
-          ]
-        }
-      ],
       "source": [
         "# Training loop (step-based with gradient accumulation)\n",
         "print(\"Starting training...\")\n",
@@ -1263,7 +1174,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 44,
       "metadata": {},
       "outputs": [],
       "source": [
@@ -1307,47 +1218,7 @@
       "cell_type": "code",
       "execution_count": null,
       "metadata": {},
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "==================================================\n",
-            "Test Set Evaluation\n",
-            "==================================================\n",
-            "Running test evaluation with 5 steps (10 samples)\n"
-          ]
-        },
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "/home/y-bornachot/venvs/ntv3-env/lib/python3.12/site-packages/torch/amp/autocast_mode.py:287: UserWarning: In CPU autocast, but the target dtype is not supported. Disabling autocast.\n",
-            "CPU Autocast only supports dtype of torch.bfloat16, torch.float16 currently.\n",
-            "  warnings.warn(error_message)\n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "\n",
-            "==================================================\n",
-            "Test Set Results\n",
-            "==================================================\n",
-            "\n",
-            "Scaled Metrics (scaled predictions vs scaled targets):\n",
-            "  Mean Pearson (scaled): -0.0362\n",
-            "    metrics_scaled/ENCFF884LDL/pearson: -0.0362\n",
-            "\n",
-            "Raw Metrics (raw predictions vs raw targets):\n",
-            "  Mean Pearson (raw): -0.0362\n",
-            "    metrics_raw/ENCFF884LDL/pearson: -0.0362\n",
-            "==================================================\n"
-          ]
-        }
-      ],
       "source": [
         "print(\"\\n\" + \"=\"*50)\n",
         "print(\"Test Set Evaluation\")\n",

       "outputs": [],
       "source": [
         "# Install useful dependencies\n",
+        "# !pip install pyBigWig\n",
+        "# !pip install pyfaidx\n",
+        "# !pip install torchmetrics"
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": 5,
       "metadata": {},
+      "outputs": [],
       "source": [
         "# 0. Imports\n",
         "import random\n",
         "import functools\n",
         "from typing import List, Dict, Optional, Callable\n",
+        "import os\n",
+        "import subprocess\n",
         "\n",
         "import torch\n",
         "import torch.nn as nn\n",
         "from torch.optim.lr_scheduler import LambdaLR\n",
         "from transformers import AutoConfig, AutoModelForMaskedLM, AutoTokenizer\n",
         "import numpy as np\n",
+        "import pyBigWig\n",
+        "from pyfaidx import Fasta\n",
         "from torchmetrics import PearsonCorrCoef"
       ]
     },
     },
     {
       "cell_type": "code",
+      "execution_count": 6,
       "metadata": {},
       "outputs": [
         {
         "config = {\n",
         "    # Model\n",
         "    \"model_name\": \"InstaDeepAI/ntv3_8M_7downsample_pretrained_le_1mb\",  # NTv3 model\n",
         "    \n",
         "    # Data\n",
+        "    \"data_cache_dir\": \"./data\",\n",
+        "    \"fasta_url\": \"https://hgdownload.gi.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz\",\n",
+        "    \"bigwig_url_list\": [\"https://www.encodeproject.org/files/ENCFF884LDL/@@download/ENCFF884LDL.bigWig\"],\n",
         "    \"sequence_length\": 1_024,\n",
         "    \"keep_target_center_fraction\": 0.375,\n",
         "    \n",
         "    # Training\n",
         "    \"num_workers\": 0,  # Number of worker processes for DataLoader\n",
         "}\n",
         "\n",
+        "os.makedirs(config[\"data_cache_dir\"], exist_ok=True)\n",
+        "\n",
+        "# Extract filenames from URLs\n",
+        "def extract_filename_from_url(url: str) -> str:\n",
+        "    \"\"\"Return the file name a URL points at.\n",
+        "\n",
+        "    Args:\n",
+        "        url: Download URL; any query string or fragment is ignored.\n",
+        "\n",
+        "    Returns:\n",
+        "        The last '/'-separated component of the URL path.\n",
+        "    \"\"\"\n",
+        "    # Drop '?query' and '#fragment' so they never end up in a local file name\n",
+        "    url_clean = url.split('?', 1)[0].split('#', 1)[0]\n",
+        "    # Get the last part of the URL path\n",
+        "    return url_clean.split('/')[-1]\n",
+        "\n",
+        "# Create paths for downloaded files\n",
+        "# The FASTA archive is gunzipped after download, so only a trailing '.gz' is removed\n",
+        "# (str.replace would also delete a '.gz' occurring in the middle of the name).\n",
+        "fasta_filename = extract_filename_from_url(config[\"fasta_url\"])\n",
+        "if fasta_filename.endswith('.gz'):\n",
+        "    fasta_filename = fasta_filename[:-len('.gz')]\n",
+        "fasta_path = os.path.join(config[\"data_cache_dir\"], fasta_filename)\n",
+        "\n",
+        "# Resolve each bigWig file name once and reuse it for both paths and track ids\n",
+        "bigwig_filenames = [extract_filename_from_url(url) for url in config[\"bigwig_url_list\"]]\n",
+        "bigwig_path_list = [\n",
+        "    os.path.join(config[\"data_cache_dir\"], filename)\n",
+        "    for filename in bigwig_filenames\n",
+        "]\n",
+        "\n",
+        "# Create bigwig_file_ids from filenames (without extension)\n",
+        "config[\"bigwig_file_ids\"] = [\n",
+        "    os.path.splitext(filename)[0]\n",
+        "    for filename in bigwig_filenames\n",
+        "]\n",
+        "\n",
         "# Set random seed\n",
         "torch.manual_seed(config[\"seed\"])\n",
         "np.random.seed(config[\"seed\"])\n",
         "\n",
+        "# Set device\n",
         "device = torch.device(config[\"device\"])\n",
         "print(f\"Using device: {device}\")"
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": 3,
       "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "--2025-12-10 14:47:06--  https://hgdownload.gi.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz\n",
+            "Resolving hgdownload.gi.ucsc.edu (hgdownload.gi.ucsc.edu)... 128.114.119.163\n",
+            "Connecting to hgdownload.gi.ucsc.edu (hgdownload.gi.ucsc.edu)|128.114.119.163|:443... connected.\n",
+            "HTTP request sent, awaiting response... 200 OK\n",
+            "Length: 983659424 (938M) [application/x-gzip]\n",
+            "Saving to: './data/hg38.fa.gz'\n",
+            "\n",
+            "hg38.fa.gz          100%[===================>] 938.09M  10.4MB/s    in 1m 43s  \n",
+            "\n",
+            "2025-12-10 14:48:50 (9.09 MB/s) - './data/hg38.fa.gz' saved [983659424/983659424]\n",
+            "\n"
+          ]
+        }
+      ],
       "source": [
+        "# Download fasta file\n",
+        "# The archive name is the last '/'-separated part of the URL; URLs never use os.path.sep.\n",
+        "!wget -c {config[\"fasta_url\"]} -P {config[\"data_cache_dir\"]}/ && gunzip -f {config[\"data_cache_dir\"]}/{config[\"fasta_url\"].split('/')[-1]}"
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": 7,
       "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Downloading ENCFF884LDL.bigWig...\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "--2025-12-10 14:54:41--  https://www.encodeproject.org/files/ENCFF884LDL/@@download/ENCFF884LDL.bigWig\n",
+            "Resolving www.encodeproject.org (www.encodeproject.org)... 34.211.244.144\n",
+            "Connecting to www.encodeproject.org (www.encodeproject.org)|34.211.244.144|:443... connected.\n",
+            "HTTP request sent, awaiting response... 307 Temporary Redirect\n",
+            "Location: https://encode-public.s3.amazonaws.com/2020/09/19/425880b6-b323-4ee2-95ce-56bdd088d126/ENCFF884LDL.bigWig?response-content-disposition=attachment%3B%20filename%3DENCFF884LDL.bigWig&AWSAccessKeyId=ASIATGZNGCNXU6SGJVOL&Signature=4o0Pp2RvJtnZc9z7HOuCU1k9wwI%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEA0aCXVzLXdlc3QtMiJGMEQCIEdyOOxtHk6rJT06xIjzZR3nVyqbPB1twIFxCDtIQfNXAiAph1lc69CfHzPPglodVnVh9QCjlsXHFyUEU3K0%2Bx%2F%2Bziq8BQjW%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAAaDDIyMDc0ODcxNDg2MyIMYwkeEaXuk%2BE48EDAKpAFkm4uzCSB40oRz3YT4m%2FZfBSH7XIuSCuzS7nrL5tXb9Q2rfPQSD4PHOyTR0LOOfcr98%2FyF8cJw4NE%2Fwsw8BRs4xPFEEyN6yGqwHmAyxBuwdca4GLSMGRDaSPoleMJw1FcSv96ofbZFYTTSol4b6%2FZj4jJjCa887%2F6S5x9kNIjTAtgX%2Fr3Ci4wi4FXGKTijTU%2FnbuuLZ3Cz2UobD6p732apsayl7avmUdWbUvROl3sHFOWOGCKsmDv0mavyEu2EsHxniBPfECy00BNvf%2Bj2FDaz1BImMIDavVBSwcWk8uCPjbsccsgiuKAfwr3dOXQ7R6y4NwmuFluBqn1GOXw1K13T4LrF%2BrhmqdOWeIVKB%2Bo9vnfQm1Dws6EoyS%2BG0bWDnyuUnLtWGf4cZPA6kjcM14fspFxoMnLjHBfdpYKZ3VmikbgwE8mDaiHODH1WQ36lUPigKbbIeHqOnHTIEw5h6F8D0MfIdVBSV2HCXweIlxCr6%2FV8hy2RzDouzT%2FIH%2FIobhHjGPM%2FlmkLAcfEzS2fioCJwkqQ3F%2BC77alAhtDQ4Oy5OIxRnRHVLpO%2BMA9Ml0SrEegCGPIzLucuCtbj2UTEOnBRQXyMolyySopJZb4p4BpJ6MiitLyCt1C66lvJpX5oMri%2BVD7FcTgdPYxcqM%2FMLD%2B4XqTYh5wdK7EYe3CpsVjpviZSVbn7yVHAb8WqdmFO%2BXRGhjQdN6rMrwGPiMCmQq12tTQftfmEwPGN1CVHG%2BbL1KUpEF4BRE61xDwEu7ZXyycPqTJMKHVn%2BXZ%2BxFsaxpUsp25U6JIVVPiNgt1OyhfjU6oqzwzeXH7KMRIcqz2d%2B3p%2BIbjRvoHcLc8AzgY4RvgWMGlb5gIpv15HQTDvdiLLwwjd3lyQY6sgE9t%2Bhi2Jv1DPgJN0YUGblcTV3Ey95h%2BBIXo6zWGwqhyZhkH%2ByxJKXouv2S1mKS3BM0dp2maJGDp69Mze8UkGjFYvdzxHT1zrCZ4dMRRkRObY3%2F4ZP33ogelhzchd7S76et35vYwYHd9DYycWZnJ%2FIcfpSZURGMJu3gLM3YhIscykGwQKqB21Tmyjufi0AaYyLk4w2OKc31kgjFvs6lNaHhqTuFButuHEiBUMzieixOI%2BX6&Expires=1765504482 [following]\n",
+            "--2025-12-10 14:54:42--  https://encode-public.s3.amazonaws.com/2020/09/19/425880b6-b323-4ee2-95ce-56bdd088d126/ENCFF884LDL.bigWig?response-content-disposition=attachment%3B%20filename%3DENCFF884LDL.bigWig&AWSAccessKeyId=ASIATGZNGCNXU6SGJVOL&Signature=4o0Pp2RvJtnZc9z7HOuCU1k9wwI%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEA0aCXVzLXdlc3QtMiJGMEQCIEdyOOxtHk6rJT06xIjzZR3nVyqbPB1twIFxCDtIQfNXAiAph1lc69CfHzPPglodVnVh9QCjlsXHFyUEU3K0%2Bx%2F%2Bziq8BQjW%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAAaDDIyMDc0ODcxNDg2MyIMYwkeEaXuk%2BE48EDAKpAFkm4uzCSB40oRz3YT4m%2FZfBSH7XIuSCuzS7nrL5tXb9Q2rfPQSD4PHOyTR0LOOfcr98%2FyF8cJw4NE%2Fwsw8BRs4xPFEEyN6yGqwHmAyxBuwdca4GLSMGRDaSPoleMJw1FcSv96ofbZFYTTSol4b6%2FZj4jJjCa887%2F6S5x9kNIjTAtgX%2Fr3Ci4wi4FXGKTijTU%2FnbuuLZ3Cz2UobD6p732apsayl7avmUdWbUvROl3sHFOWOGCKsmDv0mavyEu2EsHxniBPfECy00BNvf%2Bj2FDaz1BImMIDavVBSwcWk8uCPjbsccsgiuKAfwr3dOXQ7R6y4NwmuFluBqn1GOXw1K13T4LrF%2BrhmqdOWeIVKB%2Bo9vnfQm1Dws6EoyS%2BG0bWDnyuUnLtWGf4cZPA6kjcM14fspFxoMnLjHBfdpYKZ3VmikbgwE8mDaiHODH1WQ36lUPigKbbIeHqOnHTIEw5h6F8D0MfIdVBSV2HCXweIlxCr6%2FV8hy2RzDouzT%2FIH%2FIobhHjGPM%2FlmkLAcfEzS2fioCJwkqQ3F%2BC77alAhtDQ4Oy5OIxRnRHVLpO%2BMA9Ml0SrEegCGPIzLucuCtbj2UTEOnBRQXyMolyySopJZb4p4BpJ6MiitLyCt1C66lvJpX5oMri%2BVD7FcTgdPYxcqM%2FMLD%2B4XqTYh5wdK7EYe3CpsVjpviZSVbn7yVHAb8WqdmFO%2BXRGhjQdN6rMrwGPiMCmQq12tTQftfmEwPGN1CVHG%2BbL1KUpEF4BRE61xDwEu7ZXyycPqTJMKHVn%2BXZ%2BxFsaxpUsp25U6JIVVPiNgt1OyhfjU6oqzwzeXH7KMRIcqz2d%2B3p%2BIbjRvoHcLc8AzgY4RvgWMGlb5gIpv15HQTDvdiLLwwjd3lyQY6sgE9t%2Bhi2Jv1DPgJN0YUGblcTV3Ey95h%2BBIXo6zWGwqhyZhkH%2ByxJKXouv2S1mKS3BM0dp2maJGDp69Mze8UkGjFYvdzxHT1zrCZ4dMRRkRObY3%2F4ZP33ogelhzchd7S76et35vYwYHd9DYycWZnJ%2FIcfpSZURGMJu3gLM3YhIscykGwQKqB21Tmyjufi0AaYyLk4w2OKc31kgjFvs6lNaHhqTuFButuHEiBUMzieixOI%2BX6&Expires=1765504482\n",
+            "Resolving encode-public.s3.amazonaws.com (encode-public.s3.amazonaws.com)... 52.92.248.169, 52.92.211.49, 3.5.80.18, ...\n",
+            "Connecting to encode-public.s3.amazonaws.com (encode-public.s3.amazonaws.com)|52.92.248.169|:443... connected.\n",
+            "HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable\n",
+            "\n",
+            "    The file is already fully retrieved; nothing to do.\n",
+            "\n"
+          ]
+        }
+      ],
       "source": [
+        "# Download bigwig files\n",
+        "# Write each track to the matching entry of bigwig_path_list, i.e. the exact paths\n",
+        "# the dataset is later built from, instead of recomputing the destination here.\n",
+        "for bigwig_url, filepath in zip(config[\"bigwig_url_list\"], bigwig_path_list):\n",
+        "    filename = os.path.basename(filepath)\n",
+        "    print(f\"Downloading {filename}...\")\n",
+        "    # -c lets wget resume a partial file and skip one that is already complete\n",
+        "    subprocess.run([\"wget\", \"-c\", bigwig_url, \"-O\", filepath], check=True)"
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": 8,
       "metadata": {},
       "outputs": [],
       "source": [
         "chrom_splits = {\n",
+        "    \"train\": [f\"chr{i}\" for i in range(1, 21)] + ['chrX', 'chrY'],\n",
+        "    \"val\": ['chr22'],\n",
+        "    \"test\": ['chr21']\n",
         "}"
       ]
     },
     },
     {
       "cell_type": "code",
+      "execution_count": 11,
       "metadata": {},
       "outputs": [],
       "source": [
         "        model_name: str,\n",
         "        bigwig_track_names: List[str],\n",
         "        keep_target_center_fraction: float = 0.375,\n",
         "    ):\n",
         "        super().__init__()\n",
         "        \n",
         "        # Load config and model\n",
         "        self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)\n",
+        "        self.backbone = AutoModelForMaskedLM.from_pretrained(\n",
+        "            model_name, \n",
+        "            trust_remote_code=True,\n",
+        "            config=self.config\n",
+        "        )\n",
         "        \n",
         "        self.keep_target_center_fraction = keep_target_center_fraction\n",
         "\n",
     },
     {
       "cell_type": "code",
+      "execution_count": 12,
       "metadata": {},
       "outputs": [
         {
         "    model_name=config[\"model_name\"],\n",
         "    bigwig_track_names=config[\"bigwig_file_ids\"],\n",
         "    keep_target_center_fraction=config[\"keep_target_center_fraction\"],\n",
         ")\n",
         "model = model.to(device)\n",
         "model.train()\n",
     },
     {
       "cell_type": "code",
+      "execution_count": 17,
       "metadata": {},
       "outputs": [],
       "source": [
         "        sequence_length: int,\n",
         "        num_samples: int,\n",
         "        tokenizer: AutoTokenizer,\n",
         "        keep_target_center_fraction: float = 1.0,\n",
         "        num_tracks: int = 1,\n",
         "    ):\n",
         "        self.tokenizer = tokenizer\n",
         "        self.keep_target_center_fraction = keep_target_center_fraction\n",
         "        self.num_tracks = num_tracks\n",
         "        self.chroms = chroms\n",
         "\n",
         "        # Intersect lengths between FASTA and bigWig for safety\n",
         "        bw_chrom_lengths = self.bw_list[0].chroms()  # dict: chrom -> length\n",
         "        self.chrom_lengths = {}\n",
         "\n",
         "        for c in chroms:\n",
+        "            if c not in bw_chrom_lengths or c not in self.fasta:\n",
         "                continue\n",
         "\n",
+        "            fa_len = len(self.fasta[c])\n",
         "            bw_len = bw_chrom_lengths[c]\n",
         "            L = min(fa_len, bw_len)\n",
         "\n",
         "        start = random.randint(0, max_start)\n",
         "        end = start + self.sequence_length\n",
         "\n",
         "        # Sequence\n",
+        "        seq = self.fasta[chrom][start:end]  # string slice\n",
         "        tokens = self.tokenizer(\n",
         "            seq,\n",
         "            return_tensors=\"pt\",  # Returns a dict of PyTorch tensors\n",
     },
     {
       "cell_type": "code",
+      "execution_count": 18,
       "metadata": {},
       "outputs": [
         {
         }
       ],
       "source": [
         "create_dataset_fn = functools.partial(\n",
         "    GenomeBigWigDataset,\n",
         "    fasta_path=fasta_path,\n",
         "    bigwig_path_list=bigwig_path_list,\n",
         "    sequence_length=config[\"sequence_length\"],\n",
         "    tokenizer=tokenizer,\n",
         "    keep_target_center_fraction=config[\"keep_target_center_fraction\"],\n",
         "    num_tracks=len(config[\"bigwig_file_ids\"]),\n",
         ")\n",
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
       "outputs": [],
       "source": [
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
         "# Calculate gradient accumulation steps and effective batch size\n",
         "num_devices = 1  # Single device for now\n",
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
       "outputs": [],
       "source": [
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
       "outputs": [],
       "source": [
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
         "def get_track_means(bigwig_file_ids: List[str]) -> np.ndarray:\n",
         "    \"\"\"\n",
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
       "outputs": [],
       "source": [
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
       "outputs": [],
       "source": [
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
         "# Training loop (step-based with gradient accumulation)\n",
         "print(\"Starting training...\")\n",
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
       "outputs": [],
       "source": [
       "cell_type": "code",
       "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
         "print(\"\\n\" + \"=\"*50)\n",
         "print(\"Test Set Evaluation\")\n",