Commit 2b05bdb
Parent(s): 6db01cc
feat: made compat with HF dataset + refactor
notebooks/03_fine_tuning.ipynb CHANGED (+212 -339)
@@ -52,7 +52,7 @@
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -60,8 +60,9 @@
 "import functools\n",
 "from typing import List, Dict, Callable\n",
 "import os\n",
-"import
-"from
+"import fnmatch\n",
+"from pathlib import Path\n",
+"from huggingface_hub import HfApi, snapshot_download\n",
 "\n",
 "import torch\n",
 "import torch.nn as nn\n",
@@ -69,6 +70,8 @@
 "from torch.utils.data import Dataset, DataLoader\n",
 "from torch.optim import AdamW\n",
 "from transformers import AutoConfig, AutoModelForMaskedLM, AutoTokenizer\n",
+"import pandas as pd\n",
+"import matplotlib.pyplot as plt\n",
 "import numpy as np\n",
 "import pyBigWig\n",
 "from pyfaidx import Fasta\n",
@@ -90,10 +93,9 @@
 "- **`model_name`**: HuggingFace model name/identifier for the pretrained backbone model\n",
 "\n",
 "### Data\n",
+"- **`hf_repo_id`**: HuggingFace dataset repository ID containing the benchmark data\n",
+"- **`species`**: Species name (e.g., \"human\") to select data from the benchmark dataset\n",
 "- **`data_cache_dir`**: Directory where downloaded data files (FASTA, bigWig) will be stored\n",
-"- **`fasta_url`**: URL to download reference genome FASTA file\n",
-"- **`bigwig_url_list`**: List of URLs for bigWig track files to download\n",
-"- **`bigwig_file_ids`**: List of identifiers/names for bigWig tracks (set after downloading, used for model head and metrics)\n",
 "- **`sequence_length`**: Length of input sequences in base pairs (bp)\n",
 "- **`keep_target_center_fraction`**: Fraction of center sequence to keep for target prediction (crops edges to focus on center)\n",
 "\n",
@@ -108,6 +110,9 @@
 "- **`validate_every_n_steps`**: Run validation every N steps\n",
 "- **`num_validation_samples`**: Number of samples to use for validation set\n",
 "\n",
+"### Test\n",
+"- **`num_test_samples`**: Number of samples to use for test set evaluation\n",
+"\n",
 "### General\n",
 "- **`seed`**: Random seed for reproducibility\n",
 "- **`device`**: Device to run training on (\"cuda\" or \"cpu\")\n",
@@ -116,31 +121,18 @@
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": null,
 "metadata": {},
-"outputs": [
- {
-  "name": "stdout",
-  "output_type": "stream",
-  "text": [
-   "Using device: cpu\n"
-  ]
- }
-],
+"outputs": [],
 "source": [
 "config = {\n",
 "    # Model\n",
 "    \"model_name\": \"InstaDeepAI/NTv3_8M_pre\",\n",
 "    \n",
 "    # Data\n",
+"    \"hf_repo_id\": \"InstaDeepAI/NTv3_benchmark_dataset\",\n",
+"    \"species\": \"arabidopsis\",\n",
 "    \"data_cache_dir\": \"./data\",\n",
-"    \"fasta_url\": \"https://hgdownload.gi.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz\",\n",
-"    \"bigwig_url_list\": [\n",
-"        \"https://www.encodeproject.org/files/ENCFF055QKS/@@download/ENCFF055QKS.bigWig\",\n",
-"        \"https://www.encodeproject.org/files/ENCFF214GOQ/@@download/ENCFF214GOQ.bigWig\",\n",
-"        \"https://www.encodeproject.org/files/ENCFF592NIB/@@download/ENCFF592NIB.bigWig\",\n",
-"        \"https://www.encodeproject.org/files/ENCFF921PHQ/@@download/ENCFF921PHQ.bigWig\",\n",
-"    ],\n",
 "    \"sequence_length\": 32_768,\n",
 "    \"keep_target_center_fraction\": 0.375,\n",
 "    \n",
@@ -159,40 +151,11 @@
 "    \"num_test_samples\": 10000,\n",
 "    \n",
 "    # General\n",
-"    \"seed\":
+"    \"seed\": 0,\n",
 "    \"device\": \"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
 "    \"num_workers\": 16,\n",
 "}\n",
 "\n",
-"os.makedirs(config[\"data_cache_dir\"], exist_ok=True)\n",
-"\n",
-"# Extract filenames from URLs\n",
-"def extract_filename_from_url(url: str) -> str:\n",
-"    \"\"\"Extract filename from URL, handling query parameters.\"\"\"\n",
-"    # Remove query parameters if present\n",
-"    url_clean = url.split('?')[0]\n",
-"    # Get the last part of the URL path\n",
-"    return url_clean.split('/')[-1]\n",
-"\n",
-"# Create paths for downloaded files\n",
-"fasta_path = os.path.join(config[\"data_cache_dir\"], extract_filename_from_url(config[\"fasta_url\"]).replace('.gz', ''))\n",
-"bigwig_path_list = [\n",
-"    os.path.join(config[\"data_cache_dir\"], extract_filename_from_url(url))\n",
-"    for url in config[\"bigwig_url_list\"]\n",
-"]\n",
-"\n",
-"\n",
-"# TODO: find a way to link the experiment accession to bigwig file ids\n",
-"# Create bigwig_file_ids from filenames (without extension)\n",
-"config[\"bigwig_file_ids\"] = [\n",
-"    # os.path.splitext(extract_filename_from_url(url))[0]\n",
-"    # for url in config[\"bigwig_url_list\"]\n",
-"    \"ENCSR325NFE\",\n",
-"    \"ENCSR962OTG\",\n",
-"    \"ENCSR619DQO_P\",\n",
-"    \"ENCSR619DQO_M\",\n",
-"]\n",
-"\n",
 "# Set random seed\n",
 "torch.manual_seed(config[\"seed\"])\n",
 "np.random.seed(config[\"seed\"])\n",
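Note on reproducibility: the hunk above seeds torch and numpy, while the refactored dataset further down samples windows with Python's stdlib random (random.choice / random.randint in __getitem__). A minimal sketch of seeding all three generators; the stdlib random.seed call is an assumption here, shown in case the notebook does not seed it outside the visible hunk:

import random

import numpy as np
import torch

def seed_everything(seed: int) -> None:
    # torch: model init and dropout; numpy: any np.random draws;
    # random: the stdlib generator used by the dataset's window sampling.
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)  # assumption: not visible in the hunk above

seed_everything(0)  # matches the new "seed": 0 config value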
@@ -217,56 +180,99 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"def
-"
-"
-"\n",
-"
-"
-"
-"
-"
-"
-"
-"
-"
-"
-"
-"
-"        download_tasks.append((bigwig_url, filepath))\n",
-"\n",
-"# Download files in parallel\n",
-"max_workers = min(len(download_tasks), 8)\n",
-"\n",
-"print(f\"Downloading {len(download_tasks)} files using {max_workers} workers...\")\n",
-"with ThreadPoolExecutor(max_workers=max_workers) as executor:\n",
-"    # Submit all download tasks\n",
-"    future_to_path = {\n",
-"        executor.submit(_download_file, url, path): path\n",
-"        for url, path in download_tasks\n",
-"    }\n",
+"def prepare_genomics_inputs(\n",
+"    species: str,\n",
+"    data_cache_dir: str | Path = \"data\",\n",
+"    hf_repo_id: str = \"InstaDeepAI/NTv3_benchmark_dataset\",\n",
+") -> tuple[str, list[str], list[str]]:\n",
+"    \"\"\"\n",
+"    Downloads:\n",
+"    1) FASTA from HF dataset under: <species>/genome.fasta\n",
+"    2) BigWigs from HF dataset under: <species>/functional_tracks/**\n",
+"    3) Splits from HF dataset under: <species>/splits.bed\n",
+"    4) Metadata from HF dataset under: benchmark_metadata.tsv\n",
+"    Returns:\n",
+"        (fasta_path, bigwig_path_list, bigwig_file_ids)\n",
+"    \"\"\"\n",
+"    cache = Path(data_cache_dir).expanduser().resolve()\n",
+"    cache.mkdir(parents=True, exist_ok=True)\n",
 "    \n",
-"    #
-"
-"
-"
-"
-"
-"
-"
-"
-"
-"
-"
-"
-"
-
-
-
-
-
-
-
+"    # --- Download metadata + <species> files (FASTA, BigWigs, Splits) ---\n",
+"    api = HfApi()\n",
+"    files = api.list_repo_files(repo_id=hf_repo_id, repo_type=\"dataset\")\n",
+"    \n",
+"    # Find all files to download: species directory + metadata at root\n",
+"    species_pattern = f\"{species}/**\"\n",
+"    metadata_file = \"benchmark_metadata.tsv\"\n",
+"    \n",
+"    species_files = [p for p in files if fnmatch.fnmatch(p, species_pattern)]\n",
+"    if not species_files:\n",
+"        raise ValueError(f\"No files found matching '{species_pattern}' in '{hf_repo_id}'\")\n",
+"    \n",
+"    if metadata_file not in files:\n",
+"        raise ValueError(f\"No metadata file found at '{metadata_file}' in '{hf_repo_id}'\")\n",
+"    \n",
+"    # Download all needed files\n",
+"    download_patterns = [species_pattern, metadata_file]\n",
+"    local_dir = Path(\n",
+"        snapshot_download(\n",
+"            repo_id=hf_repo_id,\n",
+"            repo_type=\"dataset\",\n",
+"            allow_patterns=download_patterns,\n",
+"            local_dir=str(cache),\n",
+"        )\n",
+"    )\n",
+"    \n",
+"    # --- Organize outputs ---\n",
+"    # FASTA file\n",
+"    fasta_path_repo = f\"{species}/genome.fasta\"\n",
+"    fasta_path = str(local_dir / fasta_path_repo)\n",
+"    if not Path(fasta_path).is_file():\n",
+"        raise ValueError(f\"FASTA file not found at '{fasta_path}'\")\n",
+"    \n",
+"    # BigWig files\n",
+"    bigwig_paths, bigwig_ids = [], []\n",
+"    for repo_path in species_files:\n",
+"        lp = local_dir / repo_path\n",
+"        if lp.is_file() and lp.suffix == \".bigwig\":\n",
+"            bigwig_paths.append(str(lp))\n",
+"            bigwig_ids.append(lp.stem)\n",
+"    if not bigwig_paths:\n",
+"        raise ValueError(f\"Found no BigWig files in '{species_pattern}'\")\n",
+"    \n",
+"    # Splits file\n",
+"    splits_path_repo = f\"{species}/splits.bed\"\n",
+"    splits_path = local_dir / splits_path_repo\n",
+"    if not splits_path.is_file():\n",
+"        raise ValueError(f\"Splits file not found at '{splits_path}'\")\n",
+"    splits_df = pd.read_csv(\n",
+"        splits_path, \n",
+"        sep=\"\\t\", \n",
+"        header=None, \n",
+"        names=[\"chr_name\", \"start\", \"end\", \"split\"],\n",
+"        dtype={\"chr_name\": str, \"start\": int, \"end\": int, \"split\": str},\n",
+"    )\n",
+"    \n",
+"    # Metadata file\n",
+"    metadata_path = local_dir / metadata_file\n",
+"    if not metadata_path.is_file():\n",
+"        raise ValueError(f\"Metadata file not found at '{metadata_path}'\")\n",
+"    metadata_df = pd.read_csv(metadata_path, sep=\"\\t\")\n",
+"\n",
+"    if \"species\" not in metadata_df.columns:\n",
+"        raise ValueError(\"benchmark_metadata.tsv has no 'species' column\")\n",
+"\n",
+"    # Filter metadata according to species\n",
+"    metadata_df = metadata_df[metadata_df[\"species\"] == species].reset_index(drop=True)\n",
+"\n",
+"    # Order metadata according to bigwig file ids\n",
+"    metadata_df = (\n",
+"        metadata_df.set_index(\"file_id\")\n",
+"        .loc[bigwig_ids]\n",
+"        .reset_index()\n",
+"    )\n",
+"\n",
+"    return fasta_path, bigwig_paths, bigwig_ids, splits_df, metadata_df"
 ]
 },
 {
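The new prepare_genomics_inputs replaces the old per-URL downloads with the huggingface_hub API. A condensed, standalone sketch of the two calls it is built on: listing the dataset repo's files and fetching only a species subtree via allow_patterns. The repo id and file layout are taken from the diff; network access is assumed:

from fnmatch import fnmatch
from huggingface_hub import HfApi, snapshot_download

repo_id = "InstaDeepAI/NTv3_benchmark_dataset"  # repo id as used in the diff
species = "arabidopsis"

# List every file in the dataset repo, then keep the per-species subtree.
files = HfApi().list_repo_files(repo_id=repo_id, repo_type="dataset")
wanted = [f for f in files if fnmatch(f, f"{species}/**")]

# Fetch only the matching files (plus the root metadata table) into ./data;
# snapshot_download returns the local directory containing the snapshot.
local_dir = snapshot_download(
    repo_id=repo_id,
    repo_type="dataset",
    allow_patterns=[f"{species}/**", "benchmark_metadata.tsv"],
    local_dir="./data",
)
print(local_dir, len(wanted))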
@@ -275,11 +281,20 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"
-"
-"
-"
-"
+"os.makedirs(config[\"data_cache_dir\"], exist_ok=True)\n",
+"\n",
+"# Download all species files + load the splits, and metadata\n",
+"(\n",
+"    fasta_path, \n",
+"    bigwig_paths, \n",
+"    bigwig_ids, \n",
+"    species_splits_df,\n",
+"    metadata_df \n",
+") = prepare_genomics_inputs(\n",
+"    config[\"species\"], \n",
+"    config[\"data_cache_dir\"], \n",
+"    config[\"hf_repo_id\"]\n",
+")"
 ]
 },
 {
@@ -335,7 +350,7 @@
 "        self.backbone = AutoModelForMaskedLM.from_pretrained(\n",
 "            model_name, \n",
 "            trust_remote_code=True,\n",
-"            config=self.config
+"            config=self.config,\n",
 "        )\n",
 "        \n",
 "        self.keep_target_center_fraction = keep_target_center_fraction\n",
@@ -351,7 +366,7 @@
 "        \n",
 "    def forward(self, tokens: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]:\n",
 "        # Forward through backbone\n",
-"        outputs = self.backbone(input_ids=tokens)\n",
+"        outputs = self.backbone(input_ids=tokens, output_hidden_states=True)\n",
 "        embedding = outputs.hidden_states[-1]  # Last hidden state\n",
 "        \n",
 "        # Crop to center fraction\n",
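The one-line change above passes output_hidden_states=True. For most transformers models, hidden states are only returned on request (unless enabled in the model config), so outputs.hidden_states would otherwise be None and indexing [-1] would fail. A minimal sketch of the fixed call, reusing the backbone/tokens names from the diff:

# Sketch only: `backbone` and `tokens` are as constructed earlier in the notebook.
outputs = backbone(input_ids=tokens, output_hidden_states=True)
embedding = outputs.hidden_states[-1]  # (batch, seq_len, hidden_dim), last layer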
@@ -379,14 +394,14 @@
 "# Create model\n",
 "model = HFModelWithHead(\n",
 "    model_name=config[\"model_name\"],\n",
-"    bigwig_track_names=
+"    bigwig_track_names=bigwig_ids,\n",
 "    keep_target_center_fraction=config[\"keep_target_center_fraction\"],\n",
 ")\n",
 "model = model.to(device)\n",
 "model.train()\n",
 "\n",
 "print(f\"Model loaded: {config['model_name']}\")\n",
-"print(f\"Number of bigwig tracks: {len(
+"print(f\"Number of bigwig tracks: {len(bigwig_ids)}\")\n",
 "print(f\"Model parameters: {sum(p.numel() for p in model.parameters()):,}\")"
 ]
 },
@@ -426,8 +441,8 @@
 "    Random genomic windows from a reference genome + bigWig signal.\n",
 "\n",
 "    Each sample:\n",
-"    - picks a
-"    - picks a random window of length `sequence_length
+"    - picks a random region from the specified split,\n",
+"    - picks a random window of length `sequence_length` within that region,\n",
 "    - returns (sequence, signal, chrom, start, end).\n",
 "\n",
 "    This dataset is compatible with multi-worker DataLoaders. BigWig files\n",
@@ -438,11 +453,13 @@
 "    ----\n",
 "    fasta_path : str\n",
 "        Path to the reference genome FASTA (e.g. hg38.fna).\n",
-"    bigwig_path_list : str\n",
-"
-"
-"
-"
+"    bigwig_path_list : list[str]\n",
+"        List of paths to bigWig files.\n",
+"    chrom_regions : pd.DataFrame\n",
+"        DataFrame with columns: chr_name, start, end, split.\n",
+"        Contains all genomic regions with their split assignments.\n",
+"    split : str\n",
+"        Split name to filter regions (e.g., \"train\", \"val\", \"test\").\n",
 "    sequence_length : int\n",
 "        Length of each random window (in bp).\n",
 "    num_samples : int\n",
@@ -453,18 +470,14 @@
 "        Function to transform/scaling bigwig targets.\n",
 "    keep_target_center_fraction : float\n",
 "        Fraction of center sequence to keep for target prediction (crops edges to focus on center).\n",
-"    regions : List[tuple[str, int, int]] | None\n",
-"        Optional list of regions as (chromosome, start, end) tuples.\n",
-"        If provided, samples are drawn randomly from within these regions only.\n",
-"        This matches the JAX pipeline approach using BED file splits.\n",
-"        If None, samples from entire chromosomes in `chroms`.\n",
 "    \"\"\"\n",
 "\n",
 "    def __init__(\n",
 "        self,\n",
 "        fasta_path: str,\n",
 "        bigwig_path_list: list[str],\n",
-"
+"        chrom_regions: pd.DataFrame,\n",
+"        split: str,\n",
 "        sequence_length: int,\n",
 "        num_samples: int,\n",
 "        tokenizer: AutoTokenizer,\n",
@@ -479,43 +492,37 @@
 "        self.sequence_length = sequence_length\n",
 "        self.num_samples = num_samples\n",
 "        self.tokenizer = tokenizer\n",
-"        self.transform_fn = transform_fn
+"        self.transform_fn = transform_fn\n",
 "        self.keep_target_center_fraction = keep_target_center_fraction\n",
-"        self.
+"        self.chrom_regions = chrom_regions\n",
 "\n",
-"        #
-"
-"        bw_handle = _get_bigwig_handle(bigwig_path_list[0])\n",
-"        bw_chrom_lengths = bw_handle.chroms()  # dict: chrom -> length\n",
+"        # Filter regions by split\n",
+"        split_regions = self.chrom_regions[self.chrom_regions[\"split\"] == split].copy()\n",
 "\n",
-"
-"        self.
+"        # Filter valid regions (must be large enough for sequence_length)\n",
+"        self.valid_regions = []\n",
+"        for _, row in split_regions.iterrows():\n",
 "\n",
-"
-"        if
+"            region_length = row.end - row.start\n",
+"            if region_length < self.sequence_length:\n",
 "                continue\n",
+"            \n",
+"            # Store valid region\n",
+"            self.valid_regions.append((row.chr_name, row.start, row.end))\n",
 "\n",
-"
-"
-"            L = min(fa_len, bw_len)\n",
-"\n",
-"            if L > self.sequence_length:\n",
-"                self.valid_chroms.append(c)\n",
-"                self.chrom_lengths[c] = L\n",
-"\n",
-"        if not self.valid_chroms:\n",
-"            raise ValueError(\"No valid chromosomes after intersecting FASTA and bigWig.\")\n",
+"        if not self.valid_regions:\n",
+"            raise ValueError(f\"No valid regions found for split '{split}'\")\n",
 "\n",
 "    def __len__(self):\n",
 "        return self.num_samples\n",
 "\n",
 "    def __getitem__(self, idx):\n",
-"\n",
-"
-"
-"
-"        max_start =
-"        start = random.randint(
+"        # Sample a random region from the valid regions\n",
+"        chrom, region_start, region_end = random.choice(self.valid_regions)\n",
+"        \n",
+"        # Sample a random window within this region\n",
+"        max_start = region_end - self.sequence_length\n",
+"        start = random.randint(region_start, max_start)\n",
 "        end = start + self.sequence_length\n",
 "\n",
 "        # Sequence\n",
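The rewritten __init__/__getitem__ pair samples from BED-split regions instead of whole chromosomes: keep regions at least sequence_length long, pick one uniformly, then pick a start so the window stays inside it. A self-contained sketch of that logic with made-up regions; note random.randint is inclusive on both ends, so end never exceeds region_end:

import random

# Standalone sketch of the new sampling logic, with hypothetical regions.
sequence_length = 8
regions = [("chr1", 0, 100), ("chr2", 50, 60)]  # (chrom, start, end)

# Keep only regions that can hold a full window, as __init__ does.
valid = [(c, s, e) for (c, s, e) in regions if (e - s) >= sequence_length]

# One draw of __getitem__'s window sampling.
chrom, region_start, region_end = random.choice(valid)
max_start = region_end - sequence_length         # randint is inclusive on both ends,
start = random.randint(region_start, max_start)  # so end <= region_end always holds
end = start + sequence_length
print(chrom, start, end)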
@@ -575,133 +582,26 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"
-"
-"
-"    Compute minimal statistics needed for weighted mean computation.\n",
-"    \n",
-"    Args:\n",
-"        track_data: numpy array of track values for a chromosome\n",
-"    \n",
-"    Returns:\n",
-"        Dictionary with statistics: sum, mean, total_count\n",
+"def create_targets_scaling_fn(\n",
+"    metadata_df: pd.DataFrame\n",
+") -> Callable[[torch.Tensor], torch.Tensor]:\n",
 "    \"\"\"\n",
-"
-"    \n",
-"    # Compute statistics\n",
-"    sum_all = np.sum(track_data)\n",
-"    total_count = track_data.size\n",
-"    mean_all = sum_all / total_count if total_count > 0 else 0.0\n",
-"    \n",
-"    return {\n",
-"        \"sum\": sum_all,\n",
-"        \"mean\": mean_all,\n",
-"        \"total_count\": total_count,\n",
-"    }\n",
+"    Build a scaling function based on track means contained in the metadata.\n",
 "\n",
-"\n",
-"def aggregate_file_statistics(chr_stats_list: List[dict]) -> dict:\n",
-"    \"\"\"\n",
-"    Aggregate chromosome-level statistics into file-level statistics.\n",
-"    \n",
 "    Args:\n",
-"
-"    \n",
-"    Returns:\n",
-"        Dictionary with aggregated file-level statistics (only mean)\n",
-"    \"\"\"\n",
-"    # Convert to arrays for easier computation\n",
-"    total_counts = np.array([s[\"total_count\"] for s in chr_stats_list], dtype=np.int64)\n",
-"    means = np.array([s[\"mean\"] for s in chr_stats_list], dtype=np.float32)\n",
-"    sums = np.array([s[\"sum\"] for s in chr_stats_list], dtype=np.float32)\n",
-"    \n",
-"    # Aggregate total count\n",
-"    total_count = np.sum(total_counts)\n",
-"    \n",
-"    # Weighted mean: mean = sum(mean_chr * total_count_chr) / sum(total_count_chr)\n",
-"    mean = np.sum(means * total_counts) / total_count if total_count > 0 else 0.0\n",
-"    \n",
-"    return {\n",
-"        \"total_count\": total_count,\n",
-"        \"sum\": np.sum(sums),\n",
-"        \"mean\": mean,\n",
-"    }\n",
-"\n",
+"        metadata_df: pandas.DataFrame with track means\n",
 "\n",
-"def get_track_means(bigwig_tracks_list: List[pyBigWig.pyBigWig]) -> np.ndarray:\n",
-"    \"\"\"\n",
-"    Get track means for normalization.\n",
-"    Computes statistics per chromosome and aggregates using weighted averaging,\n",
-"    \n",
-"    Args:\n",
-"        bigwig_tracks_list: List of pyBigWig file objects\n",
-"    \n",
-"    Returns:\n",
-"        Array of track means, one per bigwig file\n",
-"    \"\"\"\n",
-"    track_means = []\n",
-"    \n",
-"    for bigwig_track in bigwig_tracks_list:\n",
-"        chrom_lengths = bigwig_track.chroms()\n",
-"        all_chr_stats = []\n",
-"        \n",
-"        # Compute statistics for each chromosome\n",
-"        for chrom_name, chrom_length in chrom_lengths.items():\n",
-"            try:\n",
-"                # Get chromosome data as numpy array\n",
-"                bw_array = np.array(\n",
-"                    bigwig_track.values(chrom_name, 0, chrom_length, numpy=True),\n",
-"                    dtype=np.float32\n",
-"                )\n",
-"                # Replace NaN with 0\n",
-"                bw_array = np.nan_to_num(bw_array, nan=0.0)\n",
-"                \n",
-"                # Compute chromosome-level statistics\n",
-"                chr_stats = compute_chromosome_stats(bw_array)\n",
-"                all_chr_stats.append(chr_stats)\n",
-"            except Exception as e:\n",
-"                # Skip chromosomes that fail to load\n",
-"                print(f\"Warning: Failed to load chromosome {chrom_name}: {e}\")\n",
-"                continue\n",
-"        \n",
-"        if not all_chr_stats:\n",
-"            raise ValueError(f\"No valid chromosomes found for bigwig track\")\n",
-"        \n",
-"        # Aggregate chromosome-level stats into file-level stats\n",
-"        file_stats = aggregate_file_statistics(all_chr_stats)\n",
-"        \n",
-"        # Use the weighted mean for normalization\n",
-"        track_means.append(file_stats[\"mean\"])\n",
-"        \n",
-"    return np.array(track_means, dtype=np.float32)\n",
-"\n",
-"\n",
-"def create_targets_scaling_fn(bigwig_path_list: List[str]) -> Callable[[torch.Tensor], torch.Tensor]:\n",
-"    \"\"\"\n",
-"    Build a scaling function based on track means computed from bigwig files.\n",
-"    \n",
-"    Opens bigwig files, computes track statistics, and creates a transform function.\n",
-"    The statistics are computed once and reused for all calls to the returned transform function.\n",
-"    \n",
-"    Args:\n",
-"        bigwig_path_list: List of paths to bigwig files\n",
-"    \n",
 "    Returns:\n",
 "        Transform function that scales input tensors\n",
 "    \"\"\"\n",
 "    # Open bigwig files and compute track statistics\n",
-"
-"
-"
-"
-"    ]\n",
-"    track_means = get_track_means(bw_list)\n",
-"    print(f\"Computed track means: {track_means}\")\n",
-"    print(f\"Track means shape: {track_means.shape}\")\n",
-"    \n",
+"    track_means = metadata_df[\"mean\"].to_numpy()\n",
+"    print(f\"Track means: {track_means}\")\n",
+"    print(f\"Number of tracks: {track_means.shape}\")\n",
+"\n",
 "    # Create tensor from computed means\n",
 "    track_means_tensor = torch.tensor(track_means, dtype=torch.float32)\n",
-"
+"\n",
 "    def transform_fn(x: torch.Tensor) -> torch.Tensor:\n",
 "        \"\"\"\n",
 "        x: torch.Tensor, shape (seq_len, num_tracks) or (batch, seq_len, num_tracks)\n",
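create_targets_scaling_fn now takes per-track means straight from benchmark_metadata.tsv instead of scanning every bigWig at startup. A standalone sketch of the returned closure, assuming the scale step divides by the per-track mean (as the variable names suggest) and using an illustrative clip value, since the actual threshold sits outside the visible hunk:

import torch

track_means = torch.tensor([2.0, 0.5, 1.0])  # hypothetical means, one per track

def transform_fn(x: torch.Tensor) -> torch.Tensor:
    # x: (seq_len, num_tracks) or (batch, seq_len, num_tracks); the trailing
    # tracks axis broadcasts against track_means.
    scaled = x / track_means               # assumption: mean normalization
    return torch.clamp(scaled, max=64.0)   # illustrative clip value only

print(transform_fn(torch.ones(4, 3)))  # each track scaled by 1/mean, then clipped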
@@ -717,20 +617,10 @@
 "            scaled,\n",
 "        )\n",
 "        return clipped\n",
-"
+"\n",
 "    return transform_fn"
 ]
 },
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"# Create scaling function\n",
-"targets_transform_fn = create_targets_scaling_fn(bigwig_path_list)"
-]
-},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -741,25 +631,26 @@
 "create_dataset_fn = functools.partial(\n",
 "    GenomeBigWigDataset,\n",
 "    fasta_path=fasta_path,\n",
-"    bigwig_path_list=
+"    bigwig_path_list=bigwig_paths,\n",
+"    chrom_regions=species_splits_df,\n",
 "    sequence_length=config[\"sequence_length\"],\n",
 "    tokenizer=tokenizer,\n",
-"    transform_fn=
+"    transform_fn=create_targets_scaling_fn(metadata_df),\n",
 "    keep_target_center_fraction=config[\"keep_target_center_fraction\"],\n",
 ")\n",
 "\n",
 "train_dataset = create_dataset_fn(\n",
-"
+"    split=\"train\",\n",
 "    num_samples=config[\"num_steps_training\"] * config[\"batch_size\"],\n",
 ")\n",
 "\n",
 "val_dataset = create_dataset_fn(\n",
-"
+"    split=\"val\",\n",
 "    num_samples=config[\"num_validation_samples\"],\n",
 ")\n",
 "\n",
 "test_dataset = create_dataset_fn(\n",
-"
+"    split=\"test\",\n",
 "    num_samples=config[\"num_test_samples\"],\n",
 ")\n",
 "\n",
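Since the three datasets differ only in split and num_samples, the diff binds every shared argument once with functools.partial. A toy sketch of that pattern, with a stand-in constructor in place of GenomeBigWigDataset:

import functools

# Hypothetical constructor standing in for GenomeBigWigDataset.
def make_dataset(source: str, split: str, num_samples: int) -> dict:
    return {"source": source, "split": split, "num_samples": num_samples}

# Bind the shared kwargs once; only the per-split arguments vary below.
create_fn = functools.partial(make_dataset, source="genome.fasta")
train = create_fn(split="train", num_samples=100)
val = create_fn(split="val", num_samples=10)
print(train, val)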
@@ -785,7 +676,7 @@
 "    num_workers=config[\"num_workers\"],\n",
 ")\n",
 "\n",
-"print(f\"
+"print(f\"\\nTrain samples: {len(train_dataset)}\")\n",
 "print(f\"Val samples: {len(val_dataset)}\")\n",
 "print(f\"Test samples: {len(test_dataset)}\")"
 ]
@@ -912,9 +803,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"train_metrics = TracksMetrics(
-"val_metrics = TracksMetrics(
-"test_metrics = TracksMetrics(
+"train_metrics = TracksMetrics(bigwig_ids)\n",
+"val_metrics = TracksMetrics(bigwig_ids)\n",
+"test_metrics = TracksMetrics(bigwig_ids)"
 ]
 },
 {
@@ -1098,47 +989,6 @@
 "val_losses = []\n",
 "val_pearson_scores = []\n",
 "\n",
-"# Initialize interactive plots using FigureWidget for real-time updates\n",
-"from plotly.graph_objects import FigureWidget\n",
-"from plotly.subplots import make_subplots\n",
-"\n",
-"# Create base figure with subplots\n",
-"fig_base = make_subplots(\n",
-"    rows=1, cols=2,\n",
-"    subplot_titles=('Loss', 'Mean Pearson Correlation'),\n",
-"    horizontal_spacing=0.15,\n",
-")\n",
-"\n",
-"# Add empty traces for train and val metrics\n",
-"fig_base.add_trace(\n",
-"    go.Scatter(x=[], y=[], mode='lines+markers', name='Train Loss', line=dict(color='blue')),\n",
-"    row=1, col=1\n",
-")\n",
-"fig_base.add_trace(\n",
-"    go.Scatter(x=[], y=[], mode='lines+markers', name='Val Loss', line=dict(color='red')),\n",
-"    row=1, col=1\n",
-")\n",
-"fig_base.add_trace(\n",
-"    go.Scatter(x=[], y=[], mode='lines+markers', name='Train Pearson', line=dict(color='green')),\n",
-"    row=1, col=2\n",
-")\n",
-"fig_base.add_trace(\n",
-"    go.Scatter(x=[], y=[], mode='lines+markers', name='Val Pearson', line=dict(color='orange')),\n",
-"    row=1, col=2\n",
-")\n",
-"\n",
-"fig_base.update_xaxes(title_text=\"Step\", row=1, col=1)\n",
-"fig_base.update_xaxes(title_text=\"Step\", row=1, col=2)\n",
-"fig_base.update_yaxes(title_text=\"Loss\", row=1, col=1)\n",
-"fig_base.update_yaxes(title_text=\"Pearson Correlation\", row=1, col=2)\n",
-"fig_base.update_layout(height=800, width=1600, showlegend=True)\n",
-"\n",
-"# Convert to FigureWidget for interactive updates\n",
-"fig = FigureWidget(fig_base)\n",
-"\n",
-"# Display initial plot (will update in place during training)\n",
-"display(fig)\n",
-"\n",
 "# Create iterator for training data (will cycle if needed)\n",
 "train_iter = iter(train_loader)\n",
 "\n",
@@ -1183,11 +1033,6 @@
 "        train_losses.append(mean_loss)\n",
 "        train_pearson_scores.append(train_metrics_dict['mean/pearson'])\n",
 "        \n",
-"        # Update plots - direct assignment to FigureWidget data updates the plot automatically\n",
-"        fig.data[0].x = train_steps\n",
-"        fig.data[0].y = train_losses\n",
-"        fig.data[2].x = train_steps\n",
-"        fig.data[2].y = train_pearson_scores\n",
 "        \n",
 "        print(\n",
 "            f\"Step {step_idx + 1}/{config['num_steps_training']} | \"\n",
@@ -1215,11 +1060,6 @@
 "            val_losses.append(val_metrics_dict['loss'])\n",
 "            val_pearson_scores.append(val_pearson_mean)\n",
 "            \n",
-"            # Update plots with validation data - direct assignment updates the plot automatically\n",
-"            fig.data[1].x = val_steps\n",
-"            fig.data[1].y = val_losses\n",
-"            fig.data[3].x = val_steps\n",
-"            fig.data[3].y = val_pearson_scores\n",
 "            \n",
 "            print(f\"  Validation Loss: {val_metrics_dict['loss']:.4f}\")\n",
 "            print(f\"  Validation Mean Pearson: {val_pearson_mean:.4f}\")\n",
@@ -1228,7 +1068,40 @@
 "    \n",
 "    model.train()  # Back to training mode\n",
 "\n",
-"print(f\"\\nTraining completed after {config['num_steps_training']} steps.\")"
+"print(f\"\\nTraining completed after {config['num_steps_training']} steps.\")\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# Plot training results\n",
+"fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n",
+"\n",
+"# Plot Loss\n",
+"axes[0].plot(train_steps, train_losses, 'b-o', label='Train Loss', markersize=4, linewidth=1.5)\n",
+"if val_steps:\n",
+"    axes[0].plot(val_steps, val_losses, 'r-s', label='Val Loss', markersize=4, linewidth=1.5)\n",
+"axes[0].set_xlabel('Step')\n",
+"axes[0].set_ylabel('Loss')\n",
+"axes[0].set_title('Loss')\n",
+"axes[0].legend()\n",
+"axes[0].grid(True, alpha=0.3)\n",
+"\n",
+"# Plot Pearson Correlation\n",
+"axes[1].plot(train_steps, train_pearson_scores, 'g-o', label='Train Pearson', markersize=4, linewidth=1.5)\n",
+"if val_steps:\n",
+"    axes[1].plot(val_steps, val_pearson_scores, 'orange', marker='s', label='Val Pearson', markersize=4, linewidth=1.5)\n",
+"axes[1].set_xlabel('Step')\n",
+"axes[1].set_ylabel('Pearson Correlation')\n",
+"axes[1].set_title('Mean Pearson Correlation')\n",
+"axes[1].legend()\n",
+"axes[1].grid(True, alpha=0.3)\n",
+"\n",
+"plt.tight_layout()\n",
+"plt.show()\n"
 ]
 },
 {