ybornachot committed
Commit 2f2efdf · 1 Parent(s): 35ab8fa

fix: corrected paths for restricted list of BigWigs
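
In short: prepare_genomics_inputs gains a bigwig_file_ids argument so the notebook can download and train on a restricted subset of functional tracks instead of every BigWig available for the species. A minimal sketch of the intended call, assuming the three-tuple return described in the docstring (the species value and the shortened track list are illustrative):

    # Hypothetical invocation mirroring the new config keys in the diff below
    fasta_path, bigwig_paths, bigwig_ids = prepare_genomics_inputs(
        species="human",
        data_cache_dir="./data",
        hf_repo_id="InstaDeepAI/NTv3_benchmark_dataset",
        bigwig_file_ids=["ENCSR325NFE", "ENCSR962OTG"],  # None -> all tracks
    )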

Files changed (1)
  1. notebooks/03_fine_tuning.ipynb +98 -31
notebooks/03_fine_tuning.ipynb CHANGED
@@ -139,10 +139,13 @@
     "    \"data_cache_dir\": \"./data\",\n",
     "    \"sequence_length\": 32_768,\n",
     "    \"keep_target_center_fraction\": 0.375,\n",
+    "    \"bigwig_file_ids\": [\n",
+    "        \"ENCSR325NFE\", \"ENCSR962OTG\", \"ENCSR619DQO_P\", \"ENCSR619DQO_M\"\n",
+    "    ],  # If None, will use all available tracks for the selected species\n",
     "    \n",
     "    # Training\n",
     "    \"batch_size\": 12,\n",
-    "    \"num_steps_training\": 5315,  # reproduce 10% of benchmark training length\n",
+    "    \"num_steps_training\": 2000,  # Consider increasing to improve training performance\n",
     "    \"log_every_n_steps\": 20,\n",
     "    \"learning_rate\": 1e-5,\n",
     "    \"weight_decay\": 0.01,\n",
@@ -196,13 +199,23 @@
     "    species: str,\n",
     "    data_cache_dir: str | Path = \"data\",\n",
     "    hf_repo_id: str = \"InstaDeepAI/NTv3_benchmark_dataset\",\n",
+    "    bigwig_file_ids: list[str] | None = None,\n",
     ") -> tuple[str, list[str], list[str]]:\n",
     "    \"\"\"\n",
     "    Downloads:\n",
     "    1) FASTA from HF dataset under: <species>/genome.fasta\n",
     "    2) BigWigs from HF dataset under: <species>/functional_tracks/**\n",
+    "       (filtered by bigwig_file_ids if provided)\n",
     "    3) Splits from HF dataset under: <species>/splits.bed\n",
     "    4) Metadata from HF dataset under: benchmark_metadata.tsv\n",
+    "    \n",
+    "    Args:\n",
+    "        species: Species name (e.g., \"human\", \"arabidopsis\")\n",
+    "        data_cache_dir: Directory where downloaded data files will be stored\n",
+    "        hf_repo_id: HuggingFace dataset repository ID\n",
+    "        bigwig_file_ids: Optional list of BigWig file IDs to download. If None,\n",
+    "            downloads all available BigWig files for the species.\n",
+    "    \n",
     "    Returns:\n",
     "        (fasta_path, bigwig_path_list, bigwig_file_ids)\n",
     "    \"\"\"\n",
@@ -210,16 +223,36 @@
     "    cache.mkdir(parents=True, exist_ok=True)\n",
     "    \n",
     "    # --- Download metadata + <species> files (FASTA, BigWigs, Splits) ---\n",
-    "    api = HfApi()\n",
-    "    files = api.list_repo_files(repo_id=hf_repo_id, repo_type=\"dataset\")\n",
-    "    \n",
-    "    # Find all files to download: species directory + metadata at root\n",
-    "    species_pattern = f\"{species}/**\"\n",
     "    metadata_file = \"benchmark_metadata.tsv\"\n",
-    "    species_files = [p for p in files if fnmatch.fnmatch(p, species_pattern)]\n",
+    "    download_patterns = [metadata_file, f\"{species}/genome.fasta\", f\"{species}/splits.bed\"]\n",
     "    \n",
-    "    # Download all needed files\n",
-    "    download_patterns = [species_pattern, metadata_file]\n",
+    "    if bigwig_file_ids is not None:\n",
+    "        # List repo files to validate that the requested BigWig files exist\n",
+    "        api = HfApi()\n",
+    "        files = api.list_repo_files(repo_id=hf_repo_id, repo_type=\"dataset\")\n",
+    "        species_pattern = f\"{species}/**\"\n",
+    "        species_files = [p for p in files if fnmatch.fnmatch(p, species_pattern)]\n",
+    "        \n",
+    "        # Get all available BigWig file IDs and their paths\n",
+    "        available_bigwig_files = {\n",
+    "            Path(p).stem: p for p in species_files\n",
+    "            if Path(p).suffix == \".bigwig\"\n",
+    "        }\n",
+    "        \n",
+    "        # Check that all requested files exist\n",
+    "        missing_files = set(bigwig_file_ids) - set(available_bigwig_files.keys())\n",
+    "        if missing_files:\n",
+    "            raise ValueError(\n",
+    "                f\"Requested BigWig files not found: {missing_files}. \"\n",
+    "                f\"Available files: {list(available_bigwig_files.keys())}\"\n",
+    "            )\n",
+    "        \n",
+    "        # Add patterns for the requested BigWig files only\n",
+    "        for file_id in bigwig_file_ids:\n",
+    "            download_patterns.append(available_bigwig_files[file_id])\n",
+    "    else:\n",
+    "        # Download all BigWig files\n",
+    "        download_patterns.append(f\"{species}/functional_tracks/*.bigwig\")\n",
     "    local_dir = Path(\n",
     "        snapshot_download(\n",
     "            repo_id=hf_repo_id,\n",
@@ -236,15 +269,16 @@
     "    if not Path(fasta_path).is_file():\n",
     "        raise ValueError(f\"FASTA file not found at '{fasta_path}'\")\n",
     "    \n",
-    "    # BigWig files\n",
-    "    bigwig_paths, bigwig_ids = [], []\n",
-    "    for repo_path in species_files:\n",
-    "        lp = local_dir / repo_path\n",
-    "        if lp.is_file() and lp.suffix == \".bigwig\":\n",
-    "            bigwig_paths.append(str(lp))\n",
-    "            bigwig_ids.append(lp.stem)\n",
-    "    if not bigwig_paths:\n",
-    "        raise ValueError(f\"Found no BigWig files in '{species_pattern}'\")\n",
+    "    # BigWig files - use downloaded files directly\n",
+    "    bigwig_dir = local_dir / species / \"functional_tracks\"\n",
+    "    \n",
+    "    if bigwig_file_ids is not None:\n",
+    "        bigwig_paths = [bigwig_dir / f\"{file_id}.bigwig\" for file_id in bigwig_file_ids]\n",
+    "        bigwig_ids = bigwig_file_ids\n",
+    "    else:\n",
+    "        # Find all downloaded BigWig files\n",
+    "        bigwig_paths = [bigwig_file for bigwig_file in bigwig_dir.glob(\"*.bigwig\")]\n",
+    "        bigwig_ids = [bigwig_file.stem for bigwig_file in bigwig_dir.glob(\"*.bigwig\")]\n",
     "    \n",
     "    # Splits file\n",
     "    splits_path_repo = f\"{species}/splits.bed\"\n",
@@ -284,7 +318,7 @@
    "source": [
     "os.makedirs(config[\"data_cache_dir\"], exist_ok=True)\n",
     "\n",
-    "# Download all species files + load the splits, and metadata\n",
+    "# Download all requested species-related files + load the splits and metadata\n",
     "(\n",
     "    fasta_path, \n",
     "    bigwig_paths, \n",
@@ -294,7 +328,8 @@
     ") = prepare_genomics_inputs(\n",
     "    config[\"species\"], \n",
     "    config[\"data_cache_dir\"], \n",
-    "    config[\"hf_repo_id\"]\n",
+    "    config[\"hf_repo_id\"],\n",
+    "    bigwig_file_ids=config[\"bigwig_file_ids\"]\n",
     ")"
    ]
   },
@@ -348,11 +383,12 @@
     "        \n",
     "        # Load config and model\n",
     "        self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)\n",
-    "        self.backbone = AutoModelForMaskedLM.from_pretrained(\n",
+    "        backbone = AutoModelForMaskedLM.from_pretrained(\n",
     "            model_name, \n",
     "            trust_remote_code=True,\n",
     "            config=self.config,\n",
     "        )\n",
+    "        self.backbone = torch.compile(backbone)\n",
     "        \n",
     "        self.keep_target_center_fraction = keep_target_center_fraction\n",
     "\n",
@@ -428,18 +464,48 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Process-local cache for BigWig file handles (one per worker process)\n",
+    "# Process-local cache for file handles (one per worker process)\n",
     "# This allows safe multi-worker DataLoader usage\n",
+    "_fasta_cache = {}  # Maps (process_id, file_path) -> Fasta handle\n",
     "_bigwig_cache = {}  # Maps (process_id, file_path) -> pyBigWig handle\n",
     "\n",
     "\n",
+    "def _get_fasta_handle(fasta_path: str) -> Fasta:\n",
+    "    \"\"\"Get or create a FASTA file handle for the current process.\"\"\"\n",
+    "    process_id = os.getpid()\n",
+    "    abs_path = str(Path(fasta_path).resolve())\n",
+    "    cache_key = (process_id, abs_path)\n",
+    "    \n",
+    "    if cache_key not in _fasta_cache:\n",
+    "        _fasta_cache[cache_key] = Fasta(abs_path, as_raw=True, sequence_always_upper=True)\n",
+    "    \n",
+    "    return _fasta_cache[cache_key]\n",
+    "\n",
+    "\n",
     "def _get_bigwig_handle(bigwig_path: str) -> pyBigWig.pyBigWig:\n",
     "    \"\"\"Get or create a BigWig file handle for the current process.\"\"\"\n",
     "    process_id = os.getpid()\n",
-    "    cache_key = (process_id, bigwig_path)\n",
+    "    abs_path = str(Path(bigwig_path).resolve())\n",
+    "    cache_key = (process_id, abs_path)\n",
     "    \n",
     "    if cache_key not in _bigwig_cache:\n",
-    "        _bigwig_cache[cache_key] = pyBigWig.open(bigwig_path)\n",
+    "        # Check if file exists before trying to open\n",
+    "        if not Path(abs_path).exists():\n",
+    "            raise FileNotFoundError(\n",
+    "                f\"BigWig file not found: {abs_path}\\n\"\n",
+    "                f\"Original path: {bigwig_path}\\n\"\n",
+    "                f\"Current working directory: {os.getcwd()}\"\n",
+    "            )\n",
+    "        \n",
+    "        try:\n",
+    "            _bigwig_cache[cache_key] = pyBigWig.open(abs_path)\n",
+    "        except Exception as e:\n",
+    "            raise RuntimeError(\n",
+    "                f\"Failed to open BigWig file: {abs_path}\\n\"\n",
+    "                f\"Error: {str(e)}\\n\"\n",
+    "                f\"File exists: {Path(abs_path).exists()}\\n\"\n",
+    "                f\"File size: {Path(abs_path).stat().st_size if Path(abs_path).exists() else 'N/A'} bytes\"\n",
+    "            ) from e\n",
     "    \n",
     "    return _bigwig_cache[cache_key]\n",
     "\n",
@@ -494,8 +560,8 @@
     "    ):\n",
     "        super().__init__()\n",
     "\n",
-    "        self.fasta = Fasta(fasta_path, as_raw=True, sequence_always_upper=True)\n",
     "        # Store paths instead of opening files immediately (for multi-worker compatibility)\n",
+    "        self.fasta_path = fasta_path\n",
     "        self.bigwig_path_list = bigwig_path_list\n",
     "        self.sequence_length = sequence_length\n",
     "        self.num_samples = num_samples\n",
@@ -533,8 +599,9 @@
     "        start = random.randint(region_start, max_start)\n",
     "        end = start + self.sequence_length\n",
     "\n",
-    "        # Sequence\n",
-    "        seq = self.fasta[chrom][start:end]  # string slice\n",
+    "        # Sequence - get FASTA handle lazily (cached per worker process)\n",
+    "        fasta = _get_fasta_handle(self.fasta_path)\n",
+    "        seq = fasta[chrom][start:end]  # string slice\n",
     "        # Tokenize with padding and truncation to ensure consistent lengths for batching\n",
     "        tokenized = self.tokenizer(\n",
     "            seq,\n",
@@ -788,7 +855,8 @@
     "        metrics_dict = {}\n",
     "        \n",
     "        # Compute Pearson correlation per track\n",
-    "        correlations = self.pearson_metric.compute().numpy()\n",
+    "        # Move to CPU before converting to numpy\n",
+    "        correlations = self.pearson_metric.compute().cpu().numpy()\n",
     "        for i, track_name in enumerate(self.track_names):\n",
     "            metrics_dict[f\"{track_name}/pearson\"] = correlations[i]\n",
     "        \n",
@@ -931,7 +999,6 @@
     "    loss.backward()\n",
     "    return loss.item()\n",
     "\n",
-    "\n",
     "def validation_step(\n",
     "    model: nn.Module,\n",
     "    batch: Dict[str, torch.Tensor],\n",
@@ -1149,10 +1216,10 @@
    "source": [
     "## Test set results\n",
     "\n",
-    "Performance reached at ~1.5B tokens (~1,500 steps in the current 32 kb sequence setup with batch_size=32)\n",
-    "\n",
     "**Hardware configuration**: These results were obtained on an **H100 GPU with 16 workers** for data loading, in approximately **10 minutes** of training.\n",
     "\n",
+    "Performance reached at ~1.5B tokens (~1,500 steps in the current 32 kb sequence setup with batch_size=32)\n",
+    "\n",
     "Mean Pearson: 0.5835\n",
     "- ENCSR325NFE/pearson: 0.6081\n",
     "- ENCSR962OTG/pearson: 0.7286\n",
 