diff --git "a/notebooks_tutorials/02_fine_tuning.ipynb" "b/notebooks_tutorials/02_fine_tuning.ipynb" --- "a/notebooks_tutorials/02_fine_tuning.ipynb" +++ "b/notebooks_tutorials/02_fine_tuning.ipynb" @@ -15,10 +15,9 @@ "- **Constant learning rate**: Uses a fixed learning rate throughout training without learning rate scheduling\n", "- **No gradient accumulation**: Implements simple step-based training without gradient accumulation, making the training loop more straightforward\n", "\n", - "**⚡ Key Advantage**: This simplified pipeline achieves close performance to more complex training approaches while enabling fast fine-tuning: on a H100 GPU and using 16 workers for data loading, it takes ~10min to reach acceptable performances for a 32kb functional tracks prediction task. The training speed benefits from the efficient NTv3 model architecture, but of course depends on your hardware capabilities (GPU acceleration and multi-worker data loading significantly reduce training time).\n", - "\n", - "**⚠️ Important Note on Hardware Requirements**: If the pipeline is designed to run on limited resources (e.g., Google Colab with a T4 GPU and 2CPUs), the timing mentioned was obtained on an **H100 GPU with 16 CPUs**. If you want to reach similar performance levels, you should be aware that you'll need **significant hardware resources** (high-end GPUs with substantial memory and multiple data loading workers). Training times will vary significantly based on your hardware configuration.\n", + "**⚡ Key Advantage**: This simplified pipeline achieves close performance to more complex training approaches while enabling fast fine-tuning: on a H100 GPU and using 16 workers for data loading, it takes ~15min to reach acceptable performances for a 32kb functional tracks prediction task on **NTv3_8M_pre** model. The training speed benefits from the efficient NTv3 model architecture, but of course depends on your hardware capabilities (GPU acceleration and multi-worker data loading significantly reduce training time).\n", "\n", + "**⚠️ Important Note on Hardware Requirements**: While this pipeline is designed to run on limited resources (e.g., Google Colab with a T4 GPU and 2CPUs), the mentioned training time or displayed performances (see **Test evaluation** section) was obtained on a more powerful setup. If you want to reach similar performance levels, you should be aware that you'll need **significant hardware resources** (high-end GPUs with substantial memory and multiple data loading workers). Training times will vary significantly based on your hardware configuration.\n", "\n", "The pipeline walks through the complete fine-tuning workflow:\n", "- Loading genomic FASTA files sequences and their corresponding BigWig signal tracks from Hugging Face dataset\n", @@ -27,7 +26,7 @@ "- Implementing a training loop with appropriate loss functions and evaluation metrics\n", "- Evaluation of the fine-tuned model on the test set\n", "\n", - "This provides a clean interface for training and evaluation.\n", + "This provides a clean interface for fine-tuning and evaluation.\n", "\n", "The model architecture consists of a pre-trained NTv3 backbone that processes DNA sequences and a custom linear head that predicts BigWig signal values at single-nucleotide resolution. Predictions are center-cropped to focus on the central portion of the input sequence (configurable via `keep_target_center_fraction`), which helps reduce edge effects from sequence context windows. 
The training uses a Poisson-Multinomial loss function that captures both the scale and shape of the signal distributions, and evaluation is performed using Pearson correlation metrics on both scaled and raw predictions.\n", "\n", @@ -50,7 +49,7 @@ "outputs": [], "source": [ "# Install dependencies\n", - "!pip install pyfaidx pyBigWig torchmetrics transformers plotly" + "!pip install pyfaidx pyBigWig torchmetrics transformers" ] }, { @@ -69,7 +68,6 @@ "\n", "# Third-party imports\n", "from huggingface_hub import HfApi, snapshot_download\n", - "from IPython.display import display\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", @@ -82,7 +80,7 @@ "from torch.utils.data import DataLoader, Dataset\n", "from torchmetrics import PearsonCorrCoef\n", "from transformers import AutoConfig, AutoModelForMaskedLM, AutoTokenizer\n", - "from tqdm import tqdm" + "from tqdm.auto import tqdm" ] }, { @@ -90,6 +88,15 @@ "metadata": {}, "source": [ "# 1. ⚙️ Configuration\n", + " \n", + "💡 **Tip:** The parameters below are pre-configured for minimal requirements and are suitable for running on a Colab GPU, but this may come at the cost of reduced model performance or slower training. \n", + " \n", + "Feel free to experiment with these parameters according to your available resources:\n", + "- If you have a more powerful GPU, **increase** `batch_size`, `learning_rate`, and `num_steps_training` for better performance and more robust training results.\n", + "- To speed up training (especially during data loading), consider increasing the `num_workers` value if memory and CPU resources allow.\n", + "\n", + "Current configuration allow to reach decent performances and completes training in ~1h30 on a colab environment with one T4 GPU and 2CPUs. 
\n", + "\n", "\n", "## Configuration Parameters\n", "\n", @@ -127,7 +134,15 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using device: cuda\n" + ] + } + ], "source": [ "config = {\n", " # Model\n", @@ -139,14 +154,12 @@ " \"data_cache_dir\": \"./data\",\n", " \"sequence_length\": 32_768,\n", " \"keep_target_center_fraction\": 0.375,\n", - " \"bigwig_file_ids\": [\n", - " \"ENCSR325NFE\", \"ENCSR962OTG\", \"ENCSR619DQO_P\", \"ENCSR619DQO_M\"\n", - " ], # If None, will use all available tracks for selected species\n", + " \"bigwig_file_ids\": None, # If None, will use all available tracks for selected species\n", " \n", " # Training\n", - " \"batch_size\": 12,\n", + " \"batch_size\": 8,\n", " \"num_steps_training\": 2000, # Consider increasing for improving training performance\n", - " \"log_every_n_steps\": 20,\n", + " \"log_every_n_steps\": 50,\n", " \"learning_rate\": 1e-5,\n", " \"weight_decay\": 0.01,\n", " \n", @@ -191,7 +204,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -200,7 +213,7 @@ " data_cache_dir: str | Path = \"data\",\n", " hf_repo_id: str = \"InstaDeepAI/NTv3_benchmark_dataset\",\n", " bigwig_file_ids: list[str] | None = None,\n", - ") -> tuple[str, list[str], list[str]]:\n", + ") -> tuple[str, list[str], list[str], pd.DataFrame, pd.DataFrame]:\n", " \"\"\"\n", " Downloads:\n", " 1) FASTA from HF dataset under: /genome.fasta\n", @@ -273,11 +286,11 @@ " bigwig_dir = local_dir / species / \"functional_tracks\"\n", " \n", " if bigwig_file_ids is not None:\n", - " bigwig_paths = [bigwig_dir / f\"{file_id}.bigwig\" for file_id in bigwig_file_ids]\n", + " bigwig_paths = [str(bigwig_dir / f\"{file_id}.bigwig\") for file_id in bigwig_file_ids]\n", " bigwig_ids = bigwig_file_ids\n", " else:\n", " # Find all downloaded BigWig files\n", - " bigwig_paths = [bigwig_file for bigwig_file in bigwig_dir.glob(\"*.bigwig\")]\n", + " bigwig_paths = [str(bigwig_file) for bigwig_file in bigwig_dir.glob(\"*.bigwig\")]\n", " bigwig_ids = [bigwig_file.stem for bigwig_file in bigwig_dir.glob(\"*.bigwig\")] \n", " \n", " # Splits file\n", @@ -312,9 +325,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a428c235721b4637a92dfb0b1a6648d1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Fetching 37 files: 0%| | 0/37 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# Plot training results\n", "fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n", @@ -1179,9 +2478,78 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running test evaluation with 312 steps (10000 samples)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1b3a925603fb4be5bc15c337654b4056", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Test evaluation: 0%| | 0/312 [00:00 **Note:** Results may vary depending on:\n", + "> - Configuration hyperparameters\n", + "> - Hardware configuration\n", + "> - Number of tracks used (subset vs. 
all available functional tracks)\n", + "> - Training duration\n", + "> \n", + "> For reference, NTv3-benchmark models on human data were trained for **20.9B tokens** before test set evaluation and achieved better performance than reported here." ] }, { @@ -1235,9 +2658,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.12 (ntv3-env)", + "display_name": ".venv", "language": "python", - "name": "ntv3-env" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -1249,7 +2672,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.11.14" } }, "nbformat": 4,
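
The notebook markdown in this diff describes a Poisson-Multinomial training objective and center-cropping of predictions via `keep_target_center_fraction`, but the cells implementing them are not part of the patch. Below is a rough, self-contained sketch of the idea only, not the notebook's actual implementation: the function names, the `total_weight` weighting, and the `(batch, length, n_tracks)` tensor shapes are assumptions made for illustration.

```python
# Hedged sketch of center-cropping plus a Poisson-Multinomial style loss.
# Assumes non-negative predictions/targets of shape (batch, length, n_tracks);
# the notebook's real loss and cropping code may differ.
import torch
import torch.nn.functional as F


def center_crop(x: torch.Tensor, keep_fraction: float = 0.375) -> torch.Tensor:
    """Keep only the central `keep_fraction` of the length dimension (dim=1)."""
    length = x.shape[1]
    keep = int(length * keep_fraction)
    start = (length - keep) // 2
    return x[:, start:start + keep, :]


def poisson_multinomial_loss(
    pred: torch.Tensor,    # non-negative predicted signal, e.g. softplus outputs
    target: torch.Tensor,  # non-negative observed signal, same shape as pred
    total_weight: float = 0.2,
    eps: float = 1e-7,
) -> torch.Tensor:
    """Poisson NLL on per-track totals (scale) + multinomial NLL on positions (shape)."""
    pred_total = pred.sum(dim=1) + eps      # (batch, n_tracks)
    target_total = target.sum(dim=1) + eps

    # Scale term: how much total signal is predicted for each track.
    poisson_nll = F.poisson_nll_loss(pred_total, target_total, log_input=False)

    # Shape term: how the signal is distributed along the cropped sequence.
    pred_dist = pred / pred_total.unsqueeze(1)            # sums to 1 over length
    multinomial_nll = -(target * torch.log(pred_dist + eps)).sum(dim=1).mean()

    return total_weight * poisson_nll + multinomial_nll


# Toy usage with the notebook's 32,768 bp inputs and keep_target_center_fraction=0.375.
pred = torch.rand(2, 32_768, 4)
target = torch.rand(2, 32_768, 4)
loss = poisson_multinomial_loss(center_crop(pred), center_crop(target))
print(loss.item())
```

In this formulation the Poisson term anchors the predicted total signal per track (the scale), while the multinomial term scores where along the cropped window that signal is placed (the shape), which is what the notebook's markdown means by capturing both the scale and shape of the signal distributions; the relative weighting between the two terms is a tunable assumption here.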