Add HuggingFace token auth support for gated datasets
Add hf_token field to DataConfig and pass it to load_dataset() in
PackedStreamingDataset, ValidationDataset, and MixedStreamingDataset
to support gated datasets such as bigcode/starcoderdata. Update
07_cpt_code.ipynb with a login() cell and restructured config cell.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- llm_lab/config/data_config.py +5 -0
- llm_lab/data/dataset.py +3 -0
- notebooks/07_cpt_code.ipynb +9 -6
llm_lab/config/data_config.py
CHANGED
|
@@ -48,6 +48,11 @@ class DataConfig:
|
|
| 48 |
"""Sampling weight for each dataset (primary first, then mix_datasets in order).
|
| 49 |
Must sum to 1.0. Example: [0.2, 0.8] means 20% primary + 80% secondary."""
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
@classmethod
|
| 52 |
def code_cpt(cls) -> "DataConfig":
|
| 53 |
"""Data config for Code CPT (Continual Pre-Training).
|
|
|
|
| 48 |
"""Sampling weight for each dataset (primary first, then mix_datasets in order).
|
| 49 |
Must sum to 1.0. Example: [0.2, 0.8] means 20% primary + 80% secondary."""
|
| 50 |
|
| 51 |
+
# ── Authentication ──
|
| 52 |
+
hf_token: Optional[str] = None
|
| 53 |
+
"""HuggingFace token for accessing gated datasets (e.g. bigcode/starcoderdata).
|
| 54 |
+
If None, falls back to the cached token from huggingface_hub.login() or HF_TOKEN env var."""
|
| 55 |
+
|
| 56 |
@classmethod
|
| 57 |
def code_cpt(cls) -> "DataConfig":
|
| 58 |
"""Data config for Code CPT (Continual Pre-Training).
|
llm_lab/data/dataset.py
CHANGED
|
@@ -64,6 +64,7 @@ class PackedStreamingDataset(IterableDataset):
|
|
| 64 |
split=self.split,
|
| 65 |
streaming=True, # Key: streaming mode
|
| 66 |
trust_remote_code=True,
|
|
|
|
| 67 |
)
|
| 68 |
|
| 69 |
# Full partitioning (sharding): worker i processes only 1/num_shards of the stream
|
|
@@ -186,6 +187,7 @@ class ValidationDataset:
|
|
| 186 |
split=self.config.dataset_split,
|
| 187 |
streaming=True,
|
| 188 |
trust_remote_code=True,
|
|
|
|
| 189 |
)
|
| 190 |
# Use a different seed and skip the beginning to avoid overlap with training data
|
| 191 |
ds = ds.shuffle(seed=seed, buffer_size=5_000)
|
|
@@ -289,6 +291,7 @@ class MixedStreamingDataset(IterableDataset):
|
|
| 289 |
split=spec["split"],
|
| 290 |
streaming=True,
|
| 291 |
trust_remote_code=True,
|
|
|
|
| 292 |
)
|
| 293 |
if num_shards > 1:
|
| 294 |
ds = ds.shard(num_shards=num_shards, index=shard_index)
|
|
|
|
| 64 |
split=self.split,
|
| 65 |
streaming=True, # Key: streaming mode
|
| 66 |
trust_remote_code=True,
|
| 67 |
+
token=self.config.hf_token,
|
| 68 |
)
|
| 69 |
|
| 70 |
# Full partitioning (sharding): worker i processes only 1/num_shards of the stream
|
|
|
|
| 187 |
split=self.config.dataset_split,
|
| 188 |
streaming=True,
|
| 189 |
trust_remote_code=True,
|
| 190 |
+
token=self.config.hf_token,
|
| 191 |
)
|
| 192 |
# Use a different seed and skip the beginning to avoid overlap with training data
|
| 193 |
ds = ds.shuffle(seed=seed, buffer_size=5_000)
|
|
|
|
| 291 |
split=spec["split"],
|
| 292 |
streaming=True,
|
| 293 |
trust_remote_code=True,
|
| 294 |
+
token=self.config.hf_token,
|
| 295 |
)
|
| 296 |
if num_shards > 1:
|
| 297 |
ds = ds.shard(num_shards=num_shards, index=shard_index)
|
notebooks/07_cpt_code.ipynb
CHANGED
|
@@ -55,13 +55,16 @@
|
|
| 55 |
]
|
| 56 |
},
|
| 57 |
{
|
| 58 |
-
"cell_type": "
|
|
|
|
| 59 |
"metadata": {},
|
| 60 |
-
"
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
| 65 |
},
|
| 66 |
{
|
| 67 |
"cell_type": "code",
|
|
|
|
| 55 |
]
|
| 56 |
},
|
| 57 |
{
|
| 58 |
+
"cell_type": "code",
|
| 59 |
+
"source": "# HuggingFace authentication — required for bigcode/starcoderdata (gated dataset)\n# Visit https://huggingface.co/bigcode/starcoderdata and accept the license first.\nfrom huggingface_hub import login\nlogin() # Enter your HuggingFace token when prompted (starts with \"hf_\")",
|
| 60 |
"metadata": {},
|
| 61 |
+
"execution_count": null,
|
| 62 |
+
"outputs": []
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"cell_type": "code",
|
| 66 |
+
"metadata": {},
|
| 67 |
+
"source": "# --- Model configuration (same architecture as pretraining) ---\nmodel_config = ModelConfig.base_1b()\n\n# --- Data configuration (80% code + 20% general) ---\ndata_config = DataConfig.code_cpt()\ndata_config.max_seq_len = model_config.max_seq_len\ndata_config.batch_size = 4\n# Optional: set token explicitly instead of login() above\n# data_config.hf_token = \"hf_your_token_here\"\n\n# --- Training configuration (lower LR, fresh optimizer) ---\ntrain_config = TrainConfig.code_cpt_1b()\ntrain_config.wandb_dir = \"/content/drive/MyDrive/wandb_logs\"\n\n# --- Path to the pretrained base checkpoint ---\nPRETRAINED_CKPT_DIR = \"/content/drive/MyDrive/llm-1b-lab/checkpoints\"\n\nprint(f\"Effective batch size: {train_config.effective_batch_size}\")\nprint(f\"Total CPT steps: {train_config.total_steps:,}\")\nprint(f\"Estimated CPT tokens: {train_config.total_steps * train_config.effective_batch_size * model_config.max_seq_len / 1e9:.1f}B\")\nprint(f\"Peak LR: {train_config.learning_rate}\")\nprint(f\"Data mix: {data_config.mix_weights}\")"
|
| 68 |
},
|
| 69 |
{
|
| 70 |
"cell_type": "code",
|