Add HuggingFace token auth support for gated datasets
Add hf_token field to DataConfig and pass it to load_dataset() in
PackedStreamingDataset, ValidationDataset, and MixedStreamingDataset
to support gated datasets such as bigcode/starcoderdata. Update
07_cpt_code.ipynb with a login() cell and restructured config cell.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- llm_lab/config/data_config.py +5 -0
- llm_lab/data/dataset.py +3 -0
- notebooks/07_cpt_code.ipynb +9 -6
llm_lab/config/data_config.py
CHANGED
|
@@ -48,6 +48,11 @@ class DataConfig:
|
|
| 48 |
"""Sampling weight for each dataset (primary first, then mix_datasets in order).
|
| 49 |
Must sum to 1.0. Example: [0.2, 0.8] means 20% primary + 80% secondary."""
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
@classmethod
|
| 52 |
def code_cpt(cls) -> "DataConfig":
|
| 53 |
"""Data config for Code CPT (Continual Pre-Training).
|
|
|
|
| 48 |
"""Sampling weight for each dataset (primary first, then mix_datasets in order).
|
| 49 |
Must sum to 1.0. Example: [0.2, 0.8] means 20% primary + 80% secondary."""
|
| 50 |
|
| 51 |
+
# ── Authentication ──
|
| 52 |
+
hf_token: Optional[str] = None
|
| 53 |
+
"""HuggingFace token for accessing gated datasets (e.g. bigcode/starcoderdata).
|
| 54 |
+
If None, falls back to the cached token from huggingface_hub.login() or HF_TOKEN env var."""
|
| 55 |
+
|
| 56 |
@classmethod
|
| 57 |
def code_cpt(cls) -> "DataConfig":
|
| 58 |
"""Data config for Code CPT (Continual Pre-Training).
|
llm_lab/data/dataset.py
CHANGED
|
@@ -64,6 +64,7 @@ class PackedStreamingDataset(IterableDataset):
|
|
| 64 |
split=self.split,
|
| 65 |
streaming=True, # Key: streaming mode
|
| 66 |
trust_remote_code=True,
|
|
|
|
| 67 |
)
|
| 68 |
|
| 69 |
# Full partitioning (sharding): worker i processes only 1/num_shards of the stream
|
|
@@ -186,6 +187,7 @@ class ValidationDataset:
|
|
| 186 |
split=self.config.dataset_split,
|
| 187 |
streaming=True,
|
| 188 |
trust_remote_code=True,
|
|
|
|
| 189 |
)
|
| 190 |
# Use a different seed and skip the beginning to avoid overlap with training data
|
| 191 |
ds = ds.shuffle(seed=seed, buffer_size=5_000)
|
|
@@ -289,6 +291,7 @@ class MixedStreamingDataset(IterableDataset):
|
|
| 289 |
split=spec["split"],
|
| 290 |
streaming=True,
|
| 291 |
trust_remote_code=True,
|
|
|
|
| 292 |
)
|
| 293 |
if num_shards > 1:
|
| 294 |
ds = ds.shard(num_shards=num_shards, index=shard_index)
|
|
|
|
| 64 |
split=self.split,
|
| 65 |
streaming=True, # Key: streaming mode
|
| 66 |
trust_remote_code=True,
|
| 67 |
+
token=self.config.hf_token,
|
| 68 |
)
|
| 69 |
|
| 70 |
# Full partitioning (sharding): worker i processes only 1/num_shards of the stream
|
|
|
|
| 187 |
split=self.config.dataset_split,
|
| 188 |
streaming=True,
|
| 189 |
trust_remote_code=True,
|
| 190 |
+
token=self.config.hf_token,
|
| 191 |
)
|
| 192 |
# Use a different seed and skip the beginning to avoid overlap with training data
|
| 193 |
ds = ds.shuffle(seed=seed, buffer_size=5_000)
|
|
|
|
| 291 |
split=spec["split"],
|
| 292 |
streaming=True,
|
| 293 |
trust_remote_code=True,
|
| 294 |
+
token=self.config.hf_token,
|
| 295 |
)
|
| 296 |
if num_shards > 1:
|
| 297 |
ds = ds.shard(num_shards=num_shards, index=shard_index)
|
notebooks/07_cpt_code.ipynb
CHANGED
|
@@ -55,13 +55,16 @@
|
|
| 55 |
]
|
| 56 |
},
|
| 57 |
{
|
| 58 |
-
"cell_type": "
|
|
|
|
| 59 |
"metadata": {},
|
| 60 |
-
"
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
| 65 |
},
|
| 66 |
{
|
| 67 |
"cell_type": "code",
|
|
|
|
| 55 |
]
|
| 56 |
},
|
| 57 |
{
|
| 58 |
+
"cell_type": "code",
|
| 59 |
+
"source": "# HuggingFace authentication — required for bigcode/starcoderdata (gated dataset)\n# Visit https://huggingface.co/bigcode/starcoderdata and accept the license first.\nfrom huggingface_hub import login\nlogin() # Enter your HuggingFace token when prompted (starts with \"hf_\")",
|
| 60 |
"metadata": {},
|
| 61 |
+
"execution_count": null,
|
| 62 |
+
"outputs": []
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"cell_type": "code",
|
| 66 |
+
"metadata": {},
|
| 67 |
+
"source": "# --- Model configuration (same architecture as pretraining) ---\nmodel_config = ModelConfig.base_1b()\n\n# --- Data configuration (80% code + 20% general) ---\ndata_config = DataConfig.code_cpt()\ndata_config.max_seq_len = model_config.max_seq_len\ndata_config.batch_size = 4\n# Optional: set token explicitly instead of login() above\n# data_config.hf_token = \"hf_your_token_here\"\n\n# --- Training configuration (lower LR, fresh optimizer) ---\ntrain_config = TrainConfig.code_cpt_1b()\ntrain_config.wandb_dir = \"/content/drive/MyDrive/wandb_logs\"\n\n# --- Path to the pretrained base checkpoint ---\nPRETRAINED_CKPT_DIR = \"/content/drive/MyDrive/llm-1b-lab/checkpoints\"\n\nprint(f\"Effective batch size: {train_config.effective_batch_size}\")\nprint(f\"Total CPT steps: {train_config.total_steps:,}\")\nprint(f\"Estimated CPT tokens: {train_config.total_steps * train_config.effective_batch_size * model_config.max_seq_len / 1e9:.1f}B\")\nprint(f\"Peak LR: {train_config.learning_rate}\")\nprint(f\"Data mix: {data_config.mix_weights}\")"
|
| 68 |
},
|
| 69 |
{
|
| 70 |
"cell_type": "code",
|