Vjeong Claude Sonnet 4.6 committed on
Commit
1a63fd0
·
1 Parent(s): 3535442

Add HuggingFace token auth support for gated datasets

Browse files

Add hf_token field to DataConfig and pass it to load_dataset() in
PackedStreamingDataset, ValidationDataset, and MixedStreamingDataset
to support gated datasets such as bigcode/starcoderdata. Update
07_cpt_code.ipynb with a login() cell and restructured config cell.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

llm_lab/config/data_config.py CHANGED
@@ -48,6 +48,11 @@ class DataConfig:
48
  """Sampling weight for each dataset (primary first, then mix_datasets in order).
49
  Must sum to 1.0. Example: [0.2, 0.8] means 20% primary + 80% secondary."""
50
 
 
 
 
 
 
51
  @classmethod
52
  def code_cpt(cls) -> "DataConfig":
53
  """Data config for Code CPT (Continual Pre-Training).
 
48
  """Sampling weight for each dataset (primary first, then mix_datasets in order).
49
  Must sum to 1.0. Example: [0.2, 0.8] means 20% primary + 80% secondary."""
50
 
51
+ # ── Authentication ──
52
+ hf_token: Optional[str] = None
53
+ """HuggingFace token for accessing gated datasets (e.g. bigcode/starcoderdata).
54
+ If None, falls back to the cached token from huggingface_hub.login() or HF_TOKEN env var."""
55
+
56
  @classmethod
57
  def code_cpt(cls) -> "DataConfig":
58
  """Data config for Code CPT (Continual Pre-Training).
llm_lab/data/dataset.py CHANGED
@@ -64,6 +64,7 @@ class PackedStreamingDataset(IterableDataset):
64
  split=self.split,
65
  streaming=True, # Key: streaming mode
66
  trust_remote_code=True,
 
67
  )
68
 
69
  # Full partitioning (sharding): worker i processes only 1/num_shards of the stream
@@ -186,6 +187,7 @@ class ValidationDataset:
186
  split=self.config.dataset_split,
187
  streaming=True,
188
  trust_remote_code=True,
 
189
  )
190
  # Use a different seed and skip the beginning to avoid overlap with training data
191
  ds = ds.shuffle(seed=seed, buffer_size=5_000)
@@ -289,6 +291,7 @@ class MixedStreamingDataset(IterableDataset):
289
  split=spec["split"],
290
  streaming=True,
291
  trust_remote_code=True,
 
292
  )
293
  if num_shards > 1:
294
  ds = ds.shard(num_shards=num_shards, index=shard_index)
 
64
  split=self.split,
65
  streaming=True, # Key: streaming mode
66
  trust_remote_code=True,
67
+ token=self.config.hf_token,
68
  )
69
 
70
  # Full partitioning (sharding): worker i processes only 1/num_shards of the stream
 
187
  split=self.config.dataset_split,
188
  streaming=True,
189
  trust_remote_code=True,
190
+ token=self.config.hf_token,
191
  )
192
  # Use a different seed and skip the beginning to avoid overlap with training data
193
  ds = ds.shuffle(seed=seed, buffer_size=5_000)
 
291
  split=spec["split"],
292
  streaming=True,
293
  trust_remote_code=True,
294
+ token=self.config.hf_token,
295
  )
296
  if num_shards > 1:
297
  ds = ds.shard(num_shards=num_shards, index=shard_index)
notebooks/07_cpt_code.ipynb CHANGED
@@ -55,13 +55,16 @@
55
  ]
56
  },
57
  {
58
- "cell_type": "markdown",
 
59
  "metadata": {},
60
- "source": [
61
- "## 1. Configuration\n",
62
- "\n",
63
- "Use the CPT presets, which set appropriate LR, data mixing, and checkpoint paths."
64
- ]
 
 
65
  },
66
  {
67
  "cell_type": "code",
 
55
  ]
56
  },
57
  {
58
+ "cell_type": "code",
59
+ "source": "# HuggingFace authentication — required for bigcode/starcoderdata (gated dataset)\n# Visit https://huggingface.co/bigcode/starcoderdata and accept the license first.\nfrom huggingface_hub import login\nlogin() # Enter your HuggingFace token when prompted (starts with \"hf_\")",
60
  "metadata": {},
61
+ "execution_count": null,
62
+ "outputs": []
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "metadata": {},
67
+ "source": "# --- Model configuration (same architecture as pretraining) ---\nmodel_config = ModelConfig.base_1b()\n\n# --- Data configuration (80% code + 20% general) ---\ndata_config = DataConfig.code_cpt()\ndata_config.max_seq_len = model_config.max_seq_len\ndata_config.batch_size = 4\n# Optional: set token explicitly instead of login() above\n# data_config.hf_token = \"hf_your_token_here\"\n\n# --- Training configuration (lower LR, fresh optimizer) ---\ntrain_config = TrainConfig.code_cpt_1b()\ntrain_config.wandb_dir = \"/content/drive/MyDrive/wandb_logs\"\n\n# --- Path to the pretrained base checkpoint ---\nPRETRAINED_CKPT_DIR = \"/content/drive/MyDrive/llm-1b-lab/checkpoints\"\n\nprint(f\"Effective batch size: {train_config.effective_batch_size}\")\nprint(f\"Total CPT steps: {train_config.total_steps:,}\")\nprint(f\"Estimated CPT tokens: {train_config.total_steps * train_config.effective_batch_size * model_config.max_seq_len / 1e9:.1f}B\")\nprint(f\"Peak LR: {train_config.learning_rate}\")\nprint(f\"Data mix: {data_config.mix_weights}\")"
68
  },
69
  {
70
  "cell_type": "code",