ASTERIZER
/

LUNA-Training

Model card Files Files and versions

xet

Community

ASTERIZER committed on Apr 2

Commit

060646a

verified ·

1 Parent(s): 5ea0407

Upload train_config.yaml with huggingface_hub

Browse files

Files changed (1) hide show

train_config.yaml +67 -0

train_config.yaml ADDED Viewed

	@@ -0,0 +1,67 @@

+# ─────────────────────────────────────────────────────────────────────────────
+#  LUNA 100M — Training Configuration
+#  Single source of truth for all hyperparameters.
+#
+#  auto_config: true   → All batch/LR/worker settings are auto-detected from
+#                        available VRAM / RAM / CPU at runtime. Your values below
+#                        are used as FALLBACKS only if detection fails.
+#
+#  auto_config: false  → Every value below is used as-is. Nothing is overridden.
+#                        Use this when you've already benchmarked and want
+#                        repeatable, fixed runs.
+# ─────────────────────────────────────────────────────────────────────────────
+auto_config: true   # true = batch/LR/worker settings auto-detected at runtime (values below act as fallbacks); flip to false to lock everything below
+# ── Data ──────────────────────────────────────────────────────────────────────
+data_path:     "Base/data/litdata_pretrain_final"   # local default dataset path; overridden by --data_path
+out_dir:       "out/pretrain/luna-100m"   # output directory for this run (presumably checkpoints and logs — confirm against the training script)
+tokenizer_dir: "Base/checkpoints/EleutherAI/pythia-160m"   # tokenizer location; the pythia tokenizer that model.vocab_size is derived from
+# ── Model (fixed for LUNA-100M — do not change) ───────────────────────────────
+model:   # NOTE(review): field meanings below are inferred from the key names — confirm against the model code
+  vocab_size: 50304      # ceil(50277/128)*128 = 393*128 — pythia tokenizer with EOS padding
+  seq_len:    1024       # sequence length in tokens
+  n_layer:    10         # number of layers
+  n_embd:     768        # embedding width
+  n_head:     12         # attention heads; 768 / 12 = 64 per head if heads split n_embd evenly
+# ── Training budget ───────────────────────────────────────────────────────────
+train:
+  max_tokens:       4515286950   # ≈4.5B tokens — full dataset (verified from index.json, 270 chunks)
+  lr_warmup_steps:  500          # fallback/manual value; [AUTO] scaled to 5% of total_steps if auto_config
+  save_interval:    1000         # save checkpoint every N optimizer steps
+  log_interval:     10           # print log every N steps
+  max_norm:         1.0          # gradient clip norm
+# ── Optimiser ─────────────────────────────────────────────────────────────────
+optimizer:   # NOTE(review): betas/eps/weight_decay suggest an Adam-family optimizer — confirm which one the script builds
+  lr:           0.0006    # 6e-4  [AUTO] scaled by sqrt(global_batch/120) if auto_config
+  min_lr:       0.00006   # 6e-5 (= lr / 10); presumably the floor of the LR schedule — confirm
+  weight_decay: 0.1       # weight-decay coefficient
+  betas:        [0.9, 0.95]   # [beta1, beta2]
+  eps:          1.0e-8    # keep the decimal point and signed exponent: YAML 1.1 loaders such as PyYAML load a bare 1e-8 as a string
+# ── Batch sizing ──────────────────────────────────────────────────────────────
+# When auto_config: true  → micro_batch and grad_accum are ignored; the script
+#                           probes VRAM and fills it to ~82% saturation, then
+#                           computes grad_accum to hit global_batch.
+# When auto_config: false → micro_batch × grad_accum must equal global_batch.
+batch:
+  global_batch: 120     # target total samples per optimizer step (manual values below satisfy 12 × 10 = 120)
+  micro_batch:  12      # [MANUAL] samples per GPU forward pass (ignored when auto)
+  grad_accum:   10      # [MANUAL] accumulation steps      (ignored when auto)
+# ── DataLoader ────────────────────────────────────────────────────────────────
+# When auto_config: true  → num_workers auto = cpu_cores // 2, capped by RAM
+# When auto_config: false → num_workers used as-is
+dataloader:
+  num_workers:  -1      # -1 = auto; set to 0 to disable multiprocessing. NOTE(review): with auto_config: false this is documented as "used as-is" — confirm the script still resolves -1, or set an explicit count >= 0
+  pin_memory:   true    # [AUTO] disabled if RAM < 16GB
+# ── Hardware / precision ──────────────────────────────────────────────────────
+# When auto_config: true  → precision detected from GPU compute capability
+# When auto_config: false → use the value below
+hardware:
+  precision:    "bf16"  # bf16 | fp16 | fp32 — used as-is only when auto_config is false; otherwise detected from the GPU
+  compile:      true    # torch.compile (requires Triton — Linux/cloud only)