ASTERIZER committed on
Commit
060646a
·
verified ·
1 Parent(s): 5ea0407

Upload train_config.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_config.yaml +67 -0
train_config.yaml ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ─────────────────────────────────────────────────────────────────────────────
2
+ # LUNA 100M — Training Configuration
3
+ # Single source of truth for all hyperparameters.
4
+ #
5
+ # auto_config: true → All batch/LR/worker settings are auto-detected from
6
+ # available VRAM / RAM / CPU at runtime. Your values below
7
+ # are used as FALLBACKS only if detection fails.
8
+ #
9
+ # auto_config: false → Every value below is used as-is. Nothing is overridden.
10
+ # Use this when you've already benchmarked and want
11
+ # repeatable, fixed runs.
12
+ # ─────────────────────────────────────────────────────────────────────────────
13
+
14
+ auto_config: true # ← flip to false to lock everything below
15
+
16
+ # ── Data ──────────────────────────────────────────────────────────────────────
17
+ data_path: "Base/data/litdata_pretrain_final" # local default; overridden by --data_path
18
+ out_dir: "out/pretrain/luna-100m"
19
+ tokenizer_dir: "Base/checkpoints/EleutherAI/pythia-160m"
20
+
21
+ # ── Model (fixed for LUNA-100M — do not change) ───────────────────────────────
22
+ model:
23
+ vocab_size: 50304 # ceil(50277/128)*128 — pythia tokenizer with EOS padding
24
+ seq_len: 1024
25
+ n_layer: 10
26
+ n_embd: 768
27
+ n_head: 12
28
+
29
+ # ── Training budget ───────────────────────────────────────────────────────────
30
+ train:
31
+ max_tokens: 4515286950 # full dataset (verified from index.json, 270 chunks)
32
+ lr_warmup_steps: 500 # [AUTO] scaled to 5% of total_steps if auto_config
33
+ save_interval: 1000 # save checkpoint every N optimizer steps
34
+ log_interval: 10 # print log every N steps
35
+ max_norm: 1.0 # gradient clip norm
36
+
37
+ # ── Optimiser ─────────────────────────────────────────────────────────────────
38
+ optimizer:
39
+ lr: 0.0006 # 6e-4 [AUTO] scaled by sqrt(global_batch/120) if auto_config
40
+ min_lr: 0.00006 # 6e-5
41
+ weight_decay: 0.1
42
+ betas: [0.9, 0.95]
43
+ eps: 1.0e-8
44
+
45
+ # ── Batch sizing ──────────────────────────────────────────────────────────────
46
+ # When auto_config: true → micro_batch and grad_accum are ignored; the script
47
+ # probes VRAM and fills it to ~82% saturation, then
48
+ # computes grad_accum to hit global_batch.
49
+ # When auto_config: false → micro_batch × grad_accum must equal global_batch.
50
+ batch:
51
+ global_batch: 120 # target total samples per optimizer step
52
+ micro_batch: 12 # [MANUAL] samples per GPU forward pass (ignored when auto)
53
+ grad_accum: 10 # [MANUAL] accumulation steps (ignored when auto)
54
+
55
+ # ── DataLoader ────────────────────────────────────────────────────────────────
56
+ # When auto_config: true → num_workers auto = cpu_cores // 2, capped by RAM
57
+ # When auto_config: false → num_workers used as-is
58
+ dataloader:
59
+ num_workers: -1 # -1 = auto; set to 0 to disable multiprocessing
60
+ pin_memory: true # [AUTO] disabled if RAM < 16GB
61
+
62
+ # ── Hardware / precision ──────────────────────────────────────────────────────
63
+ # When auto_config: true → precision detected from GPU compute capability
64
+ # When auto_config: false → use the value below
65
+ hardware:
66
+ precision: "bf16" # bf16 | fp16 | fp32
67
+ compile: true # torch.compile (requires Triton — Linux/cloud only)