victor70 committed on
Commit
f6260f0
·
verified ·
1 Parent(s): c8ce961

Add training config

Browse files
Files changed (1) hide show
  1. config.yaml +67 -0
config.yaml ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Exp7 Phase 1: Data Quality Check (117M + 2B tokens)
2
+ # Validate data quality via loss curves
3
+
4
+ model:
5
+ d_model: 768
6
+ n_layers: 12
7
+ vocab_size: 32000
8
+ n_heads: 12
9
+ n_kv_heads: 3
10
+ ff_mult: 3
11
+ max_seq_len: 1024
12
+
13
+ training:
14
+ # Optimizer
15
+ weight_decay: 0.1
16
+ grad_clip: 1.0
17
+
18
+ # Learning Rate
19
+ peak_lr: 5.0e-4
20
+ min_lr: 5.0e-5
21
+ warmup_steps: 1000
22
+
23
+ # Regularization
24
+ dropout: 0.1
25
+ label_smoothing: 0.05
26
+
27
+ # Batch Size - 8 GPUs * 8 per-GPU batch * 2 grad-accum steps = 128 effective batch
28
+ batch_size: 8
29
+ grad_accum_steps: 2
30
+ max_length: 1024
31
+
32
+ # Training - 1.29B tokens / 128 batch / 1024 seq = ~10000 steps (NOTE(review): header and phase.total_tokens say 2B; 10000 steps covers ~1.31B tokens - confirm)
33
+ max_steps: 10000
34
+ save_steps: 2000
35
+ eval_steps: 500
36
+ log_steps: 100
37
+
38
+ # Gradient checkpointing (separate from checkpoint saving, see save_steps)
39
+ gradient_checkpointing: true
40
+
41
+ data:
42
+ path: data/processed_exp7_phase1
43
+ # Mix ratios
44
+ korean_ratio: 0.50
45
+ english_ratio: 0.30
46
+ math_ratio: 0.15
47
+ code_ratio: 0.05
48
+
49
+ tokenizer:
50
+ vocab_size: 32000
51
+ model_type: unigram
52
+ character_coverage: 0.9995
53
+
54
+ distributed:
55
+ enabled: true
56
+ world_size: 8
57
+ backend: nccl
58
+
59
+ # Phase 1 specific settings
60
+ phase:
61
+ name: "data_quality_check"
62
+ total_tokens: "2B"
63
+ analysis:
64
+ - "loss_curve_by_domain"
65
+ - "perplexity_tracking"
66
+ - "data_mix_optimization"
67
+ - "benchmark_evaluation"