nicholasKluge committed on
Commit
0ffd80a
·
verified ·
1 Parent(s): a9d59e6

Create config_stage_1.yaml

Browse files
Files changed (1) hide show
  1. config_stage_1.yaml +102 -0
config_stage_1.yaml ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Directory settings
2
+ checkpoint_dir: "/lustre/scratch/data/polyglot_datasets/bengali/checkpoints/models/LilTii/v2"
3
+ train_dataset_dir:
4
+ # Total: ~99B
5
+ # Bengali Text (~40B: the five edu_score shards total ~20.1B and each is listed twice)
6
+ - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/edu_score_1" # 5.8B (ben)
7
+ - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/edu_score_2" # 8.6B (ben)
8
+ - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/edu_score_3" # 4.2B (ben)
9
+ - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/edu_score_4" # 1.5B (ben)
10
+ - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/edu_score_5" # 5.5M (ben)
11
+ - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/edu_score_1" # 5.8B (ben)
12
+ - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/edu_score_2" # 8.6B (ben)
13
+ - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/edu_score_3" # 4.2B (ben)
14
+ - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/edu_score_4" # 1.5B (ben)
15
+ - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/edu_score_5" # 5.5M (ben)
16
+ # Edu English Text (~35B)
17
+ - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/fineweb_edu/edu_score_3" # 35.0B (eng)
18
+ # Reasoning (~14.6B: math_meta_reasoning_filtered is listed twice, 2 x 1.2B)
19
+ - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/math_meta_reasoning_filtered" # 1.2B (eng)
20
+ - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/math_meta_reasoning_filtered" # 1.2B (eng)
21
+ - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/nvidia_openscience" # 9.8B (eng)
22
+ - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/big_reasoning_traces" # 2.4B (eng)
23
+ # Edu Math Text (~9.5B)
24
+ - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/finemath_34b/edu_score_4" # 8.5B (eng)
25
+ - "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/finemath_34b/edu_score_5" # 1.0B (eng)
26
+ val_dataset_dir: "/lustre/scratch/data/polyglot_datasets/bengali/tokenized/validation_split"
27
+ dataset_type: "parquet"
28
+ cache_dir: "/lustre/mlnvme/data/polyglot/.cache"
29
+
30
+ # Data loading settings
31
+ pin_memory: true
32
+ num_workers_for_dataloader: 32
33
+ shuffle_dataset: true
34
+
35
+ # Model architecture settings
36
+ vocab_size: 49152
37
+ num_hidden_layers: 28
38
+ num_attention_heads: 16
39
+ num_key_value_heads: 8
40
+ head_dim: null
41
+ hidden_size: 1536
42
+ intermediate_size: 3072
43
+ max_position_embeddings: 4096
44
+ tie_word_embeddings: true
45
+ hidden_act: "silu"
46
+ output_hidden_states: false
47
+ attn_implementation: "flash_attention_2"
48
+ use_cache: false
49
+ no_rope_layer_interval: null
50
+ rope_theta: 50000.0
51
+ rope_scale_factor: null
52
+ rms_norm_eps: 0.000001
53
+
54
+ # Training settings
55
+ total_batch_size: 2097152
56
+ micro_batch_size: 16
57
+ eval_micro_batch_size: 8
58
+ num_train_epochs: 1
59
+ warmup_steps: 2000
60
+ max_learning_rate: 0.0007
61
+ min_learning_rate: 0.0
62
+ weight_decay: 0.1
63
+ beta1: 0.9
64
+ beta2: 0.95
65
+ eps: 0.00000001
66
+ lr_decay_type: "wsd"
67
+ lr_decay_iters_coef: 0.0
68
+ seed: 1337
69
+ max_steps: 47500
70
+ max_grad_norm: 1.0
71
+
72
+ # Precision and optimization settings
73
+ torch_compile: false
74
+ mat_mul_precision: "highest"
75
+ tf32: true
76
+ bf16: true
77
+ gradient_checkpointing: false
78
+ use_liger_kernel: true
79
+ static_graph: false
80
+
81
+ # Hub settings
82
+ push_to_hub: false
83
+ hub_token: null
84
+ hub_model_id: null
85
+
86
+ # Tokenizer and reference model
87
+ tokenizer_name_or_path: "Polygl0t/LilTii-v0.2"
88
+ reference_model: "HuggingFaceTB/SmolLM2-360M"
89
+
90
+ # Checkpoint settings
91
+ resume_from_checkpoint: null
92
+ checkpointing_steps: 2500
93
+ begin_new_stage: false
94
+ stage_name: "Warmup-Stable"
95
+
96
+ # Miscellaneous settings
97
+ sanity_check: false
98
+ sanity_check_num_samples: 100000
99
+ wandb_token: null
100
+ wandb_id: "LilTii-v0.2"
101
+ wandb_project: "Polyglot"
102
+ wandb_desc: "Developing LLMs for low-resource languages"