levossadtchi committed
Commit 9847679 · verified · 1 Parent(s): 0a8c991

Add files using upload-large-folder tool

Files changed (50)
  1. configs/.ipynb_checkpoints/pretrain_5090_stage1-checkpoint.json +27 -0
  2. configs/.ipynb_checkpoints/pretrain_5090_stage2_anneal-checkpoint.json +27 -0
  3. configs/data_mix_10b.json +56 -0
  4. configs/model_70m.json +17 -0
  5. configs/pretrain_5090_stage1.json +27 -0
  6. configs/pretrain_5090_stage2_anneal.json +27 -0
  7. configs/pretrain_mps_dryrun.json +27 -0
  8. configs/sft_5090.json +26 -0
  9. configs/sft_data_smoltalk.json +12 -0
  10. data/.DS_Store +0 -0
  11. data/README.md +3 -0
  12. data/pretokenized/dataset_summary.json +198 -0
  13. data/pretokenized/logs/prepare_pretrain_data_20260313_091113.log +0 -0
  14. data/pretokenized/train/train_manifest.json +502 -0
  15. data/pretokenized/val/val_manifest.json +7 -0
  16. data/tokenizer/.DS_Store +0 -0
  17. data/tokenizer/logs/train_tokenizer_20260312_114030.log +11 -0
  18. data/tokenizer/tokenizer.json +0 -0
  19. data/tokenizer/tokenizer_meta.json +80 -0
  20. data/tokenizer/tokenizer_summary.json +80 -0
  21. outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140730.jsonl +2 -0
  22. outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140730.log +10 -0
  23. outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140907.jsonl +27 -0
  24. outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140907.log +14 -0
  25. outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_141224.jsonl +27 -0
  26. outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_141224.log +34 -0
  27. outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142331.jsonl +27 -0
  28. outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142331.log +34 -0
  29. outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142530.jsonl +13 -0
  30. outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142530.log +21 -0
  31. outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142559.jsonl +61 -0
  32. outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142559.log +69 -0
  33. outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_143014.jsonl +13 -0
  34. outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_143014.log +21 -0
  35. outputs/pretrain_mps_dryrun/run_config.json +46 -0
  36. outputs/pretrain_stage1/.ipynb_checkpoints/run_config-checkpoint.json +46 -0
  37. outputs/pretrain_stage1/logs/.ipynb_checkpoints/train_pretrain_20260313_152202-checkpoint.log +82 -0
  38. outputs/pretrain_stage1/logs/train_pretrain_20260313_152202.jsonl +0 -0
  39. outputs/pretrain_stage1/logs/train_pretrain_20260313_152202.log +0 -0
  40. outputs/pretrain_stage1/run_config.json +46 -0
  41. outputs/pretrain_stage2/run_config.json +46 -0
  42. scripts/.DS_Store +0 -0
  43. scripts/eval_perplexity.py +79 -0
  44. scripts/generate.py +81 -0
  45. scripts/prepare_pretrain_data.py +318 -0
  46. scripts/prepare_sft_data.py +221 -0
  47. scripts/train_pretrain.py +405 -0
  48. scripts/train_sft.py +394 -0
  49. scripts/train_tokenizer.py +149 -0
  50. src/.DS_Store +0 -0
configs/.ipynb_checkpoints/pretrain_5090_stage1-checkpoint.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "seed": 42,
+   "train_dir": "data/pretokenized/train",
+   "val_dir": "data/pretokenized/val",
+   "output_dir": "outputs/pretrain_stage1",
+   "checkpoint_dir": "checkpoints/pretrain_stage1",
+   "init_from": null,
+   "resume_from": null,
+   "seq_len": 2048,
+   "micro_batch_size": 8,
+   "grad_accum_steps": 32,
+   "max_steps": 20000,
+   "warmup_steps": 2000,
+   "learning_rate": 0.003,
+   "min_lr": 0.0003,
+   "weight_decay": 0.1,
+   "beta1": 0.9,
+   "beta2": 0.95,
+   "grad_clip": 1.0,
+   "precision": "bf16",
+   "num_workers": 0,
+   "log_interval": 10,
+   "eval_interval": 250,
+   "eval_batches": 50,
+   "save_interval": 100,
+   "compile_model": false
+ }
configs/.ipynb_checkpoints/pretrain_5090_stage2_anneal-checkpoint.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "seed": 42,
+   "train_dir": "data/pretokenized/train",
+   "val_dir": "data/pretokenized/val",
+   "output_dir": "outputs/pretrain_stage2",
+   "checkpoint_dir": "checkpoints/pretrain_stage2",
+   "init_from": "checkpoints/pretrain_stage1/last.pt",
+   "resume_from": null,
+   "seq_len": 8192,
+   "micro_batch_size": 2,
+   "grad_accum_steps": 16,
+   "max_steps": 1000,
+   "warmup_steps": 100,
+   "learning_rate": 0.001,
+   "min_lr": 0.0001,
+   "weight_decay": 0.1,
+   "beta1": 0.9,
+   "beta2": 0.95,
+   "grad_clip": 1.0,
+   "precision": "bf16",
+   "num_workers": 0,
+   "log_interval": 5,
+   "eval_interval": 100,
+   "eval_batches": 20,
+   "save_interval": 50,
+   "compile_model": false
+ }
configs/data_mix_10b.json ADDED
@@ -0,0 +1,56 @@
+ {
+   "tokenizer_sample_documents": 2000000,
+   "tokenizer_min_frequency": 2,
+   "tokenizer_special_tokens": [
+     "<pad>",
+     "<bos>",
+     "<eos>",
+     "<unk>"
+   ],
+   "train_tokens": 10000000000,
+   "val_tokens": 20000000,
+   "shard_size_tokens": 100000000,
+   "sources": [
+     {
+       "name": "fineweb_edu",
+       "path": "HuggingFaceFW/fineweb-edu",
+       "config_name": "sample-10BT",
+       "split": "train",
+       "text_field": "text",
+       "weight": 0.6,
+       "streaming": true,
+       "shuffle_buffer": 10000
+     },
+     {
+       "name": "cosmopedia_v2",
+       "path": "HuggingFaceTB/smollm-corpus",
+       "config_name": "cosmopedia-v2",
+       "split": "train",
+       "text_field": "text",
+       "weight": 0.2,
+       "streaming": true,
+       "shuffle_buffer": 10000
+     },
+     {
+       "name": "the_stack_python",
+       "path": "bigcode/the-stack-dedup",
+       "config_name": null,
+       "data_dir": "data/python",
+       "split": "train",
+       "text_field": "content",
+       "weight": 0.1,
+       "streaming": true,
+       "shuffle_buffer": 2000
+     },
+     {
+       "name": "finemath",
+       "path": "HuggingFaceTB/finemath",
+       "config_name": "finemath-4plus",
+       "split": "train",
+       "text_field": "text",
+       "weight": 0.1,
+       "streaming": true,
+       "shuffle_buffer": 5000
+     }
+   ]
+ }
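The `weight` fields above define the sampling mix (60% fineweb-edu, 20% cosmopedia-v2, 10% Python code, 10% math). The repo's own mixing lives in scripts/prepare_pretrain_data.py (not rendered in this commit), and the dataset summary below labels the strategy "global_interleaving_weighted_progress_balancing"; as a rough illustration only, a weighted interleave of the same streaming sources can be sketched with the `datasets` library:

# Rough illustration only: a weighted interleave of the four streaming sources
# from configs/data_mix_10b.json. The repo's actual script may balance
# progress per source rather than sampling purely by probability.
import json
from datasets import load_dataset, interleave_datasets

cfg = json.load(open("configs/data_mix_10b.json"))

streams = []
for src in cfg["sources"]:
    ds = load_dataset(
        src["path"],
        name=src.get("config_name"),
        data_dir=src.get("data_dir"),
        split=src["split"],
        streaming=src["streaming"],
    )
    streams.append(ds.shuffle(seed=42, buffer_size=src["shuffle_buffer"]))

mixed = interleave_datasets(
    streams,
    probabilities=[src["weight"] for src in cfg["sources"]],
    seed=42,
    stopping_strategy="all_exhausted",
)
for doc in mixed.take(3):  # peek at a few interleaved documents
    print(sorted(doc.keys()))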
configs/model_70m.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "vocab_size": 49152,
+   "max_seq_len": 8192,
+   "d_model": 384,
+   "n_layers": 32,
+   "n_heads": 6,
+   "ffn_hidden_dim": 1024,
+   "rope_theta": 10000.0,
+   "rms_norm_eps": 1e-05,
+   "initializer_range": 0.02,
+   "dropout": 0.0,
+   "tie_word_embeddings": true,
+   "bias": false,
+   "pad_token_id": 0,
+   "bos_token_id": 1,
+   "eos_token_id": 2
+ }
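For orientation: assuming a Llama-style decoder (RMSNorm, RoPE, SwiGLU MLP with gate/up/down projections, no biases, tied embeddings; the model code itself is not rendered in this commit), this config implies roughly 75.5M parameters, consistent with the `parameters=75571584` reported by the dry-run logs below.

# Back-of-envelope parameter count for configs/model_70m.json, under the
# Llama-style assumptions stated above. Treat as an estimate: the small gap
# to the logged 75,571,584 comes from details not visible in the config.
import json

cfg = json.load(open("configs/model_70m.json"))
d, L, f, V = cfg["d_model"], cfg["n_layers"], cfg["ffn_hidden_dim"], cfg["vocab_size"]

embed = V * d                    # input embedding, tied with the LM head
attn  = 4 * d * d                # q, k, v, o projections
mlp   = 3 * d * f                # gate, up, down (SwiGLU)
norms = 2 * d                    # two RMSNorms per block
total = embed + L * (attn + mlp + norms) + d   # + final norm

print(f"{total:,}")  # 75,522,432 -- dry-run logs report 75,571,584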
configs/pretrain_5090_stage1.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "seed": 42,
+   "train_dir": "data/pretokenized/train",
+   "val_dir": "data/pretokenized/val",
+   "output_dir": "outputs/pretrain_stage1",
+   "checkpoint_dir": "checkpoints/pretrain_stage1",
+   "init_from": null,
+   "resume_from": null,
+   "seq_len": 2048,
+   "micro_batch_size": 8,
+   "grad_accum_steps": 32,
+   "max_steps": 20000,
+   "warmup_steps": 2000,
+   "learning_rate": 0.003,
+   "min_lr": 0.0003,
+   "weight_decay": 0.1,
+   "beta1": 0.9,
+   "beta2": 0.95,
+   "grad_clip": 1.0,
+   "precision": "bf16",
+   "num_workers": 0,
+   "log_interval": 10,
+   "eval_interval": 250,
+   "eval_batches": 50,
+   "save_interval": 100,
+   "compile_model": false
+ }
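The schedule fields (`warmup_steps`, `learning_rate`, `min_lr`, `max_steps`) suggest linear warmup followed by cosine decay. The exact shape is not shown in this commit, but it can be inferred from the lr values in the dry-run JSONL logs below; the following sketch reproduces them:

import math

def lr_at(step, max_lr, min_lr, warmup, max_steps):
    """Linear warmup, then cosine decay to min_lr.

    The (step - warmup - 1) offset is inferred from the dry-run JSONL, not
    read from scripts/train_pretrain.py: with max_lr=1e-3, min_lr=1e-4,
    warmup=5, max_steps=20 it reproduces the logged lr at every step.
    """
    if step <= warmup:
        return max_lr * step / warmup
    t = (step - warmup - 1) / (max_steps - warmup)
    return min_lr + 0.5 * (max_lr - min_lr) * (1 + math.cos(math.pi * t))

print(lr_at(7, 1e-3, 1e-4, 5, 20))   # ~0.0009901664, matches step 7 in the log
print(lr_at(20, 1e-3, 1e-4, 5, 20))  # ~0.0001098336, matches step 20 in the log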
configs/pretrain_5090_stage2_anneal.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "seed": 42,
+   "train_dir": "data/pretokenized/train",
+   "val_dir": "data/pretokenized/val",
+   "output_dir": "outputs/pretrain_stage2",
+   "checkpoint_dir": "checkpoints/pretrain_stage2",
+   "init_from": "checkpoints/pretrain_stage1/last.pt",
+   "resume_from": null,
+   "seq_len": 8192,
+   "micro_batch_size": 2,
+   "grad_accum_steps": 16,
+   "max_steps": 1000,
+   "warmup_steps": 100,
+   "learning_rate": 0.001,
+   "min_lr": 0.0001,
+   "weight_decay": 0.1,
+   "beta1": 0.9,
+   "beta2": 0.95,
+   "grad_clip": 1.0,
+   "precision": "bf16",
+   "num_workers": 0,
+   "log_interval": 5,
+   "eval_interval": 100,
+   "eval_batches": 20,
+   "save_interval": 50,
+   "compile_model": false
+ }
configs/pretrain_mps_dryrun.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "seed": 42,
+   "train_dir": "data/pretokenized/train",
+   "val_dir": "data/pretokenized/val",
+   "output_dir": "outputs/pretrain_mps_dryrun",
+   "checkpoint_dir": "checkpoints/pretrain_mps_dryrun",
+   "init_from": null,
+   "resume_from": null,
+   "seq_len": 512,
+   "micro_batch_size": 1,
+   "grad_accum_steps": 4,
+   "max_steps": 500,
+   "warmup_steps": 50,
+   "learning_rate": 0.001,
+   "min_lr": 0.0001,
+   "weight_decay": 0.1,
+   "beta1": 0.9,
+   "beta2": 0.95,
+   "grad_clip": 1.0,
+   "precision": "fp32",
+   "num_workers": 0,
+   "log_interval": 1,
+   "eval_interval": 10,
+   "eval_batches": 2,
+   "save_interval": 10,
+   "compile_model": false
+ }
configs/sft_5090.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "seed": 42,
+   "dataset_path": "data/sft/processed",
+   "output_dir": "outputs/sft",
+   "checkpoint_dir": "checkpoints/sft",
+   "init_from": "checkpoints/pretrain_stage2/last.pt",
+   "resume_from": null,
+   "seq_len": 2048,
+   "micro_batch_size": 8,
+   "grad_accum_steps": 16,
+   "max_steps": 5000,
+   "warmup_steps": 200,
+   "learning_rate": 0.0005,
+   "min_lr": 5e-05,
+   "weight_decay": 0.01,
+   "beta1": 0.9,
+   "beta2": 0.95,
+   "grad_clip": 1.0,
+   "precision": "bf16",
+   "num_workers": 0,
+   "log_interval": 10,
+   "eval_interval": 100,
+   "eval_batches": 50,
+   "save_interval": 200,
+   "compile_model": false
+ }
configs/sft_data_smoltalk.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "path": "HuggingFaceTB/smoltalk",
+   "config_name": null,
+   "split": "train",
+   "revision": null,
+   "streaming": false,
+   "shuffle": true,
+   "format": "messages",
+   "messages_field": "messages",
+   "val_examples": 2000,
+   "max_train_examples": 200000
+ }
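`"format": "messages"` indicates chat-style records whose `messages` field is a list of `{role, content}` dicts. A minimal loading sketch under that assumption (the real consumer, scripts/prepare_sft_data.py, is not rendered in this commit; note that some Hub datasets insist on an explicit config name even when the config stores null):

# Sketch: load the SFT source described by configs/sft_data_smoltalk.json.
import json
from datasets import load_dataset

cfg = json.load(open("configs/sft_data_smoltalk.json"))
ds = load_dataset(
    cfg["path"],
    name=cfg["config_name"],
    split=cfg["split"],
    revision=cfg["revision"],
    streaming=cfg["streaming"],
)
if cfg["shuffle"]:
    ds = ds.shuffle(seed=42)
if cfg["max_train_examples"]:
    ds = ds.select(range(min(cfg["max_train_examples"], len(ds))))

for msg in ds[0][cfg["messages_field"]]:  # [{"role": ..., "content": ...}, ...]
    print(msg["role"], ":", msg["content"][:60])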
data/.DS_Store ADDED
Binary file (8.2 kB)
data/README.md ADDED
@@ -0,0 +1,3 @@
+ ---
+ license: mit
+ ---
data/pretokenized/dataset_summary.json ADDED
@@ -0,0 +1,198 @@
+ {
+   "tokenizer": {
+     "vocab_size": 49152,
+     "special_tokens": {
+       "pad_token": "<pad>",
+       "bos_token": "<bos>",
+       "eos_token": "<eos>",
+       "unk_token": "<unk>",
+       "pad_token_id": 0,
+       "bos_token_id": 1,
+       "eos_token_id": 2,
+       "unk_token_id": 3
+     },
+     "data_config": {
+       "sources": [
+         {
+           "name": "fineweb_edu",
+           "path": "HuggingFaceFW/fineweb-edu",
+           "split": "train",
+           "weight": 0.6,
+           "text_field": "text",
+           "config_name": "sample-10BT",
+           "data_dir": null,
+           "revision": null,
+           "streaming": true,
+           "shuffle_buffer": 10000,
+           "sample_documents": null
+         },
+         {
+           "name": "cosmopedia_v2",
+           "path": "HuggingFaceTB/smollm-corpus",
+           "split": "train",
+           "weight": 0.2,
+           "text_field": "text",
+           "config_name": "cosmopedia-v2",
+           "data_dir": null,
+           "revision": null,
+           "streaming": true,
+           "shuffle_buffer": 10000,
+           "sample_documents": null
+         },
+         {
+           "name": "the_stack_python",
+           "path": "bigcode/the-stack-dedup",
+           "split": "train",
+           "weight": 0.1,
+           "text_field": "content",
+           "config_name": null,
+           "data_dir": "data/python",
+           "revision": null,
+           "streaming": true,
+           "shuffle_buffer": 2000,
+           "sample_documents": null
+         },
+         {
+           "name": "finemath",
+           "path": "HuggingFaceTB/finemath",
+           "split": "train",
+           "weight": 0.1,
+           "text_field": "text",
+           "config_name": "finemath-4plus",
+           "data_dir": null,
+           "revision": null,
+           "streaming": true,
+           "shuffle_buffer": 5000,
+           "sample_documents": null
+         }
+       ],
+       "tokenizer_sample_documents": 2000000,
+       "tokenizer_min_frequency": 2,
+       "tokenizer_special_tokens": [
+         "<pad>",
+         "<bos>",
+         "<eos>",
+         "<unk>"
+       ],
+       "train_tokens": 10000000000,
+       "val_tokens": 20000000,
+       "shard_size_tokens": 100000000
+     }
+   },
+   "data_config": {
+     "sources": [
+       {
+         "name": "fineweb_edu",
+         "path": "HuggingFaceFW/fineweb-edu",
+         "split": "train",
+         "weight": 0.6,
+         "text_field": "text",
+         "config_name": "sample-10BT",
+         "data_dir": null,
+         "revision": null,
+         "streaming": true,
+         "shuffle_buffer": 10000,
+         "sample_documents": null
+       },
+       {
+         "name": "cosmopedia_v2",
+         "path": "HuggingFaceTB/smollm-corpus",
+         "split": "train",
+         "weight": 0.2,
+         "text_field": "text",
+         "config_name": "cosmopedia-v2",
+         "data_dir": null,
+         "revision": null,
+         "streaming": true,
+         "shuffle_buffer": 10000,
+         "sample_documents": null
+       },
+       {
+         "name": "the_stack_python",
+         "path": "bigcode/the-stack-dedup",
+         "split": "train",
+         "weight": 0.1,
+         "text_field": "content",
+         "config_name": null,
+         "data_dir": "data/python",
+         "revision": null,
+         "streaming": true,
+         "shuffle_buffer": 2000,
+         "sample_documents": null
+       },
+       {
+         "name": "finemath",
+         "path": "HuggingFaceTB/finemath",
+         "split": "train",
+         "weight": 0.1,
+         "text_field": "text",
+         "config_name": "finemath-4plus",
+         "data_dir": null,
+         "revision": null,
+         "streaming": true,
+         "shuffle_buffer": 5000,
+         "sample_documents": null
+       }
+     ],
+     "tokenizer_sample_documents": 2000000,
+     "tokenizer_min_frequency": 2,
+     "tokenizer_special_tokens": [
+       "<pad>",
+       "<bos>",
+       "<eos>",
+       "<unk>"
+     ],
+     "train_tokens": 10000000000,
+     "val_tokens": 20000000,
+     "shard_size_tokens": 100000000
+   },
+   "mixing_strategy": "global_interleaving_weighted_progress_balancing",
+   "train_target_tokens": 10000000000,
+   "val_target_tokens": 20000000,
+   "train_tokens_written": 10000000000,
+   "val_tokens_written": 20000000,
+   "train_shards": 100,
+   "val_shards": 1,
+   "sources": {
+     "fineweb_edu": {
+       "path": "HuggingFaceFW/fineweb-edu",
+       "data_dir": null,
+       "split": "train",
+       "train_target_tokens": 6000000000,
+       "val_target_tokens": 12000000,
+       "train_tokens_written": 6000000000,
+       "val_tokens_written": 12000000,
+       "documents_used": 5922817
+     },
+     "cosmopedia_v2": {
+       "path": "HuggingFaceTB/smollm-corpus",
+       "data_dir": null,
+       "split": "train",
+       "train_target_tokens": 2000000000,
+       "val_target_tokens": 4000000,
+       "train_tokens_written": 2000000000,
+       "val_tokens_written": 4000000,
+       "documents_used": 2792704
+     },
+     "the_stack_python": {
+       "path": "bigcode/the-stack-dedup",
+       "data_dir": "data/python",
+       "split": "train",
+       "train_target_tokens": 1000000000,
+       "val_target_tokens": 2000000,
+       "train_tokens_written": 1000000000,
+       "val_tokens_written": 2000000,
+       "documents_used": 684540
+     },
+     "finemath": {
+       "path": "HuggingFaceTB/finemath",
+       "data_dir": null,
+       "split": "train",
+       "train_target_tokens": 1000000000,
+       "val_target_tokens": 2000000,
+       "train_tokens_written": 1000000000,
+       "val_tokens_written": 2000000,
+       "documents_used": 692367
+     }
+   }
+ }
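The per-source targets are simply the mix weights applied to the global token budgets, and the shard counts follow from shard_size_tokens; a quick arithmetic check:

# Cross-check dataset_summary.json: per-source targets are weight * budget.
weights = {"fineweb_edu": 0.6, "cosmopedia_v2": 0.2,
           "the_stack_python": 0.1, "finemath": 0.1}
train_tokens, val_tokens, shard_size = 10_000_000_000, 20_000_000, 100_000_000

for name, w in weights.items():
    # round() avoids float artifacts like 0.6 * 1e10 -> 5999999999.999...
    print(name, round(w * train_tokens), round(w * val_tokens))
# fineweb_edu 6000000000 12000000, cosmopedia_v2 2000000000 4000000, ...
# all matching the *_target_tokens fields above

print(train_tokens // shard_size)  # 100 full train shards, as in the manifest;
# the 20M val tokens fit inside a single (partial) val shard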
data/pretokenized/logs/prepare_pretrain_data_20260313_091113.log ADDED
The diff for this file is too large to render.
data/pretokenized/train/train_manifest.json ADDED
@@ -0,0 +1,502 @@
+ [
+   {
+     "path": "train_00000.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00001.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00002.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00003.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00004.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00005.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00006.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00007.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00008.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00009.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00010.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00011.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00012.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00013.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00014.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00015.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00016.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00017.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00018.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00019.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00020.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00021.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00022.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00023.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00024.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00025.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00026.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00027.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00028.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00029.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00030.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00031.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00032.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00033.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00034.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00035.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00036.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00037.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00038.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00039.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00040.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00041.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00042.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00043.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00044.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00045.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00046.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00047.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00048.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00049.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00050.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00051.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00052.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00053.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00054.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00055.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00056.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00057.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00058.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00059.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00060.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00061.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00062.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00063.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00064.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00065.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00066.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00067.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00068.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00069.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00070.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00071.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00072.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00073.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00074.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00075.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00076.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00077.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00078.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00079.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00080.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00081.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00082.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00083.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00084.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00085.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00086.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00087.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00088.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00089.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00090.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00091.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00092.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00093.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00094.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00095.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00096.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00097.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00098.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   },
+   {
+     "path": "train_00099.bin",
+     "num_tokens": 100000000,
+     "dtype": "uint16"
+   }
+ ]
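Each entry describes a flat binary shard of uint16 token ids (the 49,152-token vocabulary fits in two bytes). Assuming the .bin files are raw uint16 arrays as the `dtype` field suggests, a shard can be sliced into next-token training pairs like this (scripts/train_pretrain.py, not rendered here, may batch differently):

# Sketch: read a pretokenized shard listed in train_manifest.json.
import json
import numpy as np

manifest = json.load(open("data/pretokenized/train/train_manifest.json"))
entry = manifest[0]  # {"path": "train_00000.bin", "num_tokens": 100000000, "dtype": "uint16"}

tokens = np.memmap(f"data/pretokenized/train/{entry['path']}",
                   dtype=entry["dtype"], mode="r", shape=(entry["num_tokens"],))

seq_len = 2048  # stage-1 seq_len from configs/pretrain_5090_stage1.json
# Next-token prediction pairs: y is x shifted by one position.
x = tokens[0 : seq_len].astype(np.int64)
y = tokens[1 : seq_len + 1].astype(np.int64)
print(x.shape, y.shape, x[:8])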
data/pretokenized/val/val_manifest.json ADDED
@@ -0,0 +1,7 @@
+ [
+   {
+     "path": "val_00000.bin",
+     "num_tokens": 20000000,
+     "dtype": "uint16"
+   }
+ ]
data/tokenizer/.DS_Store ADDED
Binary file (6.15 kB)
data/tokenizer/logs/train_tokenizer_20260312_114030.log ADDED
@@ -0,0 +1,11 @@
+ 2026-03-12 11:40:30,043 | INFO | Tokenizer training started
+ 2026-03-12 11:40:30,044 | INFO | Log file: data/tokenizer/logs/train_tokenizer_20260312_114030.log
+ 2026-03-12 11:40:30,044 | INFO | Arguments | data_config=configs/data_mix_10b.json output_dir=data/tokenizer vocab_size=49152 seed=42
+ 2026-03-12 11:40:30,044 | INFO | Tokenizer config | sample_documents=2,000,000 min_frequency=2 special_tokens=['<pad>', '<bos>', '<eos>', '<unk>'] num_sources=4
+ 2026-03-12 11:40:30,044 | INFO | Tokenizer source start | name=fineweb_edu path=HuggingFaceFW/fineweb-edu data_dir=None split=train text_field=text limit_docs=1,200,000 streaming=True
+ 2026-03-12 11:51:35,669 | INFO | Tokenizer source start | name=cosmopedia_v2 path=HuggingFaceTB/smollm-corpus data_dir=None split=train text_field=text limit_docs=400,000 streaming=True
+ 2026-03-12 11:55:58,013 | INFO | Tokenizer source start | name=the_stack_python path=bigcode/the-stack-dedup data_dir=data/python split=train text_field=content limit_docs=200,000 streaming=True
+ 2026-03-12 12:00:03,620 | INFO | Tokenizer source start | name=finemath path=HuggingFaceTB/finemath data_dir=None split=train text_field=text limit_docs=200,000 streaming=True
+ 2026-03-12 12:08:46,619 | INFO | Tokenizer saved | path=data/tokenizer/tokenizer.json
+ 2026-03-12 12:08:46,630 | INFO | Tokenizer summary | vocab_size=49152 pad_id=0 bos_id=1 eos_id=2 unk_id=3
+ 2026-03-12 12:08:46,630 | INFO | Tokenizer metadata saved | path=data/tokenizer/tokenizer_meta.json
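The log records a 49,152-token vocabulary with ids 0-3 reserved for the special tokens. Assuming the saved tokenizer.json is a Hugging Face `tokenizers` file (which a single-file save from scripts/train_tokenizer.py suggests), it can be loaded and the logged ids verified:

# Sketch: load the trained tokenizer and check the special-token ids from
# the log (pad=0, bos=1, eos=2, unk=3). Assumes the `tokenizers` format.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("data/tokenizer/tokenizer.json")
assert tok.get_vocab_size() == 49152
for i, name in enumerate(["<pad>", "<bos>", "<eos>", "<unk>"]):
    assert tok.token_to_id(name) == i

ids = tok.encode("def main():\n    pass").ids
print(ids[:10], tok.decode(ids))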
data/tokenizer/tokenizer.json ADDED
The diff for this file is too large to render.
data/tokenizer/tokenizer_meta.json ADDED
@@ -0,0 +1,80 @@
+ {
+   "vocab_size": 49152,
+   "special_tokens": {
+     "pad_token": "<pad>",
+     "bos_token": "<bos>",
+     "eos_token": "<eos>",
+     "unk_token": "<unk>",
+     "pad_token_id": 0,
+     "bos_token_id": 1,
+     "eos_token_id": 2,
+     "unk_token_id": 3
+   },
+   "data_config": {
+     "sources": [
+       {
+         "name": "fineweb_edu",
+         "path": "HuggingFaceFW/fineweb-edu",
+         "split": "train",
+         "weight": 0.6,
+         "text_field": "text",
+         "config_name": "sample-10BT",
+         "data_dir": null,
+         "revision": null,
+         "streaming": true,
+         "shuffle_buffer": 10000,
+         "sample_documents": null
+       },
+       {
+         "name": "cosmopedia_v2",
+         "path": "HuggingFaceTB/smollm-corpus",
+         "split": "train",
+         "weight": 0.2,
+         "text_field": "text",
+         "config_name": "cosmopedia-v2",
+         "data_dir": null,
+         "revision": null,
+         "streaming": true,
+         "shuffle_buffer": 10000,
+         "sample_documents": null
+       },
+       {
+         "name": "the_stack_python",
+         "path": "bigcode/the-stack-dedup",
+         "split": "train",
+         "weight": 0.1,
+         "text_field": "content",
+         "config_name": null,
+         "data_dir": "data/python",
+         "revision": null,
+         "streaming": true,
+         "shuffle_buffer": 2000,
+         "sample_documents": null
+       },
+       {
+         "name": "finemath",
+         "path": "HuggingFaceTB/finemath",
+         "split": "train",
+         "weight": 0.1,
+         "text_field": "text",
+         "config_name": "finemath-4plus",
+         "data_dir": null,
+         "revision": null,
+         "streaming": true,
+         "shuffle_buffer": 5000,
+         "sample_documents": null
+       }
+     ],
+     "tokenizer_sample_documents": 2000000,
+     "tokenizer_min_frequency": 2,
+     "tokenizer_special_tokens": [
+       "<pad>",
+       "<bos>",
+       "<eos>",
+       "<unk>"
+     ],
+     "train_tokens": 10000000000,
+     "val_tokens": 20000000,
+     "shard_size_tokens": 100000000
+   }
+ }
data/tokenizer/tokenizer_summary.json ADDED
@@ -0,0 +1,80 @@
+ {
+   "vocab_size": 49152,
+   "special_tokens": {
+     "pad_token": "<pad>",
+     "bos_token": "<bos>",
+     "eos_token": "<eos>",
+     "unk_token": "<unk>",
+     "pad_token_id": 0,
+     "bos_token_id": 1,
+     "eos_token_id": 2,
+     "unk_token_id": 3
+   },
+   "data_config": {
+     "sources": [
+       {
+         "name": "fineweb_edu",
+         "path": "HuggingFaceFW/fineweb-edu",
+         "split": "train",
+         "weight": 0.6,
+         "text_field": "text",
+         "config_name": "sample-10BT",
+         "data_dir": null,
+         "revision": null,
+         "streaming": true,
+         "shuffle_buffer": 10000,
+         "sample_documents": null
+       },
+       {
+         "name": "cosmopedia_v2",
+         "path": "HuggingFaceTB/smollm-corpus",
+         "split": "train",
+         "weight": 0.2,
+         "text_field": "text",
+         "config_name": "cosmopedia-v2",
+         "data_dir": null,
+         "revision": null,
+         "streaming": true,
+         "shuffle_buffer": 10000,
+         "sample_documents": null
+       },
+       {
+         "name": "the_stack_python",
+         "path": "bigcode/the-stack-dedup",
+         "split": "train",
+         "weight": 0.1,
+         "text_field": "content",
+         "config_name": null,
+         "data_dir": "data/python",
+         "revision": null,
+         "streaming": true,
+         "shuffle_buffer": 2000,
+         "sample_documents": null
+       },
+       {
+         "name": "finemath",
+         "path": "HuggingFaceTB/finemath",
+         "split": "train",
+         "weight": 0.1,
+         "text_field": "text",
+         "config_name": "finemath-4plus",
+         "data_dir": null,
+         "revision": null,
+         "streaming": true,
+         "shuffle_buffer": 5000,
+         "sample_documents": null
+       }
+     ],
+     "tokenizer_sample_documents": 2000000,
+     "tokenizer_min_frequency": 2,
+     "tokenizer_special_tokens": [
+       "<pad>",
+       "<bos>",
+       "<eos>",
+       "<unk>"
+     ],
+     "train_tokens": 10000000000,
+     "val_tokens": 20000000,
+     "shard_size_tokens": 100000000
+   }
+ }
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140730.jsonl ADDED
@@ -0,0 +1,2 @@
+ {"event": "run_started", "timestamp": "2026-03-13T14:07:30", "log_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140730.log", "metrics_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140730.jsonl", "model_config": {"vocab_size": 49152, "max_seq_len": 8192, "d_model": 384, "n_layers": 32, "n_heads": 6, "ffn_hidden_dim": 1024, "rope_theta": 10000.0, "rms_norm_eps": 1e-05, "initializer_range": 0.02, "dropout": 0.0, "tie_word_embeddings": true, "bias": false, "pad_token_id": 0, "bos_token_id": 1, "eos_token_id": 2}, "train_config": {"seed": 42, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val", "output_dir": "outputs/pretrain_mps_dryrun", "checkpoint_dir": "checkpoints/pretrain_mps_dryrun", "init_from": null, "resume_from": null, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "max_steps": 20, "warmup_steps": 5, "learning_rate": 0.001, "min_lr": 0.0001, "weight_decay": 0.1, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "precision": "fp32", "num_workers": 0, "log_interval": 1, "eval_interval": 10, "eval_batches": 2, "save_interval": 10, "compile_model": false}, "args": {"model_config": "configs/model_70m.json", "train_config": "configs/pretrain_mps_dryrun.json", "max_steps_override": null}}
+ {"event": "runtime_summary", "timestamp": "2026-03-13T14:07:34", "device": "mps", "precision": "fp32", "compile_model": false, "parameters": 75571584, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "tokens_per_step": 2048, "num_train_shards": 100, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val"}
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140730.log ADDED
@@ -0,0 +1,10 @@
+ 2026-03-13 14:07:30,831 | INFO | Pretraining started
+ 2026-03-13 14:07:30,832 | INFO | Log file: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140730.log
+ 2026-03-13 14:07:30,832 | INFO | Metrics JSONL: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140730.jsonl
+ 2026-03-13 14:07:30,832 | INFO | Arguments | model_config=configs/model_70m.json train_config=configs/pretrain_mps_dryrun.json max_steps_override=None
+ 2026-03-13 14:07:30,832 | INFO | Model config | {'vocab_size': 49152, 'max_seq_len': 8192, 'd_model': 384, 'n_layers': 32, 'n_heads': 6, 'ffn_hidden_dim': 1024, 'rope_theta': 10000.0, 'rms_norm_eps': 1e-05, 'initializer_range': 0.02, 'dropout': 0.0, 'tie_word_embeddings': True, 'bias': False, 'pad_token_id': 0, 'bos_token_id': 1, 'eos_token_id': 2}
+ 2026-03-13 14:07:30,832 | INFO | Train config | {'seed': 42, 'train_dir': 'data/pretokenized/train', 'val_dir': 'data/pretokenized/val', 'output_dir': 'outputs/pretrain_mps_dryrun', 'checkpoint_dir': 'checkpoints/pretrain_mps_dryrun', 'init_from': None, 'resume_from': None, 'seq_len': 512, 'micro_batch_size': 1, 'grad_accum_steps': 4, 'max_steps': 20, 'warmup_steps': 5, 'learning_rate': 0.001, 'min_lr': 0.0001, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'precision': 'fp32', 'num_workers': 0, 'log_interval': 1, 'eval_interval': 10, 'eval_batches': 2, 'save_interval': 10, 'compile_model': False}
+ 2026-03-13 14:07:34,596 | INFO | Device summary | device=mps precision=fp32 compile_model=False
+ 2026-03-13 14:07:34,597 | INFO | Model summary | parameters=75.57M
+ 2026-03-13 14:07:34,597 | INFO | Batch summary | seq_len=512 micro_batch_size=1 grad_accum_steps=4 tokens_per_step=2,048
+ 2026-03-13 14:07:34,597 | INFO | Dataset summary | train_dir=data/pretokenized/train val_dir=data/pretokenized/val num_train_shards=100
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140907.jsonl ADDED
@@ -0,0 +1,27 @@
+ {"event": "run_started", "timestamp": "2026-03-13T14:09:07", "log_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140907.log", "metrics_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140907.jsonl", "model_config": {"vocab_size": 49152, "max_seq_len": 8192, "d_model": 384, "n_layers": 32, "n_heads": 6, "ffn_hidden_dim": 1024, "rope_theta": 10000.0, "rms_norm_eps": 1e-05, "initializer_range": 0.02, "dropout": 0.0, "tie_word_embeddings": true, "bias": false, "pad_token_id": 0, "bos_token_id": 1, "eos_token_id": 2}, "train_config": {"seed": 42, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val", "output_dir": "outputs/pretrain_mps_dryrun", "checkpoint_dir": "checkpoints/pretrain_mps_dryrun", "init_from": null, "resume_from": null, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "max_steps": 20, "warmup_steps": 5, "learning_rate": 0.001, "min_lr": 0.0001, "weight_decay": 0.1, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "precision": "fp32", "num_workers": 0, "log_interval": 1, "eval_interval": 10, "eval_batches": 2, "save_interval": 10, "compile_model": false}, "args": {"model_config": "configs/model_70m.json", "train_config": "configs/pretrain_mps_dryrun.json", "max_steps_override": null}}
+ {"event": "runtime_summary", "timestamp": "2026-03-13T14:09:10", "device": "mps", "precision": "fp32", "compile_model": false, "parameters": 75571584, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "tokens_per_step": 2048, "num_train_shards": 100, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val"}
+ {"event": "train", "timestamp": "2026-03-13T14:09:12", "step": 1, "loss": 10.848917245864868, "lr": 0.0002, "tok_per_sec": 961.8014053409653, "grad_norm": 5.573695659637451, "tokens_seen": 2048, "elapsed_sec": 2.1293377080000937, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:09:13", "step": 2, "loss": 10.763139724731445, "lr": 0.0004, "tok_per_sec": 1605.8989070685525, "grad_norm": 8.322466850280762, "tokens_seen": 4096, "elapsed_sec": 1.275298208987806, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:09:14", "step": 3, "loss": 10.356749057769775, "lr": 0.0006000000000000001, "tok_per_sec": 2736.6722939747565, "grad_norm": 2.6283912658691406, "tokens_seen": 6144, "elapsed_sec": 0.7483541250112467, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:09:14", "step": 4, "loss": 10.376826286315918, "lr": 0.0008, "tok_per_sec": 2756.3866613090086, "grad_norm": 2.217130184173584, "tokens_seen": 8192, "elapsed_sec": 0.7430017089936882, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:09:15", "step": 5, "loss": 10.231549263000488, "lr": 0.001, "tok_per_sec": 2715.4393876891622, "grad_norm": 12.64534854888916, "tokens_seen": 10240, "elapsed_sec": 0.754205750010442, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:09:16", "step": 6, "loss": 9.938905477523804, "lr": 0.001, "tok_per_sec": 2725.141290121042, "grad_norm": 1.7282862663269043, "tokens_seen": 12288, "elapsed_sec": 0.7515206669922918, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:09:17", "step": 7, "loss": 9.66855764389038, "lr": 0.0009901664203302125, "tok_per_sec": 2738.185267024283, "grad_norm": 1.9499105215072632, "tokens_seen": 14336, "elapsed_sec": 0.7479406250058673, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:09:17", "step": 8, "loss": 9.335453271865845, "lr": 0.0009610954559391703, "tok_per_sec": 2751.8910854624123, "grad_norm": 1.7210659980773926, "tokens_seen": 16384, "elapsed_sec": 0.7442154999589548, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:09:18", "step": 9, "loss": 9.267512798309326, "lr": 0.0009140576474687263, "tok_per_sec": 2708.5891084687337, "grad_norm": 1.762829065322876, "tokens_seen": 18432, "elapsed_sec": 0.7561132080154493, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:09:19", "step": 10, "loss": 8.833673238754272, "lr": 0.0008511087728614862, "tok_per_sec": 2765.4777296002535, "grad_norm": 1.700391173362732, "tokens_seen": 20480, "elapsed_sec": 0.7405592090217397, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "eval", "timestamp": "2026-03-13T14:09:19", "step": 10, "val_loss": 9.096094608306885, "perplexity": 8920.386982370957, "eval_batches": 2}
+ {"event": "checkpoint", "timestamp": "2026-03-13T14:09:33", "step": 10, "step_checkpoint": "checkpoints/pretrain_mps_dryrun/step_0000010.pt", "last_checkpoint": "checkpoints/pretrain_mps_dryrun/last.pt", "tokens_seen": 20480}
+ {"event": "train", "timestamp": "2026-03-13T14:09:34", "step": 11, "loss": 8.795855522155762, "lr": 0.0007750000000000001, "tok_per_sec": 135.3521374189279, "grad_norm": 1.4899625778198242, "tokens_seen": 22528, "elapsed_sec": 15.130902540986426, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:09:35", "step": 12, "loss": 8.558577060699463, "lr": 0.0006890576474687264, "tok_per_sec": 2659.660815260197, "grad_norm": 1.5879555940628052, "tokens_seen": 24576, "elapsed_sec": 0.7700230000191368, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:09:36", "step": 13, "loss": 8.595118284225464, "lr": 0.0005970378084704442, "tok_per_sec": 2709.453151326185, "grad_norm": 1.3136154413223267, "tokens_seen": 26624, "elapsed_sec": 0.7558720840024762, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:09:36", "step": 14, "loss": 8.341074705123901, "lr": 0.000502962191529556, "tok_per_sec": 2569.4064364370934, "grad_norm": 1.2977045774459839, "tokens_seen": 28672, "elapsed_sec": 0.7970712499809451, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:09:37", "step": 15, "loss": 8.268006086349487, "lr": 0.0004109423525312737, "tok_per_sec": 2272.894269158833, "grad_norm": 1.197304368019104, "tokens_seen": 30720, "elapsed_sec": 0.9010537919821218, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:09:38", "step": 16, "loss": 8.30242395401001, "lr": 0.0003250000000000001, "tok_per_sec": 2636.6892030166064, "grad_norm": 1.1259286403656006, "tokens_seen": 32768, "elapsed_sec": 0.7767316669924185, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:09:39", "step": 17, "loss": 8.6144118309021, "lr": 0.00024889122713851394, "tok_per_sec": 2628.5409836443887, "grad_norm": 0.9170812368392944, "tokens_seen": 34816, "elapsed_sec": 0.7791394590167329, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:09:40", "step": 18, "loss": 8.439870119094849, "lr": 0.00018594235253127368, "tok_per_sec": 2699.6840414438493, "grad_norm": 1.6393400430679321, "tokens_seen": 36864, "elapsed_sec": 0.758607292023953, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:09:40", "step": 19, "loss": 7.915311574935913, "lr": 0.00013890454406082956, "tok_per_sec": 2709.5957985032933, "grad_norm": 1.111694097518921, "tokens_seen": 38912, "elapsed_sec": 0.755832290975377, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:09:41", "step": 20, "loss": 7.964773654937744, "lr": 0.00010983357966978745, "tok_per_sec": 2689.7526879403435, "grad_norm": 1.00663423538208, "tokens_seen": 40960, "elapsed_sec": 0.7614082919899374, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "eval", "timestamp": "2026-03-13T14:09:41", "step": 20, "val_loss": 8.758275032043457, "perplexity": 6363.125917448135, "eval_batches": 2}
+ {"event": "checkpoint", "timestamp": "2026-03-13T14:09:54", "step": 20, "step_checkpoint": "checkpoints/pretrain_mps_dryrun/step_0000020.pt", "last_checkpoint": "checkpoints/pretrain_mps_dryrun/last.pt", "tokens_seen": 40960}
+ {"event": "run_finished", "timestamp": "2026-03-13T14:09:54", "final_step": 20, "tokens_seen": 40960}
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140907.log ADDED
@@ -0,0 +1,14 @@
+ 2026-03-13 14:09:07,112 | INFO | Pretraining started
+ 2026-03-13 14:09:07,112 | INFO | Log file: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140907.log
+ 2026-03-13 14:09:07,112 | INFO | Metrics JSONL: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140907.jsonl
+ 2026-03-13 14:09:07,112 | INFO | Arguments | model_config=configs/model_70m.json train_config=configs/pretrain_mps_dryrun.json max_steps_override=None
+ 2026-03-13 14:09:07,112 | INFO | Model config | {'vocab_size': 49152, 'max_seq_len': 8192, 'd_model': 384, 'n_layers': 32, 'n_heads': 6, 'ffn_hidden_dim': 1024, 'rope_theta': 10000.0, 'rms_norm_eps': 1e-05, 'initializer_range': 0.02, 'dropout': 0.0, 'tie_word_embeddings': True, 'bias': False, 'pad_token_id': 0, 'bos_token_id': 1, 'eos_token_id': 2}
+ 2026-03-13 14:09:07,112 | INFO | Train config | {'seed': 42, 'train_dir': 'data/pretokenized/train', 'val_dir': 'data/pretokenized/val', 'output_dir': 'outputs/pretrain_mps_dryrun', 'checkpoint_dir': 'checkpoints/pretrain_mps_dryrun', 'init_from': None, 'resume_from': None, 'seq_len': 512, 'micro_batch_size': 1, 'grad_accum_steps': 4, 'max_steps': 20, 'warmup_steps': 5, 'learning_rate': 0.001, 'min_lr': 0.0001, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'precision': 'fp32', 'num_workers': 0, 'log_interval': 1, 'eval_interval': 10, 'eval_batches': 2, 'save_interval': 10, 'compile_model': False}
+ 2026-03-13 14:09:10,064 | INFO | Device summary | device=mps precision=fp32 compile_model=False
+ 2026-03-13 14:09:10,065 | INFO | Model summary | parameters=75.57M
+ 2026-03-13 14:09:10,065 | INFO | Batch summary | seq_len=512 micro_batch_size=1 grad_accum_steps=4 tokens_per_step=2,048
+ 2026-03-13 14:09:10,065 | INFO | Dataset summary | train_dir=data/pretokenized/train val_dir=data/pretokenized/val num_train_shards=100
+ 2026-03-13 14:09:19,703 | INFO | Eval step | step=10 val_loss=9.0961 perplexity=8920.39
+ 2026-03-13 14:09:33,612 | INFO | Checkpoint saved | step=10 step_checkpoint=checkpoints/pretrain_mps_dryrun/step_0000010.pt last_checkpoint=checkpoints/pretrain_mps_dryrun/last.pt
+ 2026-03-13 14:09:41,833 | INFO | Eval step | step=20 val_loss=8.7583 perplexity=6363.13
+ 2026-03-13 14:09:54,172 | INFO | Checkpoint saved | step=20 step_checkpoint=checkpoints/pretrain_mps_dryrun/step_0000020.pt last_checkpoint=checkpoints/pretrain_mps_dryrun/last.pt
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_141224.jsonl ADDED
@@ -0,0 +1,27 @@
+ {"event": "run_started", "timestamp": "2026-03-13T14:12:24", "log_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_141224.log", "metrics_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_141224.jsonl", "model_config": {"vocab_size": 49152, "max_seq_len": 8192, "d_model": 384, "n_layers": 32, "n_heads": 6, "ffn_hidden_dim": 1024, "rope_theta": 10000.0, "rms_norm_eps": 1e-05, "initializer_range": 0.02, "dropout": 0.0, "tie_word_embeddings": true, "bias": false, "pad_token_id": 0, "bos_token_id": 1, "eos_token_id": 2}, "train_config": {"seed": 42, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val", "output_dir": "outputs/pretrain_mps_dryrun", "checkpoint_dir": "checkpoints/pretrain_mps_dryrun", "init_from": null, "resume_from": null, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "max_steps": 20, "warmup_steps": 5, "learning_rate": 0.001, "min_lr": 0.0001, "weight_decay": 0.1, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "precision": "fp32", "num_workers": 0, "log_interval": 1, "eval_interval": 10, "eval_batches": 2, "save_interval": 10, "compile_model": false}, "args": {"model_config": "configs/model_70m.json", "train_config": "configs/pretrain_mps_dryrun.json", "max_steps_override": null}}
+ {"event": "runtime_summary", "timestamp": "2026-03-13T14:12:27", "device": "mps", "precision": "fp32", "compile_model": false, "parameters": 75571584, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "tokens_per_step": 2048, "num_train_shards": 100, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val"}
+ {"event": "train", "timestamp": "2026-03-13T14:12:28", "step": 1, "loss": 10.848917245864868, "lr": 0.0002, "tok_per_sec": 1572.3567196374945, "grad_norm": 5.573695659637451, "tokens_seen": 2048, "elapsed_sec": 1.3025034169550054, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:12:29", "step": 2, "loss": 10.763139724731445, "lr": 0.0004, "tok_per_sec": 1953.4684072997784, "grad_norm": 8.322466850280762, "tokens_seen": 4096, "elapsed_sec": 1.0483916670200415, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:12:30", "step": 3, "loss": 10.356749057769775, "lr": 0.0006000000000000001, "tok_per_sec": 2746.91589368826, "grad_norm": 2.6283912658691406, "tokens_seen": 6144, "elapsed_sec": 0.7455634170328267, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:12:31", "step": 4, "loss": 10.376826524734497, "lr": 0.0008, "tok_per_sec": 2765.8319145818245, "grad_norm": 2.217130184173584, "tokens_seen": 8192, "elapsed_sec": 0.7404643750051036, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:12:32", "step": 5, "loss": 10.231549263000488, "lr": 0.001, "tok_per_sec": 2769.4683023376706, "grad_norm": 12.645360946655273, "tokens_seen": 10240, "elapsed_sec": 0.7394921249942854, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:12:32", "step": 6, "loss": 9.938905715942383, "lr": 0.001, "tok_per_sec": 2767.895009558497, "grad_norm": 1.7282859086990356, "tokens_seen": 12288, "elapsed_sec": 0.7399124579969794, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:12:33", "step": 7, "loss": 9.66855764389038, "lr": 0.0009901664203302125, "tok_per_sec": 2751.478083715354, "grad_norm": 1.9499105215072632, "tokens_seen": 14336, "elapsed_sec": 0.7443272080272436, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:12:34", "step": 8, "loss": 9.335453271865845, "lr": 0.0009610954559391703, "tok_per_sec": 2761.168994935579, "grad_norm": 1.7210659980773926, "tokens_seen": 16384, "elapsed_sec": 0.7417148330132477, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:12:34", "step": 9, "loss": 9.267512798309326, "lr": 0.0009140576474687263, "tok_per_sec": 2775.2087673169717, "grad_norm": 1.762829065322876, "tokens_seen": 18432, "elapsed_sec": 0.737962500017602, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:12:35", "step": 10, "loss": 8.833673000335693, "lr": 0.0008511087728614862, "tok_per_sec": 2743.4623249730666, "grad_norm": 1.700391173362732, "tokens_seen": 20480, "elapsed_sec": 0.74650195898721, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "eval", "timestamp": "2026-03-13T14:12:35", "step": 10, "val_loss": 9.096094131469727, "perplexity": 8920.382728799992, "eval_batches": 2}
+ {"event": "checkpoint", "timestamp": "2026-03-13T14:12:53", "step": 10, "step_checkpoint": "checkpoints/pretrain_mps_dryrun/step_0000010.pt", "last_checkpoint": "checkpoints/pretrain_mps_dryrun/last.pt", "tokens_seen": 20480}
+ {"event": "train", "timestamp": "2026-03-13T14:12:55", "step": 11, "loss": 8.79585576057434, "lr": 0.0007750000000000001, "tok_per_sec": 106.21275007147676, "grad_norm": 1.4899623394012451, "tokens_seen": 22528, "elapsed_sec": 19.282054166018497, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:12:55", "step": 12, "loss": 8.558577299118042, "lr": 0.0006890576474687264, "tok_per_sec": 2617.334622430896, "grad_norm": 1.5879555940628052, "tokens_seen": 24576, "elapsed_sec": 0.7824754169560038, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:12:56", "step": 13, "loss": 8.595118045806885, "lr": 0.0005970378084704442, "tok_per_sec": 2577.6688468954194, "grad_norm": 1.3136155605316162, "tokens_seen": 26624, "elapsed_sec": 0.7945163330296054, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:12:57", "step": 14, "loss": 8.341074705123901, "lr": 0.000502962191529556, "tok_per_sec": 2652.6070721156225, "grad_norm": 1.2977045774459839, "tokens_seen": 28672, "elapsed_sec": 0.7720706249820068, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:12:58", "step": 15, "loss": 8.268006086349487, "lr": 0.0004109423525312737, "tok_per_sec": 2697.354489927494, "grad_norm": 1.1973044872283936, "tokens_seen": 30720, "elapsed_sec": 0.7592624579556286, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:12:58", "step": 16, "loss": 8.302424192428589, "lr": 0.0003250000000000001, "tok_per_sec": 2578.0945986776087, "grad_norm": 1.1259286403656006, "tokens_seen": 32768, "elapsed_sec": 0.794385124987457, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:12:59", "step": 17, "loss": 8.61441159248352, "lr": 0.00024889122713851394, "tok_per_sec": 2190.790230266318, "grad_norm": 0.9170812368392944, "tokens_seen": 34816, "elapsed_sec": 0.9348224999848753, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:13:00", "step": 18, "loss": 8.439870119094849, "lr": 0.00018594235253127368, "tok_per_sec": 2557.2677193427544, "grad_norm": 1.6393400430679321, "tokens_seen": 36864, "elapsed_sec": 0.8008547499775887, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:13:01", "step": 19, "loss": 7.915311813354492, "lr": 0.00013890454406082956, "tok_per_sec": 2489.488983600405, "grad_norm": 1.111694097518921, "tokens_seen": 38912, "elapsed_sec": 0.8226587920216843, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:13:02", "step": 20, "loss": 7.964773654937744, "lr": 0.00010983357966978745, "tok_per_sec": 2517.705047649911, "grad_norm": 1.00663423538208, "tokens_seen": 40960, "elapsed_sec": 0.8134392080246471, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "eval", "timestamp": "2026-03-13T14:13:02", "step": 20, "val_loss": 8.758275032043457, "perplexity": 6363.125917448135, "eval_batches": 2}
+ {"event": "checkpoint", "timestamp": "2026-03-13T14:13:17", "step": 20, "step_checkpoint": "checkpoints/pretrain_mps_dryrun/step_0000020.pt", "last_checkpoint": "checkpoints/pretrain_mps_dryrun/last.pt", "tokens_seen": 40960}
+ {"event": "run_finished", "timestamp": "2026-03-13T14:13:17", "final_step": 20, "tokens_seen": 40960}
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_141224.log ADDED
@@ -0,0 +1,34 @@
+ 2026-03-13 14:12:24,605 | INFO | Pretraining started
+ 2026-03-13 14:12:24,605 | INFO | Log file: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_141224.log
+ 2026-03-13 14:12:24,605 | INFO | Metrics JSONL: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_141224.jsonl
+ 2026-03-13 14:12:24,605 | INFO | Arguments | model_config=configs/model_70m.json train_config=configs/pretrain_mps_dryrun.json max_steps_override=None
+ 2026-03-13 14:12:24,605 | INFO | Model config | {'vocab_size': 49152, 'max_seq_len': 8192, 'd_model': 384, 'n_layers': 32, 'n_heads': 6, 'ffn_hidden_dim': 1024, 'rope_theta': 10000.0, 'rms_norm_eps': 1e-05, 'initializer_range': 0.02, 'dropout': 0.0, 'tie_word_embeddings': True, 'bias': False, 'pad_token_id': 0, 'bos_token_id': 1, 'eos_token_id': 2}
+ 2026-03-13 14:12:24,605 | INFO | Train config | {'seed': 42, 'train_dir': 'data/pretokenized/train', 'val_dir': 'data/pretokenized/val', 'output_dir': 'outputs/pretrain_mps_dryrun', 'checkpoint_dir': 'checkpoints/pretrain_mps_dryrun', 'init_from': None, 'resume_from': None, 'seq_len': 512, 'micro_batch_size': 1, 'grad_accum_steps': 4, 'max_steps': 20, 'warmup_steps': 5, 'learning_rate': 0.001, 'min_lr': 0.0001, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'precision': 'fp32', 'num_workers': 0, 'log_interval': 1, 'eval_interval': 10, 'eval_batches': 2, 'save_interval': 10, 'compile_model': False}
+ 2026-03-13 14:12:27,439 | INFO | Device summary | device=mps precision=fp32 compile_model=False
+ 2026-03-13 14:12:27,440 | INFO | Model summary | parameters=75.57M
+ 2026-03-13 14:12:27,440 | INFO | Batch summary | seq_len=512 micro_batch_size=1 grad_accum_steps=4 tokens_per_step=2,048
+ 2026-03-13 14:12:27,440 | INFO | Dataset summary | train_dir=data/pretokenized/train val_dir=data/pretokenized/val num_train_shards=100
+ 2026-03-13 14:12:28,743 | INFO | Train step | step=1 loss=10.8489 lr=0.000200 tok_per_sec=1,572 grad_norm=5.5737 tokens_seen=2.05K
+ 2026-03-13 14:12:29,792 | INFO | Train step | step=2 loss=10.7631 lr=0.000400 tok_per_sec=1,953 grad_norm=8.3225 tokens_seen=4.10K
+ 2026-03-13 14:12:30,538 | INFO | Train step | step=3 loss=10.3567 lr=0.000600 tok_per_sec=2,747 grad_norm=2.6284 tokens_seen=6.14K
+ 2026-03-13 14:12:31,280 | INFO | Train step | step=4 loss=10.3768 lr=0.000800 tok_per_sec=2,766 grad_norm=2.2171 tokens_seen=8.19K
+ 2026-03-13 14:12:32,020 | INFO | Train step | step=5 loss=10.2315 lr=0.001000 tok_per_sec=2,769 grad_norm=12.6454 tokens_seen=10.24K
+ 2026-03-13 14:12:32,760 | INFO | Train step | step=6 loss=9.9389 lr=0.001000 tok_per_sec=2,768 grad_norm=1.7283 tokens_seen=12.29K
+ 2026-03-13 14:12:33,505 | INFO | Train step | step=7 loss=9.6686 lr=0.000990 tok_per_sec=2,751 grad_norm=1.9499 tokens_seen=14.34K
+ 2026-03-13 14:12:34,247 | INFO | Train step | step=8 loss=9.3355 lr=0.000961 tok_per_sec=2,761 grad_norm=1.7211 tokens_seen=16.38K
+ 2026-03-13 14:12:34,986 | INFO | Train step | step=9 loss=9.2675 lr=0.000914 tok_per_sec=2,775 grad_norm=1.7628 tokens_seen=18.43K
+ 2026-03-13 14:12:35,733 | INFO | Train step | step=10 loss=8.8337 lr=0.000851 tok_per_sec=2,743 grad_norm=1.7004 tokens_seen=20.48K
+ 2026-03-13 14:12:35,903 | INFO | Eval step | step=10 val_loss=9.0961 perplexity=8920.38
+ 2026-03-13 14:12:53,990 | INFO | Checkpoint saved | step=10 step_checkpoint=checkpoints/pretrain_mps_dryrun/step_0000010.pt last_checkpoint=checkpoints/pretrain_mps_dryrun/last.pt
+ 2026-03-13 14:12:55,016 | INFO | Train step | step=11 loss=8.7959 lr=0.000775 tok_per_sec=106 grad_norm=1.4900 tokens_seen=22.53K
+ 2026-03-13 14:12:55,798 | INFO | Train step | step=12 loss=8.5586 lr=0.000689 tok_per_sec=2,617 grad_norm=1.5880 tokens_seen=24.58K
+ 2026-03-13 14:12:56,593 | INFO | Train step | step=13 loss=8.5951 lr=0.000597 tok_per_sec=2,578 grad_norm=1.3136 tokens_seen=26.62K
+ 2026-03-13 14:12:57,366 | INFO | Train step | step=14 loss=8.3411 lr=0.000503 tok_per_sec=2,653 grad_norm=1.2977 tokens_seen=28.67K
+ 2026-03-13 14:12:58,126 | INFO | Train step | step=15 loss=8.2680 lr=0.000411 tok_per_sec=2,697 grad_norm=1.1973 tokens_seen=30.72K
+ 2026-03-13 14:12:58,921 | INFO | Train step | step=16 loss=8.3024 lr=0.000325 tok_per_sec=2,578 grad_norm=1.1259 tokens_seen=32.77K
+ 2026-03-13 14:12:59,857 | INFO | Train step | step=17 loss=8.6144 lr=0.000249 tok_per_sec=2,191 grad_norm=0.9171 tokens_seen=34.82K
+ 2026-03-13 14:13:00,660 | INFO | Train step | step=18 loss=8.4399 lr=0.000186 tok_per_sec=2,557 grad_norm=1.6393 tokens_seen=36.86K
+ 2026-03-13 14:13:01,483 | INFO | Train step | step=19 loss=7.9153 lr=0.000139 tok_per_sec=2,489 grad_norm=1.1117 tokens_seen=38.91K
+ 2026-03-13 14:13:02,297 | INFO | Train step | step=20 loss=7.9648 lr=0.000110 tok_per_sec=2,518 grad_norm=1.0066 tokens_seen=40.96K
+ 2026-03-13 14:13:02,479 | INFO | Eval step | step=20 val_loss=8.7583 perplexity=6363.13
+ 2026-03-13 14:13:17,338 | INFO | Checkpoint saved | step=20 step_checkpoint=checkpoints/pretrain_mps_dryrun/step_0000020.pt last_checkpoint=checkpoints/pretrain_mps_dryrun/last.pt
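The lr column traces a linear warmup over warmup_steps followed by cosine decay to min_lr, and the logged values pin down the indexing: lr = learning_rate * step / warmup_steps during warmup, then decay progress = (step - 1 - warmup_steps) / (max_steps - warmup_steps). A reconstruction that matches the numbers above to full precision (the authoritative schedule lives in scripts/train_pretrain.py; this is a sketch inferred from the logs):

    import math

    def lr_at(step, max_lr=1e-3, min_lr=1e-4, warmup=5, max_steps=20):
        if step <= warmup:
            # Linear warmup: steps 1..warmup ramp up to max_lr.
            return max_lr * step / warmup
        # Cosine decay: max_lr at the first post-warmup step, ~min_lr at max_steps.
        progress = (step - 1 - warmup) / (max_steps - warmup)
        return min_lr + 0.5 * (max_lr - min_lr) * (1.0 + math.cos(math.pi * progress))

    assert math.isclose(lr_at(7), 0.0009901664203302125)   # step 7 above
    assert math.isclose(lr_at(15), 0.0004109423525312737)  # step 15 above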
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142331.jsonl ADDED
@@ -0,0 +1,27 @@
+ {"event": "run_started", "timestamp": "2026-03-13T14:23:31", "log_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142331.log", "metrics_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142331.jsonl", "model_config": {"vocab_size": 49152, "max_seq_len": 8192, "d_model": 384, "n_layers": 32, "n_heads": 6, "ffn_hidden_dim": 1024, "rope_theta": 10000.0, "rms_norm_eps": 1e-05, "initializer_range": 0.02, "dropout": 0.0, "tie_word_embeddings": true, "bias": false, "pad_token_id": 0, "bos_token_id": 1, "eos_token_id": 2}, "train_config": {"seed": 42, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val", "output_dir": "outputs/pretrain_mps_dryrun", "checkpoint_dir": "checkpoints/pretrain_mps_dryrun", "init_from": null, "resume_from": null, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "max_steps": 20, "warmup_steps": 5, "learning_rate": 0.001, "min_lr": 0.0001, "weight_decay": 0.1, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "precision": "fp32", "num_workers": 0, "log_interval": 1, "eval_interval": 10, "eval_batches": 2, "save_interval": 10, "compile_model": false}, "args": {"model_config": "configs/model_70m.json", "train_config": "configs/pretrain_mps_dryrun.json", "max_steps_override": null}}
+ {"event": "runtime_summary", "timestamp": "2026-03-13T14:23:34", "device": "mps", "precision": "fp32", "compile_model": false, "parameters": 75571584, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "tokens_per_step": 2048, "num_train_shards": 100, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val"}
+ {"event": "train", "timestamp": "2026-03-13T14:23:36", "step": 1, "loss": 10.848917245864868, "lr": 0.0002, "tok_per_sec": 1528.4547833159693, "grad_norm": 5.573695659637451, "tokens_seen": 2048, "elapsed_sec": 1.3399153330246918, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:23:37", "step": 2, "loss": 10.763139486312866, "lr": 0.0004, "tok_per_sec": 1823.3959146519999, "grad_norm": 8.322466850280762, "tokens_seen": 4096, "elapsed_sec": 1.1231789999874309, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:23:37", "step": 3, "loss": 10.356749296188354, "lr": 0.0006000000000000001, "tok_per_sec": 2760.206551850419, "grad_norm": 2.6283912658691406, "tokens_seen": 6144, "elapsed_sec": 0.7419734579743817, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:23:38", "step": 4, "loss": 10.376826524734497, "lr": 0.0008, "tok_per_sec": 2758.5098402671138, "grad_norm": 2.217130184173584, "tokens_seen": 8192, "elapsed_sec": 0.7424298329860903, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:23:39", "step": 5, "loss": 10.231549263000488, "lr": 0.001, "tok_per_sec": 2743.529400521807, "grad_norm": 12.645355224609375, "tokens_seen": 10240, "elapsed_sec": 0.7464837080333382, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:23:40", "step": 6, "loss": 9.938905715942383, "lr": 0.001, "tok_per_sec": 2787.348798075642, "grad_norm": 1.7282867431640625, "tokens_seen": 12288, "elapsed_sec": 0.7347483750199899, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:23:40", "step": 7, "loss": 9.66855764389038, "lr": 0.0009901664203302125, "tok_per_sec": 2768.0064610428144, "grad_norm": 1.9499105215072632, "tokens_seen": 14336, "elapsed_sec": 0.7398826660355553, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:23:41", "step": 8, "loss": 9.335453271865845, "lr": 0.0009610954559391703, "tok_per_sec": 2765.3453249466884, "grad_norm": 1.7210659980773926, "tokens_seen": 16384, "elapsed_sec": 0.7405946669750847, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:23:42", "step": 9, "loss": 9.267512559890747, "lr": 0.0009140576474687263, "tok_per_sec": 2776.3259680860633, "grad_norm": 1.762829065322876, "tokens_seen": 18432, "elapsed_sec": 0.7376655419939198, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:23:43", "step": 10, "loss": 8.833673000335693, "lr": 0.0008511087728614862, "tok_per_sec": 2775.777209987709, "grad_norm": 1.700391173362732, "tokens_seen": 20480, "elapsed_sec": 0.7378113750019111, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "eval", "timestamp": "2026-03-13T14:23:43", "step": 10, "val_loss": 9.096094131469727, "perplexity": 8920.382728799992, "eval_batches": 2}
+ {"event": "checkpoint", "timestamp": "2026-03-13T14:24:04", "step": 10, "step_checkpoint": "checkpoints/pretrain_mps_dryrun/step_0000010.pt", "last_checkpoint": "checkpoints/pretrain_mps_dryrun/last.pt", "tokens_seen": 20480}
+ {"event": "train", "timestamp": "2026-03-13T14:24:04", "step": 11, "loss": 8.795855522155762, "lr": 0.0007750000000000001, "tok_per_sec": 93.63913751097624, "grad_norm": 1.4899623394012451, "tokens_seen": 22528, "elapsed_sec": 21.871196749969386, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:24:05", "step": 12, "loss": 8.558577060699463, "lr": 0.0006890576474687264, "tok_per_sec": 2713.0807773384895, "grad_norm": 1.5879555940628052, "tokens_seen": 24576, "elapsed_sec": 0.7548614169936627, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:24:06", "step": 13, "loss": 8.595118045806885, "lr": 0.0005970378084704442, "tok_per_sec": 2623.969926754092, "grad_norm": 1.3136155605316162, "tokens_seen": 26624, "elapsed_sec": 0.7804967500269413, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:24:07", "step": 14, "loss": 8.341074705123901, "lr": 0.000502962191529556, "tok_per_sec": 2640.2210539544203, "grad_norm": 1.2977045774459839, "tokens_seen": 28672, "elapsed_sec": 0.7756926250294782, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:24:08", "step": 15, "loss": 8.268006086349487, "lr": 0.0004109423525312737, "tok_per_sec": 2716.846221941928, "grad_norm": 1.1973044872283936, "tokens_seen": 30720, "elapsed_sec": 0.7538152080378495, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:24:08", "step": 16, "loss": 8.302424192428589, "lr": 0.0003250000000000001, "tok_per_sec": 2732.9290763944346, "grad_norm": 1.1259286403656006, "tokens_seen": 32768, "elapsed_sec": 0.7493791250162758, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:24:09", "step": 17, "loss": 8.6144118309021, "lr": 0.00024889122713851394, "tok_per_sec": 2724.595059218572, "grad_norm": 0.9170812368392944, "tokens_seen": 34816, "elapsed_sec": 0.7516713329823688, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:24:10", "step": 18, "loss": 8.439870119094849, "lr": 0.00018594235253127368, "tok_per_sec": 2735.215026577713, "grad_norm": 1.6393400430679321, "tokens_seen": 36864, "elapsed_sec": 0.7487528329947963, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:24:11", "step": 19, "loss": 7.915311813354492, "lr": 0.00013890454406082956, "tok_per_sec": 2730.9775428733506, "grad_norm": 1.111694097518921, "tokens_seen": 38912, "elapsed_sec": 0.7499146250193007, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:24:11", "step": 20, "loss": 7.964773654937744, "lr": 0.00010983357966978745, "tok_per_sec": 2738.180386915235, "grad_norm": 1.00663423538208, "tokens_seen": 40960, "elapsed_sec": 0.7479419580195099, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "eval", "timestamp": "2026-03-13T14:24:11", "step": 20, "val_loss": 8.758275032043457, "perplexity": 6363.125917448135, "eval_batches": 2}
+ {"event": "checkpoint", "timestamp": "2026-03-13T14:24:24", "step": 20, "step_checkpoint": "checkpoints/pretrain_mps_dryrun/step_0000020.pt", "last_checkpoint": "checkpoints/pretrain_mps_dryrun/last.pt", "tokens_seen": 40960}
+ {"event": "run_finished", "timestamp": "2026-03-13T14:24:24", "final_step": 20, "tokens_seen": 40960}
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142331.log ADDED
@@ -0,0 +1,34 @@
+ 2026-03-13 14:23:31,892 | INFO | Pretraining started
+ 2026-03-13 14:23:31,892 | INFO | Log file: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142331.log
+ 2026-03-13 14:23:31,892 | INFO | Metrics JSONL: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142331.jsonl
+ 2026-03-13 14:23:31,892 | INFO | Arguments | model_config=configs/model_70m.json train_config=configs/pretrain_mps_dryrun.json max_steps_override=None
+ 2026-03-13 14:23:31,892 | INFO | Model config | {'vocab_size': 49152, 'max_seq_len': 8192, 'd_model': 384, 'n_layers': 32, 'n_heads': 6, 'ffn_hidden_dim': 1024, 'rope_theta': 10000.0, 'rms_norm_eps': 1e-05, 'initializer_range': 0.02, 'dropout': 0.0, 'tie_word_embeddings': True, 'bias': False, 'pad_token_id': 0, 'bos_token_id': 1, 'eos_token_id': 2}
+ 2026-03-13 14:23:31,892 | INFO | Train config | {'seed': 42, 'train_dir': 'data/pretokenized/train', 'val_dir': 'data/pretokenized/val', 'output_dir': 'outputs/pretrain_mps_dryrun', 'checkpoint_dir': 'checkpoints/pretrain_mps_dryrun', 'init_from': None, 'resume_from': None, 'seq_len': 512, 'micro_batch_size': 1, 'grad_accum_steps': 4, 'max_steps': 20, 'warmup_steps': 5, 'learning_rate': 0.001, 'min_lr': 0.0001, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'precision': 'fp32', 'num_workers': 0, 'log_interval': 1, 'eval_interval': 10, 'eval_batches': 2, 'save_interval': 10, 'compile_model': False}
+ 2026-03-13 14:23:34,726 | INFO | Device summary | device=mps precision=fp32 compile_model=False
+ 2026-03-13 14:23:34,727 | INFO | Model summary | parameters=75.57M
+ 2026-03-13 14:23:34,727 | INFO | Batch summary | seq_len=512 micro_batch_size=1 grad_accum_steps=4 tokens_per_step=2,048
+ 2026-03-13 14:23:34,727 | INFO | Dataset summary | train_dir=data/pretokenized/train val_dir=data/pretokenized/val num_train_shards=100
+ 2026-03-13 14:23:36,068 | INFO | Train step | step=1 loss=10.8489 lr=0.000200 tok_per_sec=1,528 grad_norm=5.5737 tokens_seen=2.05K
+ 2026-03-13 14:23:37,192 | INFO | Train step | step=2 loss=10.7631 lr=0.000400 tok_per_sec=1,823 grad_norm=8.3225 tokens_seen=4.10K
+ 2026-03-13 14:23:37,934 | INFO | Train step | step=3 loss=10.3567 lr=0.000600 tok_per_sec=2,760 grad_norm=2.6284 tokens_seen=6.14K
+ 2026-03-13 14:23:38,678 | INFO | Train step | step=4 loss=10.3768 lr=0.000800 tok_per_sec=2,759 grad_norm=2.2171 tokens_seen=8.19K
+ 2026-03-13 14:23:39,425 | INFO | Train step | step=5 loss=10.2315 lr=0.001000 tok_per_sec=2,744 grad_norm=12.6454 tokens_seen=10.24K
+ 2026-03-13 14:23:40,160 | INFO | Train step | step=6 loss=9.9389 lr=0.001000 tok_per_sec=2,787 grad_norm=1.7283 tokens_seen=12.29K
+ 2026-03-13 14:23:40,900 | INFO | Train step | step=7 loss=9.6686 lr=0.000990 tok_per_sec=2,768 grad_norm=1.9499 tokens_seen=14.34K
+ 2026-03-13 14:23:41,641 | INFO | Train step | step=8 loss=9.3355 lr=0.000961 tok_per_sec=2,765 grad_norm=1.7211 tokens_seen=16.38K
+ 2026-03-13 14:23:42,380 | INFO | Train step | step=9 loss=9.2675 lr=0.000914 tok_per_sec=2,776 grad_norm=1.7628 tokens_seen=18.43K
+ 2026-03-13 14:23:43,118 | INFO | Train step | step=10 loss=8.8337 lr=0.000851 tok_per_sec=2,776 grad_norm=1.7004 tokens_seen=20.48K
+ 2026-03-13 14:23:43,296 | INFO | Eval step | step=10 val_loss=9.0961 perplexity=8920.38
+ 2026-03-13 14:24:04,120 | INFO | Checkpoint saved | step=10 step_checkpoint=checkpoints/pretrain_mps_dryrun/step_0000010.pt last_checkpoint=checkpoints/pretrain_mps_dryrun/last.pt
+ 2026-03-13 14:24:04,990 | INFO | Train step | step=11 loss=8.7959 lr=0.000775 tok_per_sec=94 grad_norm=1.4900 tokens_seen=22.53K
+ 2026-03-13 14:24:05,745 | INFO | Train step | step=12 loss=8.5586 lr=0.000689 tok_per_sec=2,713 grad_norm=1.5880 tokens_seen=24.58K
+ 2026-03-13 14:24:06,526 | INFO | Train step | step=13 loss=8.5951 lr=0.000597 tok_per_sec=2,624 grad_norm=1.3136 tokens_seen=26.62K
+ 2026-03-13 14:24:07,302 | INFO | Train step | step=14 loss=8.3411 lr=0.000503 tok_per_sec=2,640 grad_norm=1.2977 tokens_seen=28.67K
+ 2026-03-13 14:24:08,057 | INFO | Train step | step=15 loss=8.2680 lr=0.000411 tok_per_sec=2,717 grad_norm=1.1973 tokens_seen=30.72K
+ 2026-03-13 14:24:08,806 | INFO | Train step | step=16 loss=8.3024 lr=0.000325 tok_per_sec=2,733 grad_norm=1.1259 tokens_seen=32.77K
+ 2026-03-13 14:24:09,559 | INFO | Train step | step=17 loss=8.6144 lr=0.000249 tok_per_sec=2,725 grad_norm=0.9171 tokens_seen=34.82K
+ 2026-03-13 14:24:10,308 | INFO | Train step | step=18 loss=8.4399 lr=0.000186 tok_per_sec=2,735 grad_norm=1.6393 tokens_seen=36.86K
+ 2026-03-13 14:24:11,058 | INFO | Train step | step=19 loss=7.9153 lr=0.000139 tok_per_sec=2,731 grad_norm=1.1117 tokens_seen=38.91K
+ 2026-03-13 14:24:11,807 | INFO | Train step | step=20 loss=7.9648 lr=0.000110 tok_per_sec=2,738 grad_norm=1.0066 tokens_seen=40.96K
+ 2026-03-13 14:24:11,966 | INFO | Eval step | step=20 val_loss=8.7583 perplexity=6363.13
+ 2026-03-13 14:24:24,399 | INFO | Checkpoint saved | step=20 step_checkpoint=checkpoints/pretrain_mps_dryrun/step_0000020.pt last_checkpoint=checkpoints/pretrain_mps_dryrun/last.pt
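The perplexity reported at each eval is exp(val_loss), e.g. exp(9.0961) ≈ 8920.38 and exp(8.7583) ≈ 6363.13, so the two dry runs land on identical validation numbers, as expected for a repeated seed. Checked against the full-precision JSONL values:

    import math

    assert math.isclose(math.exp(9.096094131469727), 8920.382728799992)
    assert math.isclose(math.exp(8.758275032043457), 6363.125917448135)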
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142530.jsonl ADDED
@@ -0,0 +1,13 @@
+ {"event": "run_started", "timestamp": "2026-03-13T14:25:30", "log_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142530.log", "metrics_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142530.jsonl", "model_config": {"vocab_size": 49152, "max_seq_len": 8192, "d_model": 384, "n_layers": 32, "n_heads": 6, "ffn_hidden_dim": 1024, "rope_theta": 10000.0, "rms_norm_eps": 1e-05, "initializer_range": 0.02, "dropout": 0.0, "tie_word_embeddings": true, "bias": false, "pad_token_id": 0, "bos_token_id": 1, "eos_token_id": 2}, "train_config": {"seed": 42, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val", "output_dir": "outputs/pretrain_mps_dryrun", "checkpoint_dir": "checkpoints/pretrain_mps_dryrun", "init_from": null, "resume_from": null, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "max_steps": 20, "warmup_steps": 5, "learning_rate": 0.001, "min_lr": 0.0001, "weight_decay": 0.1, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "precision": "fp32", "num_workers": 0, "log_interval": 1, "eval_interval": 10, "eval_batches": 2, "save_interval": 10, "compile_model": false}, "args": {"model_config": "configs/model_70m.json", "train_config": "configs/pretrain_mps_dryrun.json", "max_steps_override": null}}
+ {"event": "runtime_summary", "timestamp": "2026-03-13T14:25:33", "device": "mps", "precision": "fp32", "compile_model": false, "parameters": 75571584, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "tokens_per_step": 2048, "num_train_shards": 100, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val"}
+ {"event": "train", "timestamp": "2026-03-13T14:25:35", "step": 1, "loss": 10.848917245864868, "lr": 0.0002, "tok_per_sec": 1738.338776128348, "grad_norm": 5.573695659637451, "tokens_seen": 2048, "elapsed_sec": 1.1781362920301035, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:25:35", "step": 2, "loss": 10.763139486312866, "lr": 0.0004, "tok_per_sec": 2202.432846935467, "grad_norm": 8.322466850280762, "tokens_seen": 4096, "elapsed_sec": 0.9298807919840328, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:25:36", "step": 3, "loss": 10.356749296188354, "lr": 0.0006000000000000001, "tok_per_sec": 2756.6912099758943, "grad_norm": 2.6283912658691406, "tokens_seen": 6144, "elapsed_sec": 0.7429196250159293, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:25:37", "step": 4, "loss": 10.376826524734497, "lr": 0.0008, "tok_per_sec": 2738.8579811402797, "grad_norm": 2.217130184173584, "tokens_seen": 8192, "elapsed_sec": 0.7477569169714116, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:25:38", "step": 5, "loss": 10.231549263000488, "lr": 0.001, "tok_per_sec": 2710.2101486388783, "grad_norm": 12.645347595214844, "tokens_seen": 10240, "elapsed_sec": 0.755660958995577, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:25:39", "step": 6, "loss": 9.938905477523804, "lr": 0.001, "tok_per_sec": 2643.8368942648644, "grad_norm": 1.7282862663269043, "tokens_seen": 12288, "elapsed_sec": 0.7746317499550059, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:25:39", "step": 7, "loss": 9.66855764389038, "lr": 0.0009901664203302125, "tok_per_sec": 2696.598737188916, "grad_norm": 1.9499105215072632, "tokens_seen": 14336, "elapsed_sec": 0.7594752499717288, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:25:40", "step": 8, "loss": 9.335453271865845, "lr": 0.0009610954559391703, "tok_per_sec": 2697.2926156977223, "grad_norm": 1.7210659980773926, "tokens_seen": 16384, "elapsed_sec": 0.7592798749683425, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:25:41", "step": 9, "loss": 9.267513036727905, "lr": 0.0009140576474687263, "tok_per_sec": 2700.6039931851096, "grad_norm": 1.762829065322876, "tokens_seen": 18432, "elapsed_sec": 0.7583488749805838, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:25:42", "step": 10, "loss": 8.833672761917114, "lr": 0.0008511087728614862, "tok_per_sec": 2564.5051143205906, "grad_norm": 1.700391173362732, "tokens_seen": 20480, "elapsed_sec": 0.7985946249682456, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "eval", "timestamp": "2026-03-13T14:25:42", "step": 10, "val_loss": 9.096094131469727, "perplexity": 8920.382728799992, "eval_batches": 2}
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142530.log ADDED
@@ -0,0 +1,21 @@
+ 2026-03-13 14:25:30,936 | INFO | Pretraining started
+ 2026-03-13 14:25:30,936 | INFO | Log file: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142530.log
+ 2026-03-13 14:25:30,936 | INFO | Metrics JSONL: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142530.jsonl
+ 2026-03-13 14:25:30,936 | INFO | Arguments | model_config=configs/model_70m.json train_config=configs/pretrain_mps_dryrun.json max_steps_override=None
+ 2026-03-13 14:25:30,936 | INFO | Model config | {'vocab_size': 49152, 'max_seq_len': 8192, 'd_model': 384, 'n_layers': 32, 'n_heads': 6, 'ffn_hidden_dim': 1024, 'rope_theta': 10000.0, 'rms_norm_eps': 1e-05, 'initializer_range': 0.02, 'dropout': 0.0, 'tie_word_embeddings': True, 'bias': False, 'pad_token_id': 0, 'bos_token_id': 1, 'eos_token_id': 2}
+ 2026-03-13 14:25:30,937 | INFO | Train config | {'seed': 42, 'train_dir': 'data/pretokenized/train', 'val_dir': 'data/pretokenized/val', 'output_dir': 'outputs/pretrain_mps_dryrun', 'checkpoint_dir': 'checkpoints/pretrain_mps_dryrun', 'init_from': None, 'resume_from': None, 'seq_len': 512, 'micro_batch_size': 1, 'grad_accum_steps': 4, 'max_steps': 20, 'warmup_steps': 5, 'learning_rate': 0.001, 'min_lr': 0.0001, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'precision': 'fp32', 'num_workers': 0, 'log_interval': 1, 'eval_interval': 10, 'eval_batches': 2, 'save_interval': 10, 'compile_model': False}
+ 2026-03-13 14:25:33,870 | INFO | Device summary | device=mps precision=fp32 compile_model=False
+ 2026-03-13 14:25:33,870 | INFO | Model summary | parameters=75.57M
+ 2026-03-13 14:25:33,871 | INFO | Batch summary | seq_len=512 micro_batch_size=1 grad_accum_steps=4 tokens_per_step=2,048
+ 2026-03-13 14:25:33,871 | INFO | Dataset summary | train_dir=data/pretokenized/train val_dir=data/pretokenized/val num_train_shards=100
+ 2026-03-13 14:25:35,050 | INFO | Train step | step=1 loss=10.8489 lr=0.000200 tok_per_sec=1,738 grad_norm=5.5737 tokens_seen=2.05K
+ 2026-03-13 14:25:35,980 | INFO | Train step | step=2 loss=10.7631 lr=0.000400 tok_per_sec=2,202 grad_norm=8.3225 tokens_seen=4.10K
+ 2026-03-13 14:25:36,724 | INFO | Train step | step=3 loss=10.3567 lr=0.000600 tok_per_sec=2,757 grad_norm=2.6284 tokens_seen=6.14K
+ 2026-03-13 14:25:37,472 | INFO | Train step | step=4 loss=10.3768 lr=0.000800 tok_per_sec=2,739 grad_norm=2.2171 tokens_seen=8.19K
+ 2026-03-13 14:25:38,228 | INFO | Train step | step=5 loss=10.2315 lr=0.001000 tok_per_sec=2,710 grad_norm=12.6453 tokens_seen=10.24K
+ 2026-03-13 14:25:39,004 | INFO | Train step | step=6 loss=9.9389 lr=0.001000 tok_per_sec=2,644 grad_norm=1.7283 tokens_seen=12.29K
+ 2026-03-13 14:25:39,764 | INFO | Train step | step=7 loss=9.6686 lr=0.000990 tok_per_sec=2,697 grad_norm=1.9499 tokens_seen=14.34K
+ 2026-03-13 14:25:40,524 | INFO | Train step | step=8 loss=9.3355 lr=0.000961 tok_per_sec=2,697 grad_norm=1.7211 tokens_seen=16.38K
+ 2026-03-13 14:25:41,283 | INFO | Train step | step=9 loss=9.2675 lr=0.000914 tok_per_sec=2,701 grad_norm=1.7628 tokens_seen=18.43K
+ 2026-03-13 14:25:42,082 | INFO | Train step | step=10 loss=8.8337 lr=0.000851 tok_per_sec=2,565 grad_norm=1.7004 tokens_seen=20.48K
+ 2026-03-13 14:25:42,254 | INFO | Eval step | step=10 val_loss=9.0961 perplexity=8920.38
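This run stops immediately after the step-10 eval: no checkpoint or run_finished event follows, so the process was interrupted before the step-10 save completed. The JSONL format makes truncated runs easy to flag mechanically, e.g. with a check like this (load_events as above):

    def run_completed(events):
        # A clean run ends with a run_finished event; anything else was cut short.
        return bool(events) and events[-1]["event"] == "run_finished"

Of the metrics files here, the 14:12 and 14:23 runs pass this check; this run and the 14:25:59 run below do not.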
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142559.jsonl ADDED
@@ -0,0 +1,61 @@
+ {"event": "run_started", "timestamp": "2026-03-13T14:25:59", "log_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142559.log", "metrics_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142559.jsonl", "model_config": {"vocab_size": 49152, "max_seq_len": 8192, "d_model": 384, "n_layers": 32, "n_heads": 6, "ffn_hidden_dim": 1024, "rope_theta": 10000.0, "rms_norm_eps": 1e-05, "initializer_range": 0.02, "dropout": 0.0, "tie_word_embeddings": true, "bias": false, "pad_token_id": 0, "bos_token_id": 1, "eos_token_id": 2}, "train_config": {"seed": 42, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val", "output_dir": "outputs/pretrain_mps_dryrun", "checkpoint_dir": "checkpoints/pretrain_mps_dryrun", "init_from": null, "resume_from": null, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "max_steps": 500, "warmup_steps": 50, "learning_rate": 0.001, "min_lr": 0.0001, "weight_decay": 0.1, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "precision": "fp32", "num_workers": 0, "log_interval": 1, "eval_interval": 10, "eval_batches": 2, "save_interval": 10, "compile_model": false}, "args": {"model_config": "configs/model_70m.json", "train_config": "configs/pretrain_mps_dryrun.json", "max_steps_override": null}}
+ {"event": "runtime_summary", "timestamp": "2026-03-13T14:26:03", "device": "mps", "precision": "fp32", "compile_model": false, "parameters": 75571584, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "tokens_per_step": 2048, "num_train_shards": 100, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val"}
+ {"event": "train", "timestamp": "2026-03-13T14:26:04", "step": 1, "loss": 10.848917245864868, "lr": 2e-05, "tok_per_sec": 1704.8695679494026, "grad_norm": 5.573695659637451, "tokens_seen": 2048, "elapsed_sec": 1.2012649169773795, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:05", "step": 2, "loss": 10.897652626037598, "lr": 4e-05, "tok_per_sec": 2132.5760089557198, "grad_norm": 5.0279011726379395, "tokens_seen": 4096, "elapsed_sec": 0.9603409169940278, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:05", "step": 3, "loss": 10.785077571868896, "lr": 6e-05, "tok_per_sec": 2721.481065658625, "grad_norm": 5.114167213439941, "tokens_seen": 6144, "elapsed_sec": 0.7525314160156995, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:06", "step": 4, "loss": 10.634832620620728, "lr": 8e-05, "tok_per_sec": 2755.0287727887576, "grad_norm": 6.422860622406006, "tokens_seen": 8192, "elapsed_sec": 0.7433679169625975, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:07", "step": 5, "loss": 10.747493743896484, "lr": 0.0001, "tok_per_sec": 2752.2115918295694, "grad_norm": 6.580272197723389, "tokens_seen": 10240, "elapsed_sec": 0.7441288330010138, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:08", "step": 6, "loss": 10.633646488189697, "lr": 0.00012, "tok_per_sec": 2738.0719353969716, "grad_norm": 6.525123119354248, "tokens_seen": 12288, "elapsed_sec": 0.7479715830413625, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:08", "step": 7, "loss": 10.477944374084473, "lr": 0.00014000000000000001, "tok_per_sec": 2749.743315053932, "grad_norm": 5.189582824707031, "tokens_seen": 14336, "elapsed_sec": 0.7447967920452356, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:09", "step": 8, "loss": 10.35365605354309, "lr": 0.00016, "tok_per_sec": 2753.8837425710026, "grad_norm": 2.357203960418701, "tokens_seen": 16384, "elapsed_sec": 0.743676999991294, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:10", "step": 9, "loss": 10.339627742767334, "lr": 0.00018, "tok_per_sec": 2752.9484182072647, "grad_norm": 4.98753547668457, "tokens_seen": 18432, "elapsed_sec": 0.7439296669908799, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:11", "step": 10, "loss": 10.207262754440308, "lr": 0.0002, "tok_per_sec": 2755.2380338330304, "grad_norm": 5.554019927978516, "tokens_seen": 20480, "elapsed_sec": 0.7433114579762332, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "eval", "timestamp": "2026-03-13T14:26:11", "step": 10, "val_loss": 10.418237686157227, "perplexity": 33464.40736092908, "eval_batches": 2}
+ {"event": "checkpoint", "timestamp": "2026-03-13T14:26:27", "step": 10, "step_checkpoint": "checkpoints/pretrain_mps_dryrun/step_0000010.pt", "last_checkpoint": "checkpoints/pretrain_mps_dryrun/last.pt", "tokens_seen": 20480}
+ {"event": "train", "timestamp": "2026-03-13T14:26:28", "step": 11, "loss": 10.161789417266846, "lr": 0.00021999999999999998, "tok_per_sec": 119.14029052794488, "grad_norm": 2.177887201309204, "tokens_seen": 22528, "elapsed_sec": 17.189818750019185, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:29", "step": 12, "loss": 10.047882318496704, "lr": 0.00024, "tok_per_sec": 2680.171783579867, "grad_norm": 1.9737660884857178, "tokens_seen": 24576, "elapsed_sec": 0.7641301249968819, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:29", "step": 13, "loss": 10.105542421340942, "lr": 0.00026000000000000003, "tok_per_sec": 2702.3713362104313, "grad_norm": 4.096495151519775, "tokens_seen": 26624, "elapsed_sec": 0.7578529170132242, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:30", "step": 14, "loss": 9.940982818603516, "lr": 0.00028000000000000003, "tok_per_sec": 2722.8033630518785, "grad_norm": 1.8798285722732544, "tokens_seen": 28672, "elapsed_sec": 0.7521659579942934, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:31", "step": 15, "loss": 9.852107524871826, "lr": 0.0003, "tok_per_sec": 2738.483668417333, "grad_norm": 3.7492053508758545, "tokens_seen": 30720, "elapsed_sec": 0.7478591249673627, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:32", "step": 16, "loss": 9.768622398376465, "lr": 0.00032, "tok_per_sec": 2756.126999840051, "grad_norm": 1.8649290800094604, "tokens_seen": 32768, "elapsed_sec": 0.7430717090028338, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:32", "step": 17, "loss": 9.880046606063843, "lr": 0.00034, "tok_per_sec": 2762.6129070979177, "grad_norm": 1.8526010513305664, "tokens_seen": 34816, "elapsed_sec": 0.7413271670229733, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:33", "step": 18, "loss": 9.652766704559326, "lr": 0.00036, "tok_per_sec": 2762.0740561225734, "grad_norm": 2.2104318141937256, "tokens_seen": 36864, "elapsed_sec": 0.74147179198917, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:34", "step": 19, "loss": 9.371065139770508, "lr": 0.00037999999999999997, "tok_per_sec": 2746.3599790187623, "grad_norm": 2.0031697750091553, "tokens_seen": 38912, "elapsed_sec": 0.7457143330248073, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:35", "step": 20, "loss": 9.290096044540405, "lr": 0.0004, "tok_per_sec": 2753.183730427524, "grad_norm": 1.9113200902938843, "tokens_seen": 40960, "elapsed_sec": 0.7438660839688964, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "eval", "timestamp": "2026-03-13T14:26:35", "step": 20, "val_loss": 9.618017196655273, "perplexity": 15033.212463302863, "eval_batches": 2}
+ {"event": "checkpoint", "timestamp": "2026-03-13T14:26:47", "step": 20, "step_checkpoint": "checkpoints/pretrain_mps_dryrun/step_0000020.pt", "last_checkpoint": "checkpoints/pretrain_mps_dryrun/last.pt", "tokens_seen": 40960}
+ {"event": "train", "timestamp": "2026-03-13T14:26:48", "step": 21, "loss": 9.240976810455322, "lr": 0.00042, "tok_per_sec": 152.54166876251324, "grad_norm": 1.8867971897125244, "tokens_seen": 43008, "elapsed_sec": 13.425839749979787, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:49", "step": 22, "loss": 9.12671947479248, "lr": 0.00043999999999999996, "tok_per_sec": 2712.778906376721, "grad_norm": 1.8414427042007446, "tokens_seen": 45056, "elapsed_sec": 0.7549454160034657, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:50", "step": 23, "loss": 9.102352619171143, "lr": 0.00046, "tok_per_sec": 2708.941555033957, "grad_norm": 1.6314030885696411, "tokens_seen": 47104, "elapsed_sec": 0.7560148339834996, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:50", "step": 24, "loss": 8.811159133911133, "lr": 0.00048, "tok_per_sec": 2721.638538039314, "grad_norm": 1.8162541389465332, "tokens_seen": 49152, "elapsed_sec": 0.7524878749973141, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:51", "step": 25, "loss": 8.63177752494812, "lr": 0.0005, "tok_per_sec": 2726.8041484896708, "grad_norm": 1.7629377841949463, "tokens_seen": 51200, "elapsed_sec": 0.7510623750276864, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:52", "step": 26, "loss": 8.722702026367188, "lr": 0.0005200000000000001, "tok_per_sec": 2722.9481650192897, "grad_norm": 1.66167414188385, "tokens_seen": 53248, "elapsed_sec": 0.7521259590284899, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:53", "step": 27, "loss": 8.609044075012207, "lr": 0.00054, "tok_per_sec": 2753.0232007676027, "grad_norm": 1.4688063859939575, "tokens_seen": 55296, "elapsed_sec": 0.7439094590372406, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:53", "step": 28, "loss": 8.890318632125854, "lr": 0.0005600000000000001, "tok_per_sec": 2748.212475197479, "grad_norm": 1.3299572467803955, "tokens_seen": 57344, "elapsed_sec": 0.745211667031981, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:54", "step": 29, "loss": 8.266993045806885, "lr": 0.00058, "tok_per_sec": 2729.828001611049, "grad_norm": 1.4132530689239502, "tokens_seen": 59392, "elapsed_sec": 0.7502304170047864, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:26:55", "step": 30, "loss": 8.5481858253479, "lr": 0.0006, "tok_per_sec": 2732.970411338659, "grad_norm": 1.6723191738128662, "tokens_seen": 61440, "elapsed_sec": 0.7493677909951657, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "eval", "timestamp": "2026-03-13T14:26:55", "step": 30, "val_loss": 8.843989849090576, "perplexity": 6932.597552057813, "eval_batches": 2}
+ {"event": "checkpoint", "timestamp": "2026-03-13T14:27:06", "step": 30, "step_checkpoint": "checkpoints/pretrain_mps_dryrun/step_0000030.pt", "last_checkpoint": "checkpoints/pretrain_mps_dryrun/last.pt", "tokens_seen": 61440}
+ {"event": "train", "timestamp": "2026-03-13T14:27:07", "step": 31, "loss": 8.099784016609192, "lr": 0.00062, "tok_per_sec": 168.2314308371399, "grad_norm": 1.3632475137710571, "tokens_seen": 63488, "elapsed_sec": 12.173706125002354, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:27:08", "step": 32, "loss": 8.278108835220337, "lr": 0.00064, "tok_per_sec": 2688.7908394379037, "grad_norm": 1.1354058980941772, "tokens_seen": 65536, "elapsed_sec": 0.7616806669975631, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:27:08", "step": 33, "loss": 8.04857063293457, "lr": 0.00066, "tok_per_sec": 2695.7795378460673, "grad_norm": 0.8978219032287598, "tokens_seen": 67584, "elapsed_sec": 0.7597060409607366, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:27:09", "step": 34, "loss": 8.70958948135376, "lr": 0.00068, "tok_per_sec": 2707.39823225129, "grad_norm": 1.751259446144104, "tokens_seen": 69632, "elapsed_sec": 0.756445791979786, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:27:10", "step": 35, "loss": 8.077706575393677, "lr": 0.0007000000000000001, "tok_per_sec": 2689.526034181471, "grad_norm": 0.9328188896179199, "tokens_seen": 71680, "elapsed_sec": 0.761472457961645, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:27:11", "step": 36, "loss": 8.057007431983948, "lr": 0.00072, "tok_per_sec": 2729.9771946058904, "grad_norm": 0.7004730701446533, "tokens_seen": 73728, "elapsed_sec": 0.7501894169836305, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:27:11", "step": 37, "loss": 8.280940413475037, "lr": 0.00074, "tok_per_sec": 2712.4312913681806, "grad_norm": 0.8498008251190186, "tokens_seen": 75776, "elapsed_sec": 0.7550421669729985, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:27:12", "step": 38, "loss": 8.420085191726685, "lr": 0.0007599999999999999, "tok_per_sec": 2728.334548153266, "grad_norm": 0.9405263662338257, "tokens_seen": 77824, "elapsed_sec": 0.7506410829955712, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:27:13", "step": 39, "loss": 8.040002822875977, "lr": 0.00078, "tok_per_sec": 2734.281540014069, "grad_norm": 0.8642140030860901, "tokens_seen": 79872, "elapsed_sec": 0.7490084579912946, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:27:14", "step": 40, "loss": 8.193370580673218, "lr": 0.0008, "tok_per_sec": 2746.819492491367, "grad_norm": 0.9126524329185486, "tokens_seen": 81920, "elapsed_sec": 0.7455895830062218, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "eval", "timestamp": "2026-03-13T14:27:14", "step": 40, "val_loss": 8.36504077911377, "perplexity": 4294.2868516794715, "eval_batches": 2}
+ {"event": "checkpoint", "timestamp": "2026-03-13T14:27:26", "step": 40, "step_checkpoint": "checkpoints/pretrain_mps_dryrun/step_0000040.pt", "last_checkpoint": "checkpoints/pretrain_mps_dryrun/last.pt", "tokens_seen": 81920}
+ {"event": "train", "timestamp": "2026-03-13T14:27:27", "step": 41, "loss": 7.95119035243988, "lr": 0.00082, "tok_per_sec": 156.35387795373205, "grad_norm": 1.0787891149520874, "tokens_seen": 83968, "elapsed_sec": 13.098491874989122, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:27:28", "step": 42, "loss": 7.986739635467529, "lr": 0.00084, "tok_per_sec": 2722.8155804265352, "grad_norm": 0.9517979621887207, "tokens_seen": 86016, "elapsed_sec": 0.7521625829976983, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:27:28", "step": 43, "loss": 7.984379172325134, "lr": 0.0008600000000000001, "tok_per_sec": 2734.609061278845, "grad_norm": 0.9767814874649048, "tokens_seen": 88064, "elapsed_sec": 0.7489187500323169, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:27:29", "step": 44, "loss": 7.784951090812683, "lr": 0.0008799999999999999, "tok_per_sec": 2551.6733064235736, "grad_norm": 0.9357463717460632, "tokens_seen": 90112, "elapsed_sec": 0.8026105829630978, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:27:30", "step": 45, "loss": 8.117401361465454, "lr": 0.0009, "tok_per_sec": 2731.190447061073, "grad_norm": 0.7716737985610962, "tokens_seen": 92160, "elapsed_sec": 0.7498561670072377, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:27:31", "step": 46, "loss": 8.220598220825195, "lr": 0.00092, "tok_per_sec": 2729.7315806190586, "grad_norm": 0.7731218338012695, "tokens_seen": 94208, "elapsed_sec": 0.7502569170319475, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:27:31", "step": 47, "loss": 8.069997072219849, "lr": 0.00094, "tok_per_sec": 2718.9204771868167, "grad_norm": 1.0135213136672974, "tokens_seen": 96256, "elapsed_sec": 0.7532401249627583, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:27:32", "step": 48, "loss": 7.909337043762207, "lr": 0.00096, "tok_per_sec": 2717.9027900424403, "grad_norm": 1.088800072669983, "tokens_seen": 98304, "elapsed_sec": 0.7535221669822931, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:27:33", "step": 49, "loss": 7.95028281211853, "lr": 0.00098, "tok_per_sec": 2721.736196494166, "grad_norm": 1.794154167175293, "tokens_seen": 100352, "elapsed_sec": 0.7524608750245534, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "train", "timestamp": "2026-03-13T14:27:34", "step": 50, "loss": 8.580274820327759, "lr": 0.001, "tok_per_sec": 2714.6346388455563, "grad_norm": 1.2600723505020142, "tokens_seen": 102400, "elapsed_sec": 0.7544293330283836, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
+ {"event": "eval", "timestamp": "2026-03-13T14:27:34", "step": 50, "val_loss": 8.607748031616211, "perplexity": 5473.907720171149, "eval_batches": 2}
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142559.log ADDED
@@ -0,0 +1,69 @@
+ 2026-03-13 14:25:59,997 | INFO | Pretraining started
+ 2026-03-13 14:25:59,997 | INFO | Log file: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142559.log
+ 2026-03-13 14:25:59,997 | INFO | Metrics JSONL: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142559.jsonl
+ 2026-03-13 14:25:59,997 | INFO | Arguments | model_config=configs/model_70m.json train_config=configs/pretrain_mps_dryrun.json max_steps_override=None
+ 2026-03-13 14:25:59,997 | INFO | Model config | {'vocab_size': 49152, 'max_seq_len': 8192, 'd_model': 384, 'n_layers': 32, 'n_heads': 6, 'ffn_hidden_dim': 1024, 'rope_theta': 10000.0, 'rms_norm_eps': 1e-05, 'initializer_range': 0.02, 'dropout': 0.0, 'tie_word_embeddings': True, 'bias': False, 'pad_token_id': 0, 'bos_token_id': 1, 'eos_token_id': 2}
+ 2026-03-13 14:25:59,997 | INFO | Train config | {'seed': 42, 'train_dir': 'data/pretokenized/train', 'val_dir': 'data/pretokenized/val', 'output_dir': 'outputs/pretrain_mps_dryrun', 'checkpoint_dir': 'checkpoints/pretrain_mps_dryrun', 'init_from': None, 'resume_from': None, 'seq_len': 512, 'micro_batch_size': 1, 'grad_accum_steps': 4, 'max_steps': 500, 'warmup_steps': 50, 'learning_rate': 0.001, 'min_lr': 0.0001, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'precision': 'fp32', 'num_workers': 0, 'log_interval': 1, 'eval_interval': 10, 'eval_batches': 2, 'save_interval': 10, 'compile_model': False}
+ 2026-03-13 14:26:03,019 | INFO | Device summary | device=mps precision=fp32 compile_model=False
+ 2026-03-13 14:26:03,020 | INFO | Model summary | parameters=75.57M
+ 2026-03-13 14:26:03,020 | INFO | Batch summary | seq_len=512 micro_batch_size=1 grad_accum_steps=4 tokens_per_step=2,048
+ 2026-03-13 14:26:03,020 | INFO | Dataset summary | train_dir=data/pretokenized/train val_dir=data/pretokenized/val num_train_shards=100
+ 2026-03-13 14:26:04,222 | INFO | Train step | step=1 loss=10.8489 lr=0.000020 tok_per_sec=1,705 grad_norm=5.5737 tokens_seen=2.05K
+ 2026-03-13 14:26:05,183 | INFO | Train step | step=2 loss=10.8977 lr=0.000040 tok_per_sec=2,133 grad_norm=5.0279 tokens_seen=4.10K
+ 2026-03-13 14:26:05,936 | INFO | Train step | step=3 loss=10.7851 lr=0.000060 tok_per_sec=2,721 grad_norm=5.1142 tokens_seen=6.14K
+ 2026-03-13 14:26:06,680 | INFO | Train step | step=4 loss=10.6348 lr=0.000080 tok_per_sec=2,755 grad_norm=6.4229 tokens_seen=8.19K
+ 2026-03-13 14:26:07,424 | INFO | Train step | step=5 loss=10.7475 lr=0.000100 tok_per_sec=2,752 grad_norm=6.5803 tokens_seen=10.24K
+ 2026-03-13 14:26:08,173 | INFO | Train step | step=6 loss=10.6336 lr=0.000120 tok_per_sec=2,738 grad_norm=6.5251 tokens_seen=12.29K
+ 2026-03-13 14:26:08,918 | INFO | Train step | step=7 loss=10.4779 lr=0.000140 tok_per_sec=2,750 grad_norm=5.1896 tokens_seen=14.34K
+ 2026-03-13 14:26:09,663 | INFO | Train step | step=8 loss=10.3537 lr=0.000160 tok_per_sec=2,754 grad_norm=2.3572 tokens_seen=16.38K
+ 2026-03-13 14:26:10,408 | INFO | Train step | step=9 loss=10.3396 lr=0.000180 tok_per_sec=2,753 grad_norm=4.9875 tokens_seen=18.43K
+ 2026-03-13 14:26:11,152 | INFO | Train step | step=10 loss=10.2073 lr=0.000200 tok_per_sec=2,755 grad_norm=5.5540 tokens_seen=20.48K
+ 2026-03-13 14:26:11,326 | INFO | Eval step | step=10 val_loss=10.4182 perplexity=33464.41
+ 2026-03-13 14:26:27,213 | INFO | Checkpoint saved | step=10 step_checkpoint=checkpoints/pretrain_mps_dryrun/step_0000010.pt last_checkpoint=checkpoints/pretrain_mps_dryrun/last.pt
+ 2026-03-13 14:26:28,342 | INFO | Train step | step=11 loss=10.1618 lr=0.000220 tok_per_sec=119 grad_norm=2.1779 tokens_seen=22.53K
+ 2026-03-13 14:26:29,107 | INFO | Train step | step=12 loss=10.0479 lr=0.000240 tok_per_sec=2,680 grad_norm=1.9738 tokens_seen=24.58K
+ 2026-03-13 14:26:29,865 | INFO | Train step | step=13 loss=10.1055 lr=0.000260 tok_per_sec=2,702 grad_norm=4.0965 tokens_seen=26.62K
+ 2026-03-13 14:26:30,617 | INFO | Train step | step=14 loss=9.9410 lr=0.000280 tok_per_sec=2,723 grad_norm=1.8798 tokens_seen=28.67K
+ 2026-03-13 14:26:31,366 | INFO | Train step | step=15 loss=9.8521 lr=0.000300 tok_per_sec=2,738 grad_norm=3.7492 tokens_seen=30.72K
+ 2026-03-13 14:26:32,109 | INFO | Train step | step=16 loss=9.7686 lr=0.000320 tok_per_sec=2,756 grad_norm=1.8649 tokens_seen=32.77K
+ 2026-03-13 14:26:32,851 | INFO | Train step | step=17 loss=9.8800 lr=0.000340 tok_per_sec=2,763 grad_norm=1.8526 tokens_seen=34.82K
+ 2026-03-13 14:26:33,593 | INFO | Train step | step=18 loss=9.6528 lr=0.000360 tok_per_sec=2,762 grad_norm=2.2104 tokens_seen=36.86K
+ 2026-03-13 14:26:34,339 | INFO | Train step | step=19 loss=9.3711 lr=0.000380 tok_per_sec=2,746 grad_norm=2.0032 tokens_seen=38.91K
+ 2026-03-13 14:26:35,084 | INFO | Train step | step=20 loss=9.2901 lr=0.000400 tok_per_sec=2,753 grad_norm=1.9113 tokens_seen=40.96K
+ 2026-03-13 14:26:35,243 | INFO | Eval step | step=20 val_loss=9.6180 perplexity=15033.21
+ 2026-03-13 14:26:47,586 | INFO | Checkpoint saved | step=20 step_checkpoint=checkpoints/pretrain_mps_dryrun/step_0000020.pt last_checkpoint=checkpoints/pretrain_mps_dryrun/last.pt
+ 2026-03-13 14:26:48,510 | INFO | Train step | step=21 loss=9.2410 lr=0.000420 tok_per_sec=153 grad_norm=1.8868 tokens_seen=43.01K
+ 2026-03-13 14:26:49,266 | INFO | Train step | step=22 loss=9.1267 lr=0.000440 tok_per_sec=2,713 grad_norm=1.8414 tokens_seen=45.06K
+ 2026-03-13 14:26:50,022 | INFO | Train step | step=23 loss=9.1024 lr=0.000460 tok_per_sec=2,709 grad_norm=1.6314 tokens_seen=47.10K
+ 2026-03-13 14:26:50,775 | INFO | Train step | step=24 loss=8.8112 lr=0.000480 tok_per_sec=2,722 grad_norm=1.8163 tokens_seen=49.15K
+ 2026-03-13 14:26:51,527 | INFO | Train step | step=25 loss=8.6318 lr=0.000500 tok_per_sec=2,727 grad_norm=1.7629 tokens_seen=51.20K
+ 2026-03-13 14:26:52,279 | INFO | Train step | step=26 loss=8.7227 lr=0.000520 tok_per_sec=2,723 grad_norm=1.6617 tokens_seen=53.25K
+ 2026-03-13 14:26:53,024 | INFO | Train step | step=27 loss=8.6090 lr=0.000540 tok_per_sec=2,753 grad_norm=1.4688 tokens_seen=55.30K
+ 2026-03-13 14:26:53,770 | INFO | Train step | step=28 loss=8.8903 lr=0.000560 tok_per_sec=2,748 grad_norm=1.3300 tokens_seen=57.34K
+ 2026-03-13 14:26:54,520 | INFO | Train step | step=29 loss=8.2670 lr=0.000580 tok_per_sec=2,730 grad_norm=1.4133 tokens_seen=59.39K
+ 2026-03-13 14:26:55,270 | INFO | Train step | step=30 loss=8.5482 lr=0.000600 tok_per_sec=2,733 grad_norm=1.6723 tokens_seen=61.44K
+ 2026-03-13 14:26:55,430 | INFO | Eval step | step=30 val_loss=8.8440 perplexity=6932.60
+ 2026-03-13 14:27:06,544 | INFO | Checkpoint saved | step=30 step_checkpoint=checkpoints/pretrain_mps_dryrun/step_0000030.pt last_checkpoint=checkpoints/pretrain_mps_dryrun/last.pt
+ 2026-03-13 14:27:07,445 | INFO | Train step | step=31 loss=8.0998 lr=0.000620 tok_per_sec=168 grad_norm=1.3632 tokens_seen=63.49K
+ 2026-03-13 14:27:08,207 | INFO | Train step | step=32 loss=8.2781 lr=0.000640 tok_per_sec=2,689 grad_norm=1.1354 tokens_seen=65.54K
+ 2026-03-13 14:27:08,967 | INFO | Train step | step=33 loss=8.0486 lr=0.000660 tok_per_sec=2,696 grad_norm=0.8978 tokens_seen=67.58K
+ 2026-03-13 14:27:09,724 | INFO | Train step | step=34 loss=8.7096 lr=0.000680 tok_per_sec=2,707 grad_norm=1.7513 tokens_seen=69.63K
+ 2026-03-13 14:27:10,486 | INFO | Train step | step=35 loss=8.0777 lr=0.000700 tok_per_sec=2,690 grad_norm=0.9328 tokens_seen=71.68K
+ 2026-03-13 14:27:11,237 | INFO | Train step | step=36 loss=8.0570 lr=0.000720 tok_per_sec=2,730 grad_norm=0.7005 tokens_seen=73.73K
+ 2026-03-13 14:27:11,993 | INFO | Train step | step=37 loss=8.2809 lr=0.000740 tok_per_sec=2,712 grad_norm=0.8498 tokens_seen=75.78K
+ 2026-03-13 14:27:12,744 | INFO | Train step | step=38 loss=8.4201 lr=0.000760 tok_per_sec=2,728 grad_norm=0.9405 tokens_seen=77.82K
+ 2026-03-13 14:27:13,493 | INFO | Train step | step=39 loss=8.0400 lr=0.000780 tok_per_sec=2,734 grad_norm=0.8642 tokens_seen=79.87K
+ 2026-03-13 14:27:14,239 | INFO | Train step | step=40 loss=8.1934 lr=0.000800 tok_per_sec=2,747 grad_norm=0.9127 tokens_seen=81.92K
+ 2026-03-13 14:27:14,399 | INFO | Eval step | step=40 val_loss=8.3650 perplexity=4294.29
+ 2026-03-13 14:27:26,408 | INFO | Checkpoint saved | step=40 step_checkpoint=checkpoints/pretrain_mps_dryrun/step_0000040.pt last_checkpoint=checkpoints/pretrain_mps_dryrun/last.pt
+ 2026-03-13 14:27:27,338 | INFO | Train step | step=41 loss=7.9512 lr=0.000820 tok_per_sec=156 grad_norm=1.0788 tokens_seen=83.97K
+ 2026-03-13 14:27:28,091 | INFO | Train step | step=42 loss=7.9867 lr=0.000840 tok_per_sec=2,723 grad_norm=0.9518 tokens_seen=86.02K
+ 2026-03-13 14:27:28,841 | INFO | Train step | step=43 loss=7.9844 lr=0.000860 tok_per_sec=2,735 grad_norm=0.9768 tokens_seen=88.06K
+ 2026-03-13 14:27:29,644 | INFO | Train step | step=44 loss=7.7850 lr=0.000880 tok_per_sec=2,552 grad_norm=0.9357 tokens_seen=90.11K
+ 2026-03-13 14:27:30,394 | INFO | Train step | step=45 loss=8.1174 lr=0.000900 tok_per_sec=2,731 grad_norm=0.7717 tokens_seen=92.16K
+ 2026-03-13 14:27:31,145 | INFO | Train step | step=46 loss=8.2206 lr=0.000920 tok_per_sec=2,730 grad_norm=0.7731 tokens_seen=94.21K
65
+ 2026-03-13 14:27:31,898 | INFO | Train step | step=47 loss=8.0700 lr=0.000940 tok_per_sec=2,719 grad_norm=1.0135 tokens_seen=96.26K
66
+ 2026-03-13 14:27:32,652 | INFO | Train step | step=48 loss=7.9093 lr=0.000960 tok_per_sec=2,718 grad_norm=1.0888 tokens_seen=98.30K
67
+ 2026-03-13 14:27:33,406 | INFO | Train step | step=49 loss=7.9503 lr=0.000980 tok_per_sec=2,722 grad_norm=1.7942 tokens_seen=100.35K
68
+ 2026-03-13 14:27:34,161 | INFO | Train step | step=50 loss=8.5803 lr=0.001000 tok_per_sec=2,715 grad_norm=1.2601 tokens_seen=102.40K
69
+ 2026-03-13 14:27:34,323 | INFO | Eval step | step=50 val_loss=8.6077 perplexity=5473.91
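Note: the tok_per_sec dips immediately after each checkpoint save (168 at step 31, 156 at step 41) are a timing artifact rather than a real slowdown: the per-step timer spans the ~11 s checkpoint write, so step 31 covers roughly 14:26:55 to 14:27:07 and 2,048 tokens / ≈12.2 s ≈ 168 tok/s.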
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_143014.jsonl ADDED
@@ -0,0 +1,13 @@
1
+ {"event": "run_started", "timestamp": "2026-03-13T14:30:14", "log_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_143014.log", "metrics_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_143014.jsonl", "model_config": {"vocab_size": 49152, "max_seq_len": 8192, "d_model": 384, "n_layers": 32, "n_heads": 6, "ffn_hidden_dim": 1024, "rope_theta": 10000.0, "rms_norm_eps": 1e-05, "initializer_range": 0.02, "dropout": 0.0, "tie_word_embeddings": true, "bias": false, "pad_token_id": 0, "bos_token_id": 1, "eos_token_id": 2}, "train_config": {"seed": 42, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val", "output_dir": "outputs/pretrain_mps_dryrun", "checkpoint_dir": "checkpoints/pretrain_mps_dryrun", "init_from": null, "resume_from": null, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "max_steps": 500, "warmup_steps": 50, "learning_rate": 0.001, "min_lr": 0.0001, "weight_decay": 0.1, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "precision": "fp32", "num_workers": 0, "log_interval": 1, "eval_interval": 10, "eval_batches": 2, "save_interval": 10, "compile_model": false}, "args": {"model_config": "configs/model_70m.json", "train_config": "configs/pretrain_mps_dryrun.json", "max_steps_override": null}}
2
+ {"event": "runtime_summary", "timestamp": "2026-03-13T14:30:17", "device": "mps", "precision": "fp32", "compile_model": false, "parameters": 75571584, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "tokens_per_step": 2048, "num_train_shards": 100, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val"}
3
+ {"event": "train", "timestamp": "2026-03-13T14:30:18", "step": 1, "loss": 10.848917245864868, "lr": 2e-05, "tok_per_sec": 1662.7320321814002, "grad_norm": 5.573695659637451, "tokens_seen": 2048, "elapsed_sec": 1.2317077919724397, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
4
+ {"event": "train", "timestamp": "2026-03-13T14:30:19", "step": 2, "loss": 10.897652626037598, "lr": 4e-05, "tok_per_sec": 2559.899335289762, "grad_norm": 5.0279011726379395, "tokens_seen": 4096, "elapsed_sec": 0.8000314589589834, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
5
+ {"event": "train", "timestamp": "2026-03-13T14:30:19", "step": 3, "loss": 10.785077333450317, "lr": 6e-05, "tok_per_sec": 2701.710782349432, "grad_norm": 5.1141676902771, "tokens_seen": 6144, "elapsed_sec": 0.7580382080050185, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
6
+ {"event": "train", "timestamp": "2026-03-13T14:30:20", "step": 4, "loss": 10.634832859039307, "lr": 8e-05, "tok_per_sec": 2712.1050200194168, "grad_norm": 6.422860622406006, "tokens_seen": 8192, "elapsed_sec": 0.7551329999696463, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
7
+ {"event": "train", "timestamp": "2026-03-13T14:30:21", "step": 5, "loss": 10.747493743896484, "lr": 0.0001, "tok_per_sec": 2709.8966635877678, "grad_norm": 6.580272674560547, "tokens_seen": 10240, "elapsed_sec": 0.7557483750279061, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
8
+ {"event": "train", "timestamp": "2026-03-13T14:30:22", "step": 6, "loss": 10.633646488189697, "lr": 0.00012, "tok_per_sec": 2711.349354437794, "grad_norm": 6.525122165679932, "tokens_seen": 12288, "elapsed_sec": 0.7553434590226971, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
9
+ {"event": "train", "timestamp": "2026-03-13T14:30:22", "step": 7, "loss": 10.477944374084473, "lr": 0.00014000000000000001, "tok_per_sec": 2702.4625638908824, "grad_norm": 5.189583778381348, "tokens_seen": 14336, "elapsed_sec": 0.7578273339895532, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
10
+ {"event": "train", "timestamp": "2026-03-13T14:30:23", "step": 8, "loss": 10.35365605354309, "lr": 0.00016, "tok_per_sec": 2709.4527031511175, "grad_norm": 2.357203960418701, "tokens_seen": 16384, "elapsed_sec": 0.7558722090325318, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
11
+ {"event": "train", "timestamp": "2026-03-13T14:30:24", "step": 9, "loss": 10.339627742767334, "lr": 0.00018, "tok_per_sec": 2711.198605030451, "grad_norm": 4.987534999847412, "tokens_seen": 18432, "elapsed_sec": 0.7553854580037296, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
12
+ {"event": "train", "timestamp": "2026-03-13T14:30:25", "step": 10, "loss": 10.207262754440308, "lr": 0.0002, "tok_per_sec": 2718.8463334608755, "grad_norm": 5.554019451141357, "tokens_seen": 20480, "elapsed_sec": 0.753260666038841, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
13
+ {"event": "eval", "timestamp": "2026-03-13T14:30:25", "step": 10, "val_loss": 10.418238162994385, "perplexity": 33464.423318005785, "eval_batches": 2}
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_143014.log ADDED
@@ -0,0 +1,21 @@
1
+ 2026-03-13 14:30:14,410 | INFO | Pretraining started
2
+ 2026-03-13 14:30:14,410 | INFO | Log file: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_143014.log
3
+ 2026-03-13 14:30:14,410 | INFO | Metrics JSONL: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_143014.jsonl
4
+ 2026-03-13 14:30:14,410 | INFO | Arguments | model_config=configs/model_70m.json train_config=configs/pretrain_mps_dryrun.json max_steps_override=None
5
+ 2026-03-13 14:30:14,410 | INFO | Model config | {'vocab_size': 49152, 'max_seq_len': 8192, 'd_model': 384, 'n_layers': 32, 'n_heads': 6, 'ffn_hidden_dim': 1024, 'rope_theta': 10000.0, 'rms_norm_eps': 1e-05, 'initializer_range': 0.02, 'dropout': 0.0, 'tie_word_embeddings': True, 'bias': False, 'pad_token_id': 0, 'bos_token_id': 1, 'eos_token_id': 2}
6
+ 2026-03-13 14:30:14,410 | INFO | Train config | {'seed': 42, 'train_dir': 'data/pretokenized/train', 'val_dir': 'data/pretokenized/val', 'output_dir': 'outputs/pretrain_mps_dryrun', 'checkpoint_dir': 'checkpoints/pretrain_mps_dryrun', 'init_from': None, 'resume_from': None, 'seq_len': 512, 'micro_batch_size': 1, 'grad_accum_steps': 4, 'max_steps': 500, 'warmup_steps': 50, 'learning_rate': 0.001, 'min_lr': 0.0001, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'precision': 'fp32', 'num_workers': 0, 'log_interval': 1, 'eval_interval': 10, 'eval_batches': 2, 'save_interval': 10, 'compile_model': False}
7
+ 2026-03-13 14:30:17,140 | INFO | Device summary | device=mps precision=fp32 compile_model=False
8
+ 2026-03-13 14:30:17,141 | INFO | Model summary | parameters=75.57M
9
+ 2026-03-13 14:30:17,141 | INFO | Batch summary | seq_len=512 micro_batch_size=1 grad_accum_steps=4 tokens_per_step=2,048
10
+ 2026-03-13 14:30:17,141 | INFO | Dataset summary | train_dir=data/pretokenized/train val_dir=data/pretokenized/val num_train_shards=100
11
+ 2026-03-13 14:30:18,374 | INFO | Train step | step=1 loss=10.8489 lr=0.000020 tok_per_sec=1,663 grad_norm=5.5737 tokens_seen=2.05K
12
+ 2026-03-13 14:30:19,174 | INFO | Train step | step=2 loss=10.8977 lr=0.000040 tok_per_sec=2,560 grad_norm=5.0279 tokens_seen=4.10K
13
+ 2026-03-13 14:30:19,933 | INFO | Train step | step=3 loss=10.7851 lr=0.000060 tok_per_sec=2,702 grad_norm=5.1142 tokens_seen=6.14K
14
+ 2026-03-13 14:30:20,688 | INFO | Train step | step=4 loss=10.6348 lr=0.000080 tok_per_sec=2,712 grad_norm=6.4229 tokens_seen=8.19K
15
+ 2026-03-13 14:30:21,445 | INFO | Train step | step=5 loss=10.7475 lr=0.000100 tok_per_sec=2,710 grad_norm=6.5803 tokens_seen=10.24K
16
+ 2026-03-13 14:30:22,201 | INFO | Train step | step=6 loss=10.6336 lr=0.000120 tok_per_sec=2,711 grad_norm=6.5251 tokens_seen=12.29K
17
+ 2026-03-13 14:30:22,959 | INFO | Train step | step=7 loss=10.4779 lr=0.000140 tok_per_sec=2,702 grad_norm=5.1896 tokens_seen=14.34K
18
+ 2026-03-13 14:30:23,715 | INFO | Train step | step=8 loss=10.3537 lr=0.000160 tok_per_sec=2,709 grad_norm=2.3572 tokens_seen=16.38K
19
+ 2026-03-13 14:30:24,471 | INFO | Train step | step=9 loss=10.3396 lr=0.000180 tok_per_sec=2,711 grad_norm=4.9875 tokens_seen=18.43K
20
+ 2026-03-13 14:30:25,225 | INFO | Train step | step=10 loss=10.2073 lr=0.000200 tok_per_sec=2,719 grad_norm=5.5540 tokens_seen=20.48K
21
+ 2026-03-13 14:30:25,394 | INFO | Eval step | step=10 val_loss=10.4182 perplexity=33464.42
outputs/pretrain_mps_dryrun/run_config.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "model_config": {
3
+ "vocab_size": 49152,
4
+ "max_seq_len": 8192,
5
+ "d_model": 384,
6
+ "n_layers": 32,
7
+ "n_heads": 6,
8
+ "ffn_hidden_dim": 1024,
9
+ "rope_theta": 10000.0,
10
+ "rms_norm_eps": 1e-05,
11
+ "initializer_range": 0.02,
12
+ "dropout": 0.0,
13
+ "tie_word_embeddings": true,
14
+ "bias": false,
15
+ "pad_token_id": 0,
16
+ "bos_token_id": 1,
17
+ "eos_token_id": 2
18
+ },
19
+ "train_config": {
20
+ "seed": 42,
21
+ "train_dir": "data/pretokenized/train",
22
+ "val_dir": "data/pretokenized/val",
23
+ "output_dir": "outputs/pretrain_mps_dryrun",
24
+ "checkpoint_dir": "checkpoints/pretrain_mps_dryrun",
25
+ "init_from": null,
26
+ "resume_from": null,
27
+ "seq_len": 512,
28
+ "micro_batch_size": 1,
29
+ "grad_accum_steps": 4,
30
+ "max_steps": 500,
31
+ "warmup_steps": 50,
32
+ "learning_rate": 0.001,
33
+ "min_lr": 0.0001,
34
+ "weight_decay": 0.1,
35
+ "beta1": 0.9,
36
+ "beta2": 0.95,
37
+ "grad_clip": 1.0,
38
+ "precision": "fp32",
39
+ "num_workers": 0,
40
+ "log_interval": 1,
41
+ "eval_interval": 10,
42
+ "eval_batches": 2,
43
+ "save_interval": 10,
44
+ "compile_model": false
45
+ }
46
+ }
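Note: a quick consistency check on this dry-run config: tokens per optimizer step = seq_len × micro_batch_size × grad_accum_steps = 512 × 1 × 4 = 2,048, which matches the "tokens_per_step": 2048 reported in the runtime_summary event above.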
outputs/pretrain_stage1/.ipynb_checkpoints/run_config-checkpoint.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "model_config": {
3
+ "vocab_size": 49152,
4
+ "max_seq_len": 8192,
5
+ "d_model": 384,
6
+ "n_layers": 32,
7
+ "n_heads": 6,
8
+ "ffn_hidden_dim": 1024,
9
+ "rope_theta": 10000.0,
10
+ "rms_norm_eps": 1e-05,
11
+ "initializer_range": 0.02,
12
+ "dropout": 0.0,
13
+ "tie_word_embeddings": true,
14
+ "bias": false,
15
+ "pad_token_id": 0,
16
+ "bos_token_id": 1,
17
+ "eos_token_id": 2
18
+ },
19
+ "train_config": {
20
+ "seed": 42,
21
+ "train_dir": "data/pretokenized/train",
22
+ "val_dir": "data/pretokenized/val",
23
+ "output_dir": "outputs/pretrain_stage1",
24
+ "checkpoint_dir": "checkpoints/pretrain_stage1",
25
+ "init_from": null,
26
+ "resume_from": null,
27
+ "seq_len": 2048,
28
+ "micro_batch_size": 8,
29
+ "grad_accum_steps": 32,
30
+ "max_steps": 20000,
31
+ "warmup_steps": 2000,
32
+ "learning_rate": 0.003,
33
+ "min_lr": 0.0003,
34
+ "weight_decay": 0.1,
35
+ "beta1": 0.9,
36
+ "beta2": 0.95,
37
+ "grad_clip": 1.0,
38
+ "precision": "bf16",
39
+ "num_workers": 0,
40
+ "log_interval": 10,
41
+ "eval_interval": 250,
42
+ "eval_batches": 50,
43
+ "save_interval": 100,
44
+ "compile_model": false
45
+ }
46
+ }
outputs/pretrain_stage1/logs/.ipynb_checkpoints/train_pretrain_20260313_152202-checkpoint.log ADDED
@@ -0,0 +1,82 @@
1
+ 2026-03-13 15:22:02,275 | INFO | Pretraining started
2
+ 2026-03-13 15:22:02,276 | INFO | Log file: outputs/pretrain_stage1/logs/train_pretrain_20260313_152202.log
3
+ 2026-03-13 15:22:02,276 | INFO | Metrics JSONL: outputs/pretrain_stage1/logs/train_pretrain_20260313_152202.jsonl
4
+ 2026-03-13 15:22:02,276 | INFO | Arguments | model_config=configs/model_70m.json train_config=configs/pretrain_5090_stage1.json max_steps_override=None
5
+ 2026-03-13 15:22:02,276 | INFO | Model config | {'vocab_size': 49152, 'max_seq_len': 8192, 'd_model': 384, 'n_layers': 32, 'n_heads': 6, 'ffn_hidden_dim': 1024, 'rope_theta': 10000.0, 'rms_norm_eps': 1e-05, 'initializer_range': 0.02, 'dropout': 0.0, 'tie_word_embeddings': True, 'bias': False, 'pad_token_id': 0, 'bos_token_id': 1, 'eos_token_id': 2}
6
+ 2026-03-13 15:22:02,276 | INFO | Train config | {'seed': 42, 'train_dir': 'data/pretokenized/train', 'val_dir': 'data/pretokenized/val', 'output_dir': 'outputs/pretrain_stage1', 'checkpoint_dir': 'checkpoints/pretrain_stage1', 'init_from': None, 'resume_from': None, 'seq_len': 2048, 'micro_batch_size': 8, 'grad_accum_steps': 32, 'max_steps': 20000, 'warmup_steps': 2000, 'learning_rate': 0.003, 'min_lr': 0.0003, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'precision': 'bf16', 'num_workers': 0, 'log_interval': 10, 'eval_interval': 250, 'eval_batches': 50, 'save_interval': 100, 'compile_model': False}
7
+ 2026-03-13 15:22:05,635 | INFO | Device summary | device=cuda precision=bf16 compile_model=False
8
+ 2026-03-13 15:22:05,636 | INFO | Model summary | parameters=75.57M
9
+ 2026-03-13 15:22:05,636 | INFO | Batch summary | seq_len=2048 micro_batch_size=8 grad_accum_steps=32 tokens_per_step=524,288
10
+ 2026-03-13 15:22:05,636 | INFO | Dataset summary | train_dir=data/pretokenized/train val_dir=data/pretokenized/val num_train_shards=100
11
+ 2026-03-13 15:22:48,364 | INFO | Train step | step=10 loss=10.7962 lr=0.000015 tok_per_sec=122,709 grad_norm=2.2954 tokens_seen=5.24M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
12
+ 2026-03-13 15:23:29,710 | INFO | Train step | step=20 loss=10.3929 lr=0.000030 tok_per_sec=126,809 grad_norm=1.6374 tokens_seen=10.49M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
13
+ 2026-03-13 15:24:11,004 | INFO | Train step | step=30 loss=10.1422 lr=0.000045 tok_per_sec=126,967 grad_norm=1.6471 tokens_seen=15.73M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
14
+ 2026-03-13 15:24:53,082 | INFO | Train step | step=40 loss=9.9494 lr=0.000060 tok_per_sec=124,605 grad_norm=1.5930 tokens_seen=20.97M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
15
+ 2026-03-13 15:25:36,401 | INFO | Train step | step=50 loss=9.6967 lr=0.000075 tok_per_sec=121,032 grad_norm=1.5725 tokens_seen=26.21M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
16
+ 2026-03-13 15:26:19,167 | INFO | Train step | step=60 loss=9.3897 lr=0.000090 tok_per_sec=122,597 grad_norm=1.5564 tokens_seen=31.46M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
17
+ 2026-03-13 15:27:01,620 | INFO | Train step | step=70 loss=9.0575 lr=0.000105 tok_per_sec=123,501 grad_norm=1.5012 tokens_seen=36.70M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
18
+ 2026-03-13 15:27:43,580 | INFO | Train step | step=80 loss=8.6948 lr=0.000120 tok_per_sec=124,954 grad_norm=1.5047 tokens_seen=41.94M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
19
+ 2026-03-13 15:28:26,150 | INFO | Train step | step=90 loss=8.3511 lr=0.000135 tok_per_sec=123,163 grad_norm=1.2600 tokens_seen=47.19M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
20
+ 2026-03-13 15:29:09,001 | INFO | Train step | step=100 loss=8.0548 lr=0.000150 tok_per_sec=122,354 grad_norm=0.9670 tokens_seen=52.43M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
21
+ 2026-03-13 15:29:12,440 | INFO | Checkpoint saved | step=100 step_checkpoint=checkpoints/pretrain_stage1/step_0000100.pt last_checkpoint=checkpoints/pretrain_stage1/last.pt
22
+ 2026-03-13 15:29:54,815 | INFO | Train step | step=110 loss=7.8111 lr=0.000165 tok_per_sec=114,442 grad_norm=0.7305 tokens_seen=57.67M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
23
+ 2026-03-13 15:30:37,782 | INFO | Train step | step=120 loss=7.6241 lr=0.000180 tok_per_sec=122,024 grad_norm=0.5833 tokens_seen=62.91M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
24
+ 2026-03-13 15:31:19,995 | INFO | Train step | step=130 loss=7.4835 lr=0.000195 tok_per_sec=124,205 grad_norm=1.0428 tokens_seen=68.16M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
25
+ 2026-03-13 15:32:03,842 | INFO | Train step | step=140 loss=7.3397 lr=0.000210 tok_per_sec=119,576 grad_norm=0.6136 tokens_seen=73.40M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
26
+ 2026-03-13 15:32:45,816 | INFO | Train step | step=150 loss=7.1952 lr=0.000225 tok_per_sec=124,911 grad_norm=1.2209 tokens_seen=78.64M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
27
+ 2026-03-13 15:33:27,516 | INFO | Train step | step=160 loss=7.0569 lr=0.000240 tok_per_sec=125,732 grad_norm=0.9325 tokens_seen=83.89M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
28
+ 2026-03-13 15:34:09,763 | INFO | Train step | step=170 loss=6.9308 lr=0.000255 tok_per_sec=124,102 grad_norm=1.1994 tokens_seen=89.13M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
29
+ 2026-03-13 15:34:51,580 | INFO | Train step | step=180 loss=6.7975 lr=0.000270 tok_per_sec=125,380 grad_norm=1.2646 tokens_seen=94.37M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
30
+ 2026-03-13 15:35:33,306 | INFO | Train step | step=190 loss=6.6834 lr=0.000285 tok_per_sec=125,653 grad_norm=0.9549 tokens_seen=99.61M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
31
+ 2026-03-13 15:36:16,369 | INFO | Train step | step=200 loss=6.5762 lr=0.000300 tok_per_sec=121,752 grad_norm=1.5983 tokens_seen=104.86M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
32
+ 2026-03-13 15:36:18,831 | INFO | Checkpoint saved | step=200 step_checkpoint=checkpoints/pretrain_stage1/step_0000200.pt last_checkpoint=checkpoints/pretrain_stage1/last.pt
33
+ 2026-03-13 15:37:01,458 | INFO | Train step | step=210 loss=6.4800 lr=0.000315 tok_per_sec=116,281 grad_norm=0.9575 tokens_seen=110.10M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
34
+ 2026-03-13 15:37:43,418 | INFO | Train step | step=220 loss=6.3799 lr=0.000330 tok_per_sec=124,955 grad_norm=1.0189 tokens_seen=115.34M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
35
+ 2026-03-13 15:38:25,012 | INFO | Train step | step=230 loss=6.3007 lr=0.000345 tok_per_sec=126,050 grad_norm=1.4322 tokens_seen=120.59M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
36
+ 2026-03-13 15:39:07,059 | INFO | Train step | step=240 loss=6.2100 lr=0.000360 tok_per_sec=124,696 grad_norm=1.4284 tokens_seen=125.83M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
37
+ 2026-03-13 15:39:48,188 | INFO | Train step | step=250 loss=6.1378 lr=0.000375 tok_per_sec=127,476 grad_norm=0.8502 tokens_seen=131.07M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
38
+ 2026-03-13 15:39:50,336 | INFO | Eval step | step=250 val_loss=6.1302 perplexity=459.54
39
+ 2026-03-13 15:40:32,241 | INFO | Train step | step=260 loss=6.0712 lr=0.000390 tok_per_sec=119,017 grad_norm=1.5691 tokens_seen=136.31M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
40
+ 2026-03-13 15:41:15,172 | INFO | Train step | step=270 loss=6.0020 lr=0.000405 tok_per_sec=122,129 grad_norm=1.3161 tokens_seen=141.56M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
41
+ 2026-03-13 15:41:57,319 | INFO | Train step | step=280 loss=5.9392 lr=0.000420 tok_per_sec=124,398 grad_norm=1.3891 tokens_seen=146.80M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
42
+ 2026-03-13 15:42:39,253 | INFO | Train step | step=290 loss=5.8713 lr=0.000435 tok_per_sec=125,030 grad_norm=1.1325 tokens_seen=152.04M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
43
+ 2026-03-13 15:43:21,127 | INFO | Train step | step=300 loss=5.8109 lr=0.000450 tok_per_sec=125,209 grad_norm=1.0078 tokens_seen=157.29M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
44
+ 2026-03-13 15:43:23,493 | INFO | Checkpoint saved | step=300 step_checkpoint=checkpoints/pretrain_stage1/step_0000300.pt last_checkpoint=checkpoints/pretrain_stage1/last.pt
45
+ 2026-03-13 15:44:04,852 | INFO | Train step | step=310 loss=5.7384 lr=0.000465 tok_per_sec=119,907 grad_norm=1.2581 tokens_seen=162.53M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
46
+ 2026-03-13 15:44:46,724 | INFO | Train step | step=320 loss=5.6798 lr=0.000480 tok_per_sec=125,216 grad_norm=0.9680 tokens_seen=167.77M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
47
+ 2026-03-13 15:45:29,877 | INFO | Train step | step=330 loss=5.6204 lr=0.000495 tok_per_sec=121,497 grad_norm=1.4606 tokens_seen=173.02M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
48
+ 2026-03-13 15:46:12,195 | INFO | Train step | step=340 loss=5.5678 lr=0.000510 tok_per_sec=123,896 grad_norm=1.2717 tokens_seen=178.26M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
49
+ 2026-03-13 15:46:54,104 | INFO | Train step | step=350 loss=5.5266 lr=0.000525 tok_per_sec=125,105 grad_norm=1.6313 tokens_seen=183.50M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
50
+ 2026-03-13 15:47:36,611 | INFO | Train step | step=360 loss=5.4781 lr=0.000540 tok_per_sec=123,343 grad_norm=1.0196 tokens_seen=188.74M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
51
+ 2026-03-13 15:48:18,069 | INFO | Train step | step=370 loss=5.4230 lr=0.000555 tok_per_sec=126,468 grad_norm=1.0206 tokens_seen=193.99M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
52
+ 2026-03-13 15:49:00,176 | INFO | Train step | step=380 loss=5.3519 lr=0.000570 tok_per_sec=124,516 grad_norm=0.7121 tokens_seen=199.23M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
53
+ 2026-03-13 15:49:42,492 | INFO | Train step | step=390 loss=5.3026 lr=0.000585 tok_per_sec=123,899 grad_norm=1.0407 tokens_seen=204.47M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
54
+ 2026-03-13 15:50:24,343 | INFO | Train step | step=400 loss=5.2721 lr=0.000600 tok_per_sec=125,278 grad_norm=0.7830 tokens_seen=209.72M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
55
+ 2026-03-13 15:50:27,093 | INFO | Checkpoint saved | step=400 step_checkpoint=checkpoints/pretrain_stage1/step_0000400.pt last_checkpoint=checkpoints/pretrain_stage1/last.pt
56
+ 2026-03-13 15:51:08,698 | INFO | Train step | step=410 loss=5.2136 lr=0.000615 tok_per_sec=118,206 grad_norm=0.6625 tokens_seen=214.96M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
57
+ 2026-03-13 15:51:50,429 | INFO | Train step | step=420 loss=5.1839 lr=0.000630 tok_per_sec=125,640 grad_norm=1.1878 tokens_seen=220.20M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
58
+ 2026-03-13 15:52:31,919 | INFO | Train step | step=430 loss=5.1433 lr=0.000645 tok_per_sec=126,367 grad_norm=1.0909 tokens_seen=225.44M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
59
+ 2026-03-13 15:53:14,261 | INFO | Train step | step=440 loss=5.0811 lr=0.000660 tok_per_sec=123,827 grad_norm=1.0818 tokens_seen=230.69M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
60
+ 2026-03-13 15:53:55,426 | INFO | Train step | step=450 loss=5.0691 lr=0.000675 tok_per_sec=127,367 grad_norm=0.8735 tokens_seen=235.93M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
61
+ 2026-03-13 15:54:36,815 | INFO | Train step | step=460 loss=5.0245 lr=0.000690 tok_per_sec=126,676 grad_norm=0.7781 tokens_seen=241.17M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
62
+ 2026-03-13 15:55:18,411 | INFO | Train step | step=470 loss=4.9740 lr=0.000705 tok_per_sec=126,046 grad_norm=0.8157 tokens_seen=246.42M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
63
+ 2026-03-13 15:55:59,935 | INFO | Train step | step=480 loss=4.9158 lr=0.000720 tok_per_sec=126,265 grad_norm=0.4327 tokens_seen=251.66M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
64
+ 2026-03-13 15:56:41,577 | INFO | Train step | step=490 loss=4.8794 lr=0.000735 tok_per_sec=125,907 grad_norm=0.9491 tokens_seen=256.90M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
65
+ 2026-03-13 15:57:23,706 | INFO | Train step | step=500 loss=4.8574 lr=0.000750 tok_per_sec=124,451 grad_norm=0.7693 tokens_seen=262.14M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
66
+ 2026-03-13 15:57:26,171 | INFO | Eval step | step=500 val_loss=4.8718 perplexity=130.55
67
+ 2026-03-13 15:57:30,367 | INFO | Checkpoint saved | step=500 step_checkpoint=checkpoints/pretrain_stage1/step_0000500.pt last_checkpoint=checkpoints/pretrain_stage1/last.pt
68
+ 2026-03-13 15:58:13,490 | INFO | Train step | step=510 loss=4.8269 lr=0.000765 tok_per_sec=105,314 grad_norm=0.8741 tokens_seen=267.39M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
69
+ 2026-03-13 15:58:56,422 | INFO | Train step | step=520 loss=4.7787 lr=0.000780 tok_per_sec=122,122 grad_norm=0.5603 tokens_seen=272.63M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
70
+ 2026-03-13 15:59:39,461 | INFO | Train step | step=530 loss=4.7081 lr=0.000795 tok_per_sec=121,823 grad_norm=0.7208 tokens_seen=277.87M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
71
+ 2026-03-13 16:00:23,234 | INFO | Train step | step=540 loss=4.6785 lr=0.000810 tok_per_sec=119,776 grad_norm=0.6182 tokens_seen=283.12M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
72
+ 2026-03-13 16:01:05,239 | INFO | Train step | step=550 loss=4.6483 lr=0.000825 tok_per_sec=124,821 grad_norm=0.8779 tokens_seen=288.36M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
73
+ 2026-03-13 16:01:47,156 | INFO | Train step | step=560 loss=4.6100 lr=0.000840 tok_per_sec=125,080 grad_norm=0.7765 tokens_seen=293.60M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
74
+ 2026-03-13 16:02:30,946 | INFO | Train step | step=570 loss=4.5568 lr=0.000855 tok_per_sec=119,733 grad_norm=0.5192 tokens_seen=298.84M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
75
+ 2026-03-13 16:03:13,505 | INFO | Train step | step=580 loss=4.5020 lr=0.000870 tok_per_sec=123,194 grad_norm=0.4420 tokens_seen=304.09M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
76
+ 2026-03-13 16:03:55,388 | INFO | Train step | step=590 loss=4.4536 lr=0.000885 tok_per_sec=125,182 grad_norm=0.4726 tokens_seen=309.33M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
77
+ 2026-03-13 16:04:37,233 | INFO | Train step | step=600 loss=4.4008 lr=0.000900 tok_per_sec=125,295 grad_norm=0.5401 tokens_seen=314.57M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
78
+ 2026-03-13 16:04:40,532 | INFO | Checkpoint saved | step=600 step_checkpoint=checkpoints/pretrain_stage1/step_0000600.pt last_checkpoint=checkpoints/pretrain_stage1/last.pt
79
+ 2026-03-13 16:05:22,263 | INFO | Train step | step=610 loss=4.3697 lr=0.000915 tok_per_sec=116,433 grad_norm=0.5282 tokens_seen=319.82M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
80
+ 2026-03-13 16:06:04,579 | INFO | Train step | step=620 loss=4.3184 lr=0.000930 tok_per_sec=123,903 grad_norm=0.8301 tokens_seen=325.06M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
81
+ 2026-03-13 16:06:47,513 | INFO | Train step | step=630 loss=4.3098 lr=0.000945 tok_per_sec=122,117 grad_norm=0.4351 tokens_seen=330.30M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
82
+ 2026-03-13 16:07:29,171 | INFO | Train step | step=640 loss=4.2368 lr=0.000960 tok_per_sec=125,858 grad_norm=0.4222 tokens_seen=335.54M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
outputs/pretrain_stage1/logs/train_pretrain_20260313_152202.jsonl ADDED
The diff for this file is too large to render.
 
outputs/pretrain_stage1/logs/train_pretrain_20260313_152202.log ADDED
The diff for this file is too large to render.
 
outputs/pretrain_stage1/run_config.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "model_config": {
3
+ "vocab_size": 49152,
4
+ "max_seq_len": 8192,
5
+ "d_model": 384,
6
+ "n_layers": 32,
7
+ "n_heads": 6,
8
+ "ffn_hidden_dim": 1024,
9
+ "rope_theta": 10000.0,
10
+ "rms_norm_eps": 1e-05,
11
+ "initializer_range": 0.02,
12
+ "dropout": 0.0,
13
+ "tie_word_embeddings": true,
14
+ "bias": false,
15
+ "pad_token_id": 0,
16
+ "bos_token_id": 1,
17
+ "eos_token_id": 2
18
+ },
19
+ "train_config": {
20
+ "seed": 42,
21
+ "train_dir": "data/pretokenized/train",
22
+ "val_dir": "data/pretokenized/val",
23
+ "output_dir": "outputs/pretrain_stage1",
24
+ "checkpoint_dir": "checkpoints/pretrain_stage1",
25
+ "init_from": null,
26
+ "resume_from": null,
27
+ "seq_len": 2048,
28
+ "micro_batch_size": 8,
29
+ "grad_accum_steps": 32,
30
+ "max_steps": 20000,
31
+ "warmup_steps": 2000,
32
+ "learning_rate": 0.003,
33
+ "min_lr": 0.0003,
34
+ "weight_decay": 0.1,
35
+ "beta1": 0.9,
36
+ "beta2": 0.95,
37
+ "grad_clip": 1.0,
38
+ "precision": "bf16",
39
+ "num_workers": 0,
40
+ "log_interval": 10,
41
+ "eval_interval": 250,
42
+ "eval_batches": 50,
43
+ "save_interval": 100,
44
+ "compile_model": false
45
+ }
46
+ }
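Note: the stage-1 token budget follows directly from this config. A short check (plain arithmetic, mirroring the values above):

    # sanity check derived from the stage-1 config
    seq_len, micro_batch, grad_accum, max_steps = 2048, 8, 32, 20_000
    tokens_per_step = seq_len * micro_batch * grad_accum   # 524_288, matches the logged tokens_per_step
    total_tokens = tokens_per_step * max_steps             # 10_485_760_000, i.e. ~10.5B
    print(f"{tokens_per_step=:,} {total_tokens=:,}")

The ≈10.5B total is consistent with the data_mix_10b mixture name.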
outputs/pretrain_stage2/run_config.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "model_config": {
3
+ "vocab_size": 49152,
4
+ "max_seq_len": 8192,
5
+ "d_model": 384,
6
+ "n_layers": 32,
7
+ "n_heads": 6,
8
+ "ffn_hidden_dim": 1024,
9
+ "rope_theta": 10000.0,
10
+ "rms_norm_eps": 1e-05,
11
+ "initializer_range": 0.02,
12
+ "dropout": 0.0,
13
+ "tie_word_embeddings": true,
14
+ "bias": false,
15
+ "pad_token_id": 0,
16
+ "bos_token_id": 1,
17
+ "eos_token_id": 2
18
+ },
19
+ "train_config": {
20
+ "seed": 42,
21
+ "train_dir": "data/pretokenized/train",
22
+ "val_dir": "data/pretokenized/val",
23
+ "output_dir": "outputs/pretrain_stage2",
24
+ "checkpoint_dir": "checkpoints/pretrain_stage2",
25
+ "init_from": "checkpoints/pretrain_stage1/last.pt",
26
+ "resume_from": null,
27
+ "seq_len": 8192,
28
+ "micro_batch_size": 2,
29
+ "grad_accum_steps": 16,
30
+ "max_steps": 1000,
31
+ "warmup_steps": 100,
32
+ "learning_rate": 0.001,
33
+ "min_lr": 0.0001,
34
+ "weight_decay": 0.1,
35
+ "beta1": 0.9,
36
+ "beta2": 0.95,
37
+ "grad_clip": 1.0,
38
+ "precision": "bf16",
39
+ "num_workers": 0,
40
+ "log_interval": 5,
41
+ "eval_interval": 100,
42
+ "eval_batches": 20,
43
+ "save_interval": 50,
44
+ "compile_model": false
45
+ }
46
+ }
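Note: stage 2 warm-starts from checkpoints/pretrain_stage1/last.pt via init_from and anneals at the full 8192-token context with a lower peak LR (0.001 vs stage 1's 0.003). Tokens per step drop to 8192 × 2 × 16 = 262,144, so max_steps=1000 amounts to an anneal of ≈262M tokens.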
scripts/.DS_Store ADDED
Binary file (6.15 kB).
 
scripts/eval_perplexity.py ADDED
@@ -0,0 +1,79 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import math
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ import torch
9
+ from torch.utils.data import DataLoader
10
+
11
+ ROOT = Path(__file__).resolve().parents[1]
12
+ sys.path.append(str(ROOT / "src"))
13
+
14
+ from sllm.checkpoint import load_checkpoint
15
+ from sllm.config import ModelConfig, load_json
16
+ from sllm.data import SequentialEvalDataset
17
+ from sllm.model import SLLMForCausalLM
18
+ from sllm.utils import autocast_context, get_device, resolve_runtime_precision, setup_logger
19
+
20
+
21
+ def build_parser() -> argparse.ArgumentParser:
22
+ parser = argparse.ArgumentParser(description="Evaluate perplexity on validation shards.")
23
+ parser.add_argument("--checkpoint", required=True, help="Path to checkpoint file.")
24
+ parser.add_argument("--model-config", required=False, help="Optional model config JSON path.")
25
+ parser.add_argument("--data-dir", required=True, help="Validation root directory.")
26
+ parser.add_argument("--seq-len", type=int, default=2_048)
27
+ parser.add_argument("--batch-size", type=int, default=8)
28
+ parser.add_argument("--batches", type=int, default=50)
29
+ parser.add_argument("--precision", default="bf16")
30
+ return parser
31
+
32
+
33
+ def main() -> None:
34
+ args = build_parser().parse_args()
35
+ logger, log_path = setup_logger("sllm.eval_perplexity", Path("outputs/eval"), "eval_perplexity")
36
+ logger.info("Perplexity evaluation started")
37
+ logger.info("Log file: %s", log_path)
38
+ logger.info("Arguments | checkpoint=%s model_config=%s data_dir=%s seq_len=%s batch_size=%s batches=%s precision=%s", args.checkpoint, args.model_config, args.data_dir, args.seq_len, args.batch_size, args.batches, args.precision)
39
+ device = get_device()
40
+ runtime_precision, precision_warning = resolve_runtime_precision(device, args.precision)
41
+ if precision_warning is not None:
42
+ logger.warning(precision_warning)
43
+ payload = load_checkpoint(args.checkpoint, map_location=device)
44
+ if args.model_config:
45
+ model_config = ModelConfig.from_dict(load_json(args.model_config))
46
+ else:
47
+ model_config = ModelConfig.from_dict(payload["model_config"])
48
+
49
+ model = SLLMForCausalLM(model_config).to(device)
50
+ model.load_state_dict(payload["model"])
51
+ model.eval()
52
+
53
+ dataset = SequentialEvalDataset(
54
+ data_dir=args.data_dir,
55
+ split="val",
56
+ seq_len=args.seq_len,
57
+ max_batches=args.batches * args.batch_size,
58
+ )
59
+ loader = DataLoader(dataset, batch_size=args.batch_size, num_workers=0)
60
+
61
+ losses = []
62
+ with torch.no_grad():
63
+ for batch_index, batch in enumerate(loader):
64
+ if batch_index >= args.batches:
65
+ break
66
+ batch = {key: value.to(device) for key, value in batch.items()}
67
+ with autocast_context(device, runtime_precision):
68
+ loss = model(**batch)["loss"]
69
+ losses.append(loss.detach().float().item())
70
+
71
+ mean_loss = float(sum(losses) / max(1, len(losses)))
72
+ perplexity = math.exp(min(mean_loss, 20))
73
+ logger.info("Perplexity evaluation finished | val_loss=%.4f perplexity=%.2f", mean_loss, perplexity)
74
+ print(f"val_loss={mean_loss:.4f}")
75
+ print(f"perplexity={perplexity:.2f}")
76
+
77
+
78
+ if __name__ == "__main__":
79
+ main()
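Note: perplexity here is exp(min(mean_loss, 20)), which caps the report near e^20 ≈ 4.85e8 so an untrained model cannot overflow the printout. An illustrative invocation (the checkpoint and data paths are assumptions based on this repo's layout, not fixed by the script):

    python scripts/eval_perplexity.py \
        --checkpoint checkpoints/pretrain_stage1/last.pt \
        --data-dir data/pretokenized \
        --seq-len 2048 --batch-size 8 --batches 50 --precision bf16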
scripts/generate.py ADDED
@@ -0,0 +1,81 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ import torch
8
+ from tokenizers import Tokenizer
9
+
10
+ ROOT = Path(__file__).resolve().parents[1]
11
+ sys.path.append(str(ROOT / "src"))
12
+
13
+ from sllm.checkpoint import load_checkpoint
14
+ from sllm.config import ModelConfig, load_json
15
+ from sllm.model import SLLMForCausalLM
16
+ from sllm.utils import get_device, setup_logger
17
+
18
+
19
+ def build_parser() -> argparse.ArgumentParser:
20
+ parser = argparse.ArgumentParser(description="Generate text from a trained checkpoint.")
21
+ parser.add_argument("--checkpoint", required=True, help="Path to model checkpoint.")
22
+ parser.add_argument("--tokenizer-dir", required=True, help="Directory with tokenizer.json.")
23
+ parser.add_argument("--prompt", required=True, help="Prompt text.")
24
+ parser.add_argument("--max-new-tokens", type=int, default=128)
25
+ parser.add_argument("--temperature", type=float, default=0.8)
26
+ parser.add_argument("--top-k", type=int, default=50)
27
+ parser.add_argument("--model-config", required=False, help="Optional path to model config JSON.")
28
+ return parser
29
+
30
+
31
+ def main() -> None:
32
+ args = build_parser().parse_args()
33
+ logger, log_path = setup_logger("sllm.generate", Path("outputs/generate"), "generate")
34
+ logger.info("Generation started")
35
+ logger.info("Log file: %s", log_path)
36
+ logger.info(
37
+ "Arguments | checkpoint=%s tokenizer_dir=%s max_new_tokens=%s temperature=%s top_k=%s model_config=%s",
38
+ args.checkpoint,
39
+ args.tokenizer_dir,
40
+ args.max_new_tokens,
41
+ args.temperature,
42
+ args.top_k,
43
+ args.model_config,
44
+ )
45
+ device = get_device()
46
+ tokenizer = Tokenizer.from_file(str(Path(args.tokenizer_dir) / "tokenizer.json"))
47
+ tokenizer_meta = load_json(Path(args.tokenizer_dir) / "tokenizer_meta.json")
48
+ specials = tokenizer_meta["special_tokens"]
49
+
50
+ payload = load_checkpoint(args.checkpoint, map_location=device)
51
+ if args.model_config:
52
+ model_config = ModelConfig.from_dict(load_json(args.model_config))
53
+ else:
54
+ model_config = ModelConfig.from_dict(payload["model_config"])
55
+
56
+ model = SLLMForCausalLM(model_config).to(device)
57
+ model.load_state_dict(payload["model"])
58
+ model.eval()
59
+
60
+ prompt_ids = [int(specials["bos_token_id"])] + tokenizer.encode(
61
+ args.prompt,
62
+ add_special_tokens=False,
63
+ ).ids
64
+ input_ids = torch.tensor([prompt_ids], dtype=torch.long, device=device)
65
+
66
+ with torch.no_grad():
67
+ output_ids = model.generate(
68
+ input_ids=input_ids,
69
+ max_new_tokens=args.max_new_tokens,
70
+ temperature=args.temperature,
71
+ top_k=args.top_k,
72
+ eos_token_id=int(specials["eos_token_id"]),
73
+ )
74
+
75
+ decoded = tokenizer.decode(output_ids[0].tolist(), skip_special_tokens=False)
76
+ logger.info("Generation finished | prompt_tokens=%s output_tokens=%s", len(prompt_ids), output_ids.shape[1])
77
+ print(decoded)
78
+
79
+
80
+ if __name__ == "__main__":
81
+ main()
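Note: generation prepends the BOS id from tokenizer_meta.json and stops on the EOS id, so the prompt itself needs no special tokens. An illustrative run (the checkpoint path is an assumption; the tokenizer dir is the one shipped under data/tokenizer):

    python scripts/generate.py \
        --checkpoint checkpoints/pretrain_stage1/last.pt \
        --tokenizer-dir data/tokenizer \
        --prompt "Once upon a time" \
        --max-new-tokens 128 --temperature 0.8 --top-k 50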
scripts/prepare_pretrain_data.py ADDED
@@ -0,0 +1,318 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import math
5
+ import random
6
+ import sys
7
+ from collections import deque
8
+ from pathlib import Path
9
+
10
+ from datasets import load_dataset
11
+ from tokenizers import Tokenizer
12
+
13
+ ROOT = Path(__file__).resolve().parents[1]
14
+ sys.path.append(str(ROOT / "src"))
15
+
16
+ from sllm.config import DataMixConfig, load_json, save_json
17
+ from sllm.data import TokenShardWriter
18
+ from sllm.utils import setup_logger
19
+
20
+
21
+ def build_parser() -> argparse.ArgumentParser:
22
+ parser = argparse.ArgumentParser(description="Tokenize and shard pretraining corpora.")
23
+ parser.add_argument("--data-config", required=True, help="Path to data mixture JSON config.")
24
+ parser.add_argument("--tokenizer-dir", required=True, help="Directory with tokenizer.json.")
25
+ parser.add_argument("--output-dir", required=True, help="Root directory for train/val shards.")
26
+ parser.add_argument("--seed", type=int, default=42, help="Random seed for dataset shuffling.")
27
+ return parser
28
+
29
+
30
+ def load_tokenizer(tokenizer_dir: str | Path) -> tuple[Tokenizer, dict]:
31
+ tokenizer_dir = Path(tokenizer_dir)
32
+ tokenizer = Tokenizer.from_file(str(tokenizer_dir / "tokenizer.json"))
33
+ metadata = load_json(tokenizer_dir / "tokenizer_meta.json")
34
+ return tokenizer, metadata
35
+
36
+
37
+ def iter_source_rows(source, seed: int):
38
+ dataset = load_dataset(
39
+ path=source.path,
40
+ name=source.config_name,
41
+ data_dir=source.data_dir,
42
+ split=source.split,
43
+ revision=source.revision,
44
+ streaming=source.streaming,
45
+ )
46
+ if source.streaming:
47
+ dataset = dataset.shuffle(seed=seed, buffer_size=source.shuffle_buffer)
48
+ return iter(dataset)
49
+
50
+
51
+ TOKENIZE_BATCH_SIZE = 128
52
+
53
+
54
+ def allocate_token_targets(data_config: DataMixConfig, total_tokens: int) -> dict[str, int]:
55
+ weights = data_config.normalized_weights()
56
+ raw_targets = {
57
+ source.name: total_tokens * weights[source.name]
58
+ for source in data_config.sources
59
+ }
60
+ base_targets = {
61
+ name: int(math.floor(value))
62
+ for name, value in raw_targets.items()
63
+ }
64
+ remainder = total_tokens - sum(base_targets.values())
65
+ ranked = sorted(
66
+ raw_targets.items(),
67
+ key=lambda item: (item[1] - math.floor(item[1]), item[0]),
68
+ reverse=True,
69
+ )
70
+ for index in range(remainder):
71
+ name = ranked[index % len(ranked)][0]
72
+ base_targets[name] += 1
73
+ return base_targets
74
+
75
+
76
+ def make_source_state(source, seed: int) -> dict:
77
+ return {
78
+ "source": source,
79
+ "iterator": iter_source_rows(source, seed),
80
+ "documents_used": 0,
81
+ "train_tokens_written": 0,
82
+ "val_tokens_written": 0,
83
+ "exhausted": False,
84
+ "token_queue": deque(),
85
+ }
86
+
87
+
88
+ def refill_token_queue(state: dict, tokenizer: Tokenizer) -> None:
89
+ if state["exhausted"]:
90
+ return
91
+
92
+ texts: list[str] = []
93
+ while len(texts) < TOKENIZE_BATCH_SIZE:
94
+ try:
95
+ row = next(state["iterator"])
96
+ except StopIteration:
97
+ state["exhausted"] = True
98
+ break
99
+
100
+ text = row.get(state["source"].text_field or "", None)
101
+ if not isinstance(text, str):
102
+ continue
103
+ text = text.strip()
104
+ if not text:
105
+ continue
106
+ texts.append(text)
107
+
108
+ if not texts:
109
+ return
110
+
111
+ encoded_batch = tokenizer.encode_batch(texts)
112
+ for encoded in encoded_batch:
113
+ token_ids = encoded.ids
114
+ if token_ids:
115
+ state["token_queue"].append(token_ids)
116
+
117
+
118
+ def next_valid_token_ids(state: dict, tokenizer: Tokenizer) -> list[int] | None:
119
+ while True:
120
+ if state["token_queue"]:
121
+ state["documents_used"] += 1
122
+ return state["token_queue"].popleft()
123
+ if state["exhausted"]:
124
+ return None
125
+ refill_token_queue(state, tokenizer)
126
+
127
+
128
+ def choose_source_name(states: dict[str, dict], targets: dict[str, int], split: str, rng: random.Random) -> str | None:
129
+ candidates = []
130
+ for name, state in states.items():
131
+ if state["exhausted"]:
132
+ continue
133
+ target = targets[name]
134
+ if target <= 0:
135
+ continue
136
+ written = state[f"{split}_tokens_written"]
137
+ if written >= target:
138
+ continue
139
+ progress = written / target
140
+ candidates.append((progress, rng.random(), name))
141
+
142
+ if not candidates:
143
+ return None
144
+
145
+ candidates.sort(key=lambda item: (item[0], item[1]))
146
+ return candidates[0][2]
147
+
148
+
149
+ def interleave_split(
150
+ split: str,
151
+ writer: TokenShardWriter,
152
+ states: dict[str, dict],
153
+ targets: dict[str, int],
154
+ tokenizer: Tokenizer,
155
+ logger,
156
+ rng: random.Random,
157
+ ) -> int:
158
+ total_target = sum(targets.values())
159
+ total_written = 0
160
+ emitted_documents = 0
161
+
162
+ logger.info(
163
+ "Interleave start | split=%s total_target_tokens=%s strategy=weighted_progress_balancing",
164
+ split,
165
+ f"{total_target:,}",
166
+ )
167
+
168
+ while total_written < total_target:
169
+ source_name = choose_source_name(states, targets, split, rng)
170
+ if source_name is None:
171
+ raise RuntimeError(
172
+ f"Недостаточно данных для заполнения split={split}. "
173
+ "Все доступные источники исчерпаны до достижения целевого объема."
174
+ )
175
+
176
+ state = states[source_name]
177
+ token_ids = next_valid_token_ids(state, tokenizer)
178
+ if token_ids is None:
179
+ logger.warning("Source exhausted early | split=%s source=%s", split, source_name)
180
+ continue
181
+
182
+ source_remaining = targets[source_name] - state[f"{split}_tokens_written"]
183
+ split_remaining = total_target - total_written
184
+ chunk = token_ids[: min(len(token_ids), source_remaining, split_remaining)]
185
+ if not chunk:
186
+ continue
187
+
188
+ writer.add_tokens(chunk)
189
+ state[f"{split}_tokens_written"] += len(chunk)
190
+ total_written += len(chunk)
191
+ emitted_documents += 1
192
+
193
+ if emitted_documents % 10_000 == 0:
194
+ logger.info(
195
+ "Interleave progress | split=%s documents=%s total_tokens=%s/%s current_source=%s",
196
+ split,
197
+ f"{emitted_documents:,}",
198
+ f"{total_written:,}",
199
+ f"{total_target:,}",
200
+ source_name,
201
+ )
202
+
203
+ logger.info(
204
+ "Interleave done | split=%s documents=%s total_tokens=%s",
205
+ split,
206
+ f"{emitted_documents:,}",
207
+ f"{total_written:,}",
208
+ )
209
+ return total_written
210
+
211
+
212
+ def main() -> None:
213
+ args = build_parser().parse_args()
214
+ data_config = DataMixConfig.from_dict(load_json(args.data_config))
215
+ tokenizer, tokenizer_meta = load_tokenizer(args.tokenizer_dir)
216
+ output_dir = Path(args.output_dir)
217
+ train_dir = output_dir / "train"
218
+ val_dir = output_dir / "val"
219
+ train_dir.mkdir(parents=True, exist_ok=True)
220
+ val_dir.mkdir(parents=True, exist_ok=True)
221
+ logger, log_path = setup_logger("sllm.prepare_pretrain_data", output_dir, "prepare_pretrain_data")
222
+ logger.info("Pretokenization started")
223
+ logger.info("Log file: %s", log_path)
224
+ logger.info("Arguments | data_config=%s tokenizer_dir=%s output_dir=%s seed=%s", args.data_config, args.tokenizer_dir, args.output_dir, args.seed)
225
+ logger.info("Tokenizer meta | vocab_size=%s special_tokens=%s", tokenizer_meta.get("vocab_size"), tokenizer_meta.get("special_tokens"))
226
+ logger.info("Mixing strategy | global interleaving with weighted progress balancing")
227
+ logger.info("Tokenization strategy | encode_batch with batch_size=%s", TOKENIZE_BATCH_SIZE)
228
+
229
+ weight_map = data_config.normalized_weights()
230
+ train_targets = allocate_token_targets(data_config, data_config.train_tokens)
231
+ val_targets = allocate_token_targets(data_config, data_config.val_tokens)
232
+ dataset_summary: dict[str, dict] = {}
233
+ states: dict[str, dict] = {}
234
+
235
+ for index, source in enumerate(data_config.sources):
236
+ states[source.name] = make_source_state(source, args.seed + index)
237
+ logger.info(
238
+ "Source registered | name=%s path=%s data_dir=%s split=%s text_field=%s weight=%.4f train_target=%s val_target=%s streaming=%s",
239
+ source.name,
240
+ source.path,
241
+ source.data_dir,
242
+ source.split,
243
+ source.text_field,
244
+ weight_map[source.name],
245
+ f"{train_targets[source.name]:,}",
246
+ f"{val_targets[source.name]:,}",
247
+ source.streaming,
248
+ )
249
+
250
+ rng_val = random.Random(args.seed + 10_000)
251
+ rng_train = random.Random(args.seed + 20_000)
252
+ val_writer = TokenShardWriter(
253
+ output_dir=val_dir,
254
+ prefix="val",
255
+ shard_size_tokens=max(1_000_000, min(data_config.shard_size_tokens, data_config.val_tokens)),
256
+ )
257
+ train_writer = TokenShardWriter(
258
+ output_dir=train_dir,
259
+ prefix="train",
260
+ shard_size_tokens=data_config.shard_size_tokens,
261
+ )
262
+
263
+ total_val = interleave_split("val", val_writer, states, val_targets, tokenizer, logger, rng_val)
264
+ total_train = interleave_split("train", train_writer, states, train_targets, tokenizer, logger, rng_train)
265
+
266
+ train_shards = train_writer.finalize()
267
+ val_shards = val_writer.finalize()
268
+
269
+ for source in data_config.sources:
270
+ state = states[source.name]
271
+ dataset_summary[source.name] = {
272
+ "path": source.path,
273
+ "data_dir": source.data_dir,
274
+ "split": source.split,
275
+ "train_target_tokens": train_targets[source.name],
276
+ "val_target_tokens": val_targets[source.name],
277
+ "train_tokens_written": state["train_tokens_written"],
278
+ "val_tokens_written": state["val_tokens_written"],
279
+ "documents_used": state["documents_used"],
280
+ }
281
+ logger.info(
282
+ "Source done | name=%s documents=%s train_tokens=%s/%s val_tokens=%s/%s",
283
+ source.name,
284
+ f"{state['documents_used']:,}",
285
+ f"{state['train_tokens_written']:,}",
286
+ f"{train_targets[source.name]:,}",
287
+ f"{state['val_tokens_written']:,}",
288
+ f"{val_targets[source.name]:,}",
289
+ )
290
+
291
+ save_json(
292
+ output_dir / "dataset_summary.json",
293
+ {
294
+ "tokenizer": tokenizer_meta,
295
+ "data_config": data_config.to_dict(),
296
+ "mixing_strategy": "global_interleaving_weighted_progress_balancing",
297
+ "train_target_tokens": data_config.train_tokens,
298
+ "val_target_tokens": data_config.val_tokens,
299
+ "train_tokens_written": total_train,
300
+ "val_tokens_written": total_val,
301
+ "train_shards": len(train_shards),
302
+ "val_shards": len(val_shards),
303
+ "sources": dataset_summary,
304
+ },
305
+ )
306
+ logger.info(
307
+ "Pretokenization finished | output_dir=%s total_train_tokens=%s total_val_tokens=%s train_shards=%s val_shards=%s",
308
+ output_dir,
309
+ f"{total_train:,}",
310
+ f"{total_val:,}",
311
+ len(train_shards),
312
+ len(val_shards),
313
+ )
314
+ logger.info("Dataset summary saved | path=%s", output_dir / "dataset_summary.json")
315
+
316
+
317
+ if __name__ == "__main__":
318
+ main()
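Note: the choose_source_name policy above is "pick the least-finished source": at each draw the non-exhausted source with the lowest written/target ratio goes next, so every source paces toward its quota together and the mixture stays on-weight throughout the shards. A standalone toy illustration (hypothetical numbers, not the script itself):

    # toy illustration of weighted progress balancing
    targets = {"web": 800, "code": 200}   # 80/20 token targets
    written = {"web": 0, "code": 0}
    order = []
    for _ in range(10):
        name = min(written, key=lambda n: written[n] / targets[n])
        written[name] += 100              # pretend each document contributes 100 tokens
        order.append(name)
    print(order)  # 8 "web" picks vs 2 "code" picks -- the 80/20 ratio holds

With the shipped configs, the script is invoked along the lines of:

    python scripts/prepare_pretrain_data.py \
        --data-config configs/data_mix_10b.json \
        --tokenizer-dir data/tokenizer \
        --output-dir data/pretokenized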
scripts/prepare_sft_data.py ADDED
@@ -0,0 +1,221 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ from datasets import load_dataset
8
+ from tokenizers import Tokenizer
9
+
10
+ ROOT = Path(__file__).resolve().parents[1]
11
+ sys.path.append(str(ROOT / "src"))
12
+
13
+ from sllm.config import load_json, save_json
14
+ from sllm.data import SFTShardWriter
15
+ from sllm.utils import setup_logger
16
+
17
+
18
+ def build_parser() -> argparse.ArgumentParser:
19
+ parser = argparse.ArgumentParser(description="Prepare fixed-length SFT tensors.")
20
+ parser.add_argument("--config", required=True, help="Path to SFT data JSON config.")
21
+ parser.add_argument("--tokenizer-dir", required=True, help="Directory with tokenizer.json and metadata.")
22
+ parser.add_argument("--output-dir", required=True, help="Directory to store processed SFT tensors.")
23
+ parser.add_argument("--seq-len", type=int, default=2_048, help="Packed example length.")
24
+ parser.add_argument("--seed", type=int, default=42, help="Random seed for dataset shuffling.")
25
+ return parser
26
+
27
+
28
+ def load_tokenizer(tokenizer_dir: str | Path) -> tuple[Tokenizer, dict]:
29
+ tokenizer_dir = Path(tokenizer_dir)
30
+ tokenizer = Tokenizer.from_file(str(tokenizer_dir / "tokenizer.json"))
31
+ metadata = load_json(tokenizer_dir / "tokenizer_meta.json")
32
+ return tokenizer, metadata
33
+
34
+
35
+ def row_to_messages(row: dict, config: dict) -> list[dict[str, str]]:
36
+ fmt = config.get("format", "messages")
37
+ if fmt == "messages":
38
+ messages = row.get(config.get("messages_field", "messages"))
39
+ if not isinstance(messages, list):
40
+ raise ValueError("Не найден список сообщений в SFT-датасете.")
41
+ normalized = []
42
+ for message in messages:
43
+ if not isinstance(message, dict):
44
+ continue
45
+ role = message.get("role")
46
+ content = message.get("content")
47
+ if isinstance(content, list):
48
+ parts = [item.get("text", "") for item in content if isinstance(item, dict)]
49
+ content = "\n".join(part for part in parts if part)
50
+ if isinstance(role, str) and isinstance(content, str) and content.strip():
51
+ normalized.append({"role": role, "content": content.strip()})
52
+ return normalized
53
+
54
+ if fmt == "prompt_response":
55
+ prompt = row.get(config.get("prompt_field", "prompt"))
56
+ response = row.get(config.get("response_field", "response"))
57
+ if not isinstance(prompt, str) or not isinstance(response, str):
58
+ raise ValueError("Не найдены поля prompt/response в SFT-датасете.")
59
+ system_prompt = config.get("system_prompt")
60
+ messages = []
61
+ if isinstance(system_prompt, str) and system_prompt.strip():
62
+ messages.append({"role": "system", "content": system_prompt.strip()})
63
+ messages.append({"role": "user", "content": prompt.strip()})
64
+ messages.append({"role": "assistant", "content": response.strip()})
65
+ return messages
66
+
67
+ if fmt == "alpaca":
68
+ instruction = row.get(config.get("instruction_field", "instruction"))
69
+ input_text = row.get(config.get("input_field", "input"), "")
70
+ output_text = row.get(config.get("output_field", "output"))
71
+ if not isinstance(instruction, str) or not isinstance(output_text, str):
72
+ raise ValueError("Не найдены поля instruction/output в Alpaca-подобном датасете.")
73
+ prompt = instruction.strip()
74
+ if isinstance(input_text, str) and input_text.strip():
75
+ prompt = f"{prompt}\n\n{input_text.strip()}"
76
+ return [
77
+ {"role": "user", "content": prompt},
78
+ {"role": "assistant", "content": output_text.strip()},
79
+ ]
80
+
81
+ raise ValueError(f"Unsupported SFT format: {fmt}")
82
+
83
+
84
+ def tokenize_messages(
85
+ tokenizer: Tokenizer,
86
+ messages: list[dict[str, str]],
87
+ bos_id: int,
88
+ eos_id: int,
89
+ ) -> tuple[list[int], list[int]]:
90
+ input_ids = [bos_id]
91
+ labels = [-100]
92
+
93
+ for message in messages:
94
+ role = message["role"].strip().lower()
95
+ content = message["content"].strip()
96
+ if not content:
97
+ continue
98
+ text = f"<|{role}|>\n{content}\n"
99
+ piece = tokenizer.encode(text, add_special_tokens=False).ids
100
+ if not piece:
101
+ continue
102
+ input_ids.extend(piece)
103
+ if role == "assistant":
104
+ labels.extend(piece)
105
+ else:
106
+ labels.extend([-100] * len(piece))
107
+
108
+ input_ids.append(eos_id)
109
+ labels.append(eos_id)
110
+ return input_ids, labels
111
+
112
+
113
+ def pad_or_truncate(
114
+ input_ids: list[int],
115
+ labels: list[int],
116
+ seq_len: int,
117
+ pad_id: int,
118
+ ) -> tuple[list[int], list[int]]:
119
+ input_ids = input_ids[:seq_len]
120
+ labels = labels[:seq_len]
121
+ if len(input_ids) < seq_len:
122
+ pad_length = seq_len - len(input_ids)
123
+ input_ids = input_ids + [pad_id] * pad_length
124
+ labels = labels + [-100] * pad_length
125
+ return input_ids, labels
126
+
127
+
128
+ def main() -> None:
129
+ args = build_parser().parse_args()
130
+ config = load_json(args.config)
131
+ tokenizer, tokenizer_meta = load_tokenizer(args.tokenizer_dir)
132
+ specials = tokenizer_meta["special_tokens"]
133
+ bos_id = int(specials["bos_token_id"])
134
+ eos_id = int(specials["eos_token_id"])
135
+ pad_id = int(specials["pad_token_id"])
136
+
137
+ dataset = load_dataset(
138
+ path=config["path"],
139
+ name=config.get("config_name"),
140
+ split=config.get("split", "train"),
141
+ revision=config.get("revision"),
142
+ streaming=bool(config.get("streaming", False)),
143
+ )
144
+ if config.get("shuffle", True):
145
+ dataset = dataset.shuffle(seed=args.seed)
146
+
147
+ val_examples = int(config.get("val_examples", 1_000))
148
+ output_dir = Path(args.output_dir)
149
+ output_dir.mkdir(parents=True, exist_ok=True)
150
+ logger, log_path = setup_logger("sllm.prepare_sft_data", output_dir, "prepare_sft_data")
151
+ logger.info("SFT data preparation started")
152
+ logger.info("Log file: %s", log_path)
153
+ logger.info(
154
+ "Arguments | config=%s tokenizer_dir=%s output_dir=%s seq_len=%s seed=%s",
155
+ args.config,
156
+ args.tokenizer_dir,
157
+ args.output_dir,
158
+ args.seq_len,
159
+ args.seed,
160
+ )
161
+ logger.info(
162
+ "SFT source config | path=%s config_name=%s split=%s format=%s streaming=%s val_examples=%s max_train_examples=%s",
163
+ config.get("path"),
164
+ config.get("config_name"),
165
+ config.get("split", "train"),
166
+ config.get("format", "messages"),
167
+ bool(config.get("streaming", False)),
168
+ val_examples,
169
+ config.get("max_train_examples"),
170
+ )
171
+ train_writer = SFTShardWriter(output_dir, prefix="train", seq_len=args.seq_len)
172
+ val_writer = SFTShardWriter(output_dir, prefix="val", seq_len=args.seq_len)
173
+
174
+ train_count = 0
175
+ val_count = 0
176
+ max_train_examples = config.get("max_train_examples")
177
+
178
+ for row in dataset:
179
+ messages = row_to_messages(row, config)
180
+ if not messages:
181
+ continue
182
+ input_ids, labels = tokenize_messages(tokenizer, messages, bos_id=bos_id, eos_id=eos_id)
183
+ input_ids, labels = pad_or_truncate(input_ids, labels, args.seq_len, pad_id=pad_id)
184
+
185
+ if val_count < val_examples:
186
+ val_writer.add_example(input_ids, labels)
187
+ val_count += 1
188
+ else:
189
+ train_writer.add_example(input_ids, labels)
190
+ train_count += 1
191
+
192
+ total_examples = train_count + val_count
193
+ if total_examples % 5_000 == 0:
194
+ logger.info(
195
+ "SFT progress | processed=%s train_examples=%s val_examples=%s",
196
+ f"{total_examples:,}",
197
+ f"{train_count:,}",
198
+ f"{val_count:,}",
199
+ )
200
+
201
+ if max_train_examples is not None and train_count >= int(max_train_examples):
202
+ break
203
+
204
+ train_metadata = train_writer.finalize()
205
+ val_metadata = val_writer.finalize()
206
+ save_json(
207
+ output_dir / "dataset_summary.json",
208
+ {
209
+ "config": config,
210
+ "tokenizer_meta": tokenizer_meta,
211
+ "train": train_metadata,
212
+ "val": val_metadata,
213
+ },
214
+ )
215
+ logger.info("SFT dataset saved | output_dir=%s", output_dir)
216
+ logger.info("SFT summary | train_examples=%s val_examples=%s", f"{train_count:,}", f"{val_count:,}")
217
+ logger.info("SFT metadata saved | path=%s", output_dir / "dataset_summary.json")
218
+
219
+
220
+ if __name__ == "__main__":
221
+ main()
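The masking convention above is worth pinning down: every token that is not part of an assistant turn is labeled -100, so only assistant completions contribute to the SFT loss. Below is a minimal sketch of how `tokenize_messages` behaves, using a hypothetical stand-in tokenizer (one toy id per character) instead of the trained BPE tokenizer; it assumes `tokenize_messages` from `scripts/prepare_sft_data.py` is in scope.

```python
# Hypothetical illustration only; FakeTokenizer is a stand-in for the trained tokenizer.
class FakeTokenizer:
    def encode(self, text, add_special_tokens=False):
        class Encoding:
            pass
        enc = Encoding()
        enc.ids = [ord(ch) % 100 for ch in text]  # toy ids, one per character
        return enc

messages = [
    {"role": "user", "content": "2+2?"},
    {"role": "assistant", "content": "4"},
]
input_ids, labels = tokenize_messages(FakeTokenizer(), messages, bos_id=1, eos_id=2)

assert len(input_ids) == len(labels)
assert labels[0] == -100  # <bos> is never a prediction target
assert labels[-1] == 2    # the final <eos> is trained
# the entire user span "<|user|>\n2+2?\n" (14 toy tokens) is masked out
assert all(label == -100 for label in labels[1:15])
```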
scripts/train_pretrain.py ADDED
@@ -0,0 +1,405 @@
+ from __future__ import annotations
+
+ import argparse
+ import math
+ import sys
+ import time
+ from pathlib import Path
+
+ import torch
+ from torch.utils.data import DataLoader
+
+ ROOT = Path(__file__).resolve().parents[1]
+ sys.path.append(str(ROOT / "src"))
+
+ from sllm.checkpoint import load_checkpoint, save_checkpoint
+ from sllm.config import ModelConfig, TrainConfig, load_json, save_json
+ from sllm.data import RandomTokenDataset, SequentialEvalDataset
+ from sllm.model import SLLMForCausalLM
+ from sllm.utils import (
+     append_jsonl,
+     autocast_context,
+     cosine_lr,
+     cuda_memory_snapshot,
+     ensure_dir,
+     format_number,
+     get_device,
+     iso_timestamp,
+     maybe_enable_tf32,
+     model_parameter_count,
+     resolve_runtime_precision,
+     set_optimizer_lr,
+     set_seed,
+     setup_logger,
+     timestamp,
+     tokens_per_step,
+ )
+
+
+ def build_parser() -> argparse.ArgumentParser:
+     parser = argparse.ArgumentParser(description="Pretrain the small causal LM.")
+     parser.add_argument("--model-config", required=True, help="Path to model JSON config.")
+     parser.add_argument("--train-config", required=True, help="Path to pretraining JSON config.")
+     parser.add_argument(
+         "--max-steps",
+         type=int,
+         default=None,
+         help="Optional override for debugging or dry runs.",
+     )
+     return parser
+
+
+ def build_optimizer(model: torch.nn.Module, config: TrainConfig, device: torch.device):
+     decay_params = []
+     no_decay_params = []
+     for name, parameter in model.named_parameters():
+         if not parameter.requires_grad:
+             continue
+         if parameter.ndim <= 1 or name.endswith("bias"):
+             no_decay_params.append(parameter)
+         else:
+             decay_params.append(parameter)
+
+     fused_supported = device.type == "cuda"
+     return torch.optim.AdamW(
+         [
+             {"params": decay_params, "weight_decay": config.weight_decay},
+             {"params": no_decay_params, "weight_decay": 0.0},
+         ],
+         lr=config.learning_rate,
+         betas=(config.beta1, config.beta2),
+         fused=fused_supported,
+     )
+
+
+ @torch.no_grad()
+ def evaluate(
+     model: SLLMForCausalLM,
+     config: TrainConfig,
+     device: torch.device,
+ ) -> tuple[float, float]:
+     model.eval()
+     dataset = SequentialEvalDataset(
+         data_dir=config.val_dir,
+         split="val",
+         seq_len=config.seq_len,
+         max_batches=config.eval_batches * config.micro_batch_size,
+     )
+     loader = DataLoader(dataset, batch_size=config.micro_batch_size, num_workers=0)
+
+     losses = []
+     for batch_index, batch in enumerate(loader):
+         if batch_index >= config.eval_batches:
+             break
+         batch = {key: value.to(device) for key, value in batch.items()}
+         with autocast_context(device, config.precision):
+             loss = model(**batch)["loss"]
+         losses.append(loss.detach().float().item())
+
+     mean_loss = float(sum(losses) / max(1, len(losses)))
+     perplexity = math.exp(min(mean_loss, 20))
+     model.train()
+     return mean_loss, perplexity
+
+
+ def maybe_load_weights(
+     model: SLLMForCausalLM,
+     optimizer: torch.optim.Optimizer,
+     config: TrainConfig,
+     device: torch.device,
+     logger,
+ ) -> int:
+     step = 0
+     checkpoint_path = config.resume_from or config.init_from
+     if checkpoint_path is None:
+         return step
+
+     payload = load_checkpoint(checkpoint_path, map_location=device)
+     model.load_state_dict(payload["model"])
+     if config.resume_from and payload.get("optimizer") is not None:
+         optimizer.load_state_dict(payload["optimizer"])
+         step = int(payload.get("step", 0))
+         logger.info("Resumed training | step=%s checkpoint=%s", step, checkpoint_path)
+     else:
+         logger.info("Loaded model weights | checkpoint=%s", checkpoint_path)
+     return step
+
+
+ def save_run_config(output_dir: Path, model_config: ModelConfig, train_config: TrainConfig) -> None:
+     save_json(
+         output_dir / "run_config.json",
+         {
+             "model_config": model_config.to_dict(),
+             "train_config": train_config.to_dict(),
+         },
+     )
+
+
+ def main() -> None:
+     args = build_parser().parse_args()
+     model_config = ModelConfig.from_dict(load_json(args.model_config))
+     train_config = TrainConfig.from_dict(load_json(args.train_config))
+     if args.max_steps is not None:
+         train_config.max_steps = args.max_steps
+
+     set_seed(train_config.seed)
+     device = get_device()
+     maybe_enable_tf32(device)
+     runtime_precision, precision_warning = resolve_runtime_precision(device, train_config.precision)
+     train_config.precision = runtime_precision
+
+     output_dir = ensure_dir(train_config.output_dir)
+     checkpoint_dir = ensure_dir(train_config.checkpoint_dir)
+     logger, log_path = setup_logger("sllm.train_pretrain", output_dir, "train_pretrain")
+     metrics_path = Path(output_dir) / "logs" / f"{log_path.stem}.jsonl"
+     logger.info("Pretraining started")
+     logger.info("Log file: %s", log_path)
+     logger.info("Metrics JSONL: %s", metrics_path)
+     logger.info("Arguments | model_config=%s train_config=%s max_steps_override=%s", args.model_config, args.train_config, args.max_steps)
+     if precision_warning is not None:
+         logger.warning(precision_warning)
+     logger.info("Model config | %s", model_config.to_dict())
+     logger.info("Train config | %s", train_config.to_dict())
+     append_jsonl(
+         metrics_path,
+         {
+             "event": "run_started",
+             "timestamp": iso_timestamp(),
+             "log_path": str(log_path),
+             "metrics_path": str(metrics_path),
+             "model_config": model_config.to_dict(),
+             "train_config": train_config.to_dict(),
+             "args": {
+                 "model_config": args.model_config,
+                 "train_config": args.train_config,
+                 "max_steps_override": args.max_steps,
+             },
+         },
+     )
+     save_run_config(output_dir, model_config, train_config)
+
+     dataset = RandomTokenDataset(
+         data_dir=train_config.train_dir,
+         split="train",
+         seq_len=train_config.seq_len,
+         seed=train_config.seed,
+     )
+     loader = DataLoader(
+         dataset,
+         batch_size=train_config.micro_batch_size,
+         num_workers=train_config.num_workers,
+         pin_memory=device.type == "cuda",
+     )
+     data_iter = iter(loader)
+
+     model = SLLMForCausalLM(model_config).to(device)
+     if train_config.compile_model and hasattr(torch, "compile"):
+         model = torch.compile(model)  # type: ignore[assignment]
+
+     optimizer = build_optimizer(model, train_config, device)
+     scaler = torch.amp.GradScaler(
+         "cuda",
+         enabled=device.type == "cuda" and train_config.precision.lower() == "fp16",
+     )
+     start_step = maybe_load_weights(model, optimizer, train_config, device, logger)
+     if start_step > 0:
+         append_jsonl(
+             metrics_path,
+             {
+                 "event": "resumed",
+                 "timestamp": iso_timestamp(),
+                 "step": start_step,
+                 "checkpoint": train_config.resume_from,
+             },
+         )
+     model.train()
+
+     tokens_step = tokens_per_step(
+         train_config.micro_batch_size,
+         train_config.grad_accum_steps,
+         train_config.seq_len,
+     )
+
+     logger.info("Device summary | device=%s precision=%s compile_model=%s", device, train_config.precision, train_config.compile_model)
+     logger.info("Model summary | parameters=%s", format_number(model_parameter_count(model)))
+     logger.info(
+         "Batch summary | seq_len=%s micro_batch_size=%s grad_accum_steps=%s tokens_per_step=%s",
+         train_config.seq_len,
+         train_config.micro_batch_size,
+         train_config.grad_accum_steps,
+         f"{tokens_step:,}",
+     )
+     logger.info("Dataset summary | train_dir=%s val_dir=%s num_train_shards=%s", train_config.train_dir, train_config.val_dir, len(dataset.shards))
+     append_jsonl(
+         metrics_path,
+         {
+             "event": "runtime_summary",
+             "timestamp": iso_timestamp(),
+             "device": str(device),
+             "precision": train_config.precision,
+             "compile_model": train_config.compile_model,
+             "parameters": model_parameter_count(model),
+             "seq_len": train_config.seq_len,
+             "micro_batch_size": train_config.micro_batch_size,
+             "grad_accum_steps": train_config.grad_accum_steps,
+             "tokens_per_step": tokens_step,
+             "num_train_shards": len(dataset.shards),
+             "train_dir": train_config.train_dir,
+             "val_dir": train_config.val_dir,
+         },
+     )
+
+     running_loss = 0.0
+     log_start_time = time.perf_counter()
+     last_grad_norm = float("nan")
+
+     for step in range(start_step, train_config.max_steps):
+         lr = cosine_lr(
+             step=step,
+             warmup_steps=train_config.warmup_steps,
+             max_steps=train_config.max_steps,
+             max_lr=train_config.learning_rate,
+             min_lr=train_config.min_lr,
+         )
+         set_optimizer_lr(optimizer, lr)
+         optimizer.zero_grad(set_to_none=True)
+
+         step_loss = 0.0
+         for micro_step in range(train_config.grad_accum_steps):
+             batch = next(data_iter)
+             batch = {key: value.to(device, non_blocking=device.type == "cuda") for key, value in batch.items()}
+
+             with autocast_context(device, train_config.precision):
+                 loss = model(**batch)["loss"] / train_config.grad_accum_steps
+
+             step_loss += loss.detach().float().item()
+             if scaler.is_enabled():
+                 scaler.scale(loss).backward()
+             else:
+                 loss.backward()
+
+         if train_config.grad_clip is not None and train_config.grad_clip > 0:
+             if scaler.is_enabled():
+                 scaler.unscale_(optimizer)
+             grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), train_config.grad_clip)
+             last_grad_norm = float(grad_norm)
+
+         if scaler.is_enabled():
+             scaler.step(optimizer)
+             scaler.update()
+         else:
+             optimizer.step()
+
+         running_loss += step_loss
+
+         if (step + 1) % train_config.log_interval == 0:
+             elapsed = time.perf_counter() - log_start_time
+             avg_loss = running_loss / train_config.log_interval
+             tok_per_sec = (tokens_step * train_config.log_interval) / max(elapsed, 1e-6)
+             memory = cuda_memory_snapshot(device)
+             memory_suffix = ""
+             if memory:
+                 memory_suffix = (
+                     f" mem_alloc_gb={memory['allocated_gb']:.2f}"
+                     f" mem_reserved_gb={memory['reserved_gb']:.2f}"
+                     f" max_mem_alloc_gb={memory['max_allocated_gb']:.2f}"
+                     f" max_mem_reserved_gb={memory['max_reserved_gb']:.2f}"
+                 )
+             logger.info(
+                 "Train step | step=%s loss=%.4f lr=%.6f tok_per_sec=%s grad_norm=%.4f tokens_seen=%s%s",
+                 step + 1,
+                 avg_loss,
+                 lr,
+                 f"{tok_per_sec:,.0f}",
+                 last_grad_norm,
+                 format_number((step + 1) * tokens_step),
+                 memory_suffix,
+             )
+             append_jsonl(
+                 metrics_path,
+                 {
+                     "event": "train",
+                     "timestamp": iso_timestamp(),
+                     "step": step + 1,
+                     "loss": avg_loss,
+                     "lr": lr,
+                     "tok_per_sec": tok_per_sec,
+                     "grad_norm": last_grad_norm,
+                     "tokens_seen": (step + 1) * tokens_step,
+                     "elapsed_sec": elapsed,
+                     "seq_len": train_config.seq_len,
+                     "micro_batch_size": train_config.micro_batch_size,
+                     "grad_accum_steps": train_config.grad_accum_steps,
+                     **memory,
+                 },
+             )
+             running_loss = 0.0
+             log_start_time = time.perf_counter()
+
+         if (step + 1) % train_config.eval_interval == 0:
+             val_loss, perplexity = evaluate(model, train_config, device)
+             logger.info("Eval step | step=%s val_loss=%.4f perplexity=%.2f", step + 1, val_loss, perplexity)
+             append_jsonl(
+                 metrics_path,
+                 {
+                     "event": "eval",
+                     "timestamp": iso_timestamp(),
+                     "step": step + 1,
+                     "val_loss": val_loss,
+                     "perplexity": perplexity,
+                     "eval_batches": train_config.eval_batches,
+                 },
+             )
+
+         if (step + 1) % train_config.save_interval == 0 or (step + 1) == train_config.max_steps:
+             step_checkpoint_path = checkpoint_dir / f"step_{step + 1:07d}.pt"
+             last_checkpoint_path = checkpoint_dir / "last.pt"
+             save_checkpoint(
+                 step_checkpoint_path,
+                 model=model,
+                 optimizer=optimizer,
+                 step=step + 1,
+                 model_config=model_config.to_dict(),
+                 train_config=train_config.to_dict(),
+                 extra_state={"tokens_seen": (step + 1) * tokens_step},
+             )
+             save_checkpoint(
+                 last_checkpoint_path,
+                 model=model,
+                 optimizer=optimizer,
+                 step=step + 1,
+                 model_config=model_config.to_dict(),
+                 train_config=train_config.to_dict(),
+                 extra_state={"tokens_seen": (step + 1) * tokens_step},
+             )
+             logger.info(
+                 "Checkpoint saved | step=%s step_checkpoint=%s last_checkpoint=%s",
+                 step + 1,
+                 step_checkpoint_path,
+                 last_checkpoint_path,
+             )
+             append_jsonl(
+                 metrics_path,
+                 {
+                     "event": "checkpoint",
+                     "timestamp": iso_timestamp(),
+                     "step": step + 1,
+                     "step_checkpoint": str(step_checkpoint_path),
+                     "last_checkpoint": str(last_checkpoint_path),
+                     "tokens_seen": (step + 1) * tokens_step,
+                 },
+             )
+
+     append_jsonl(
+         metrics_path,
+         {
+             "event": "run_finished",
+             "timestamp": iso_timestamp(),
+             "final_step": train_config.max_steps,
+             "tokens_seen": train_config.max_steps * tokens_step,
+         },
+     )
+
+
+ if __name__ == "__main__":
+     main()
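The per-step learning rate comes from `cosine_lr` in `sllm.utils`, which is not included in this commit view. The following is a minimal sketch of a schedule consistent with the call site above (linear warmup to `max_lr`, then cosine decay to `min_lr`); the actual `sllm.utils` implementation may differ in details such as how step 0 is handled.

```python
import math

def cosine_lr(step: int, warmup_steps: int, max_steps: int, max_lr: float, min_lr: float) -> float:
    # Linear warmup from 0 to max_lr over the first warmup_steps steps.
    if step < warmup_steps:
        return max_lr * (step + 1) / warmup_steps
    # Cosine decay from max_lr down to min_lr over the remaining steps.
    progress = (step - warmup_steps) / max(1, max_steps - warmup_steps)
    return min_lr + 0.5 * (max_lr - min_lr) * (1.0 + math.cos(math.pi * progress))
```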
scripts/train_sft.py ADDED
@@ -0,0 +1,394 @@
+ from __future__ import annotations
+
+ import argparse
+ import math
+ import sys
+ import time
+ from pathlib import Path
+
+ import torch
+ from torch.utils.data import DataLoader
+
+ ROOT = Path(__file__).resolve().parents[1]
+ sys.path.append(str(ROOT / "src"))
+
+ from sllm.checkpoint import load_checkpoint, save_checkpoint
+ from sllm.config import ModelConfig, SFTConfig, load_json, save_json
+ from sllm.data import FixedSFTDataset
+ from sllm.model import SLLMForCausalLM
+ from sllm.utils import (
+     append_jsonl,
+     autocast_context,
+     cosine_lr,
+     cuda_memory_snapshot,
+     ensure_dir,
+     format_number,
+     get_device,
+     iso_timestamp,
+     maybe_enable_tf32,
+     model_parameter_count,
+     resolve_runtime_precision,
+     set_optimizer_lr,
+     set_seed,
+     setup_logger,
+     timestamp,
+     tokens_per_step,
+ )
+
+
+ def build_parser() -> argparse.ArgumentParser:
+     parser = argparse.ArgumentParser(description="Run supervised fine-tuning for the sLLM.")
+     parser.add_argument("--model-config", required=True, help="Path to model JSON config.")
+     parser.add_argument("--train-config", required=True, help="Path to SFT JSON config.")
+     parser.add_argument("--max-steps", type=int, default=None, help="Optional debug override.")
+     return parser
+
+
+ def build_optimizer(model: torch.nn.Module, config: SFTConfig, device: torch.device):
+     decay_params = []
+     no_decay_params = []
+     for name, parameter in model.named_parameters():
+         if not parameter.requires_grad:
+             continue
+         if parameter.ndim <= 1 or name.endswith("bias"):
+             no_decay_params.append(parameter)
+         else:
+             decay_params.append(parameter)
+     return torch.optim.AdamW(
+         [
+             {"params": decay_params, "weight_decay": config.weight_decay},
+             {"params": no_decay_params, "weight_decay": 0.0},
+         ],
+         lr=config.learning_rate,
+         betas=(config.beta1, config.beta2),
+         fused=device.type == "cuda",
+     )
+
+
+ @torch.no_grad()
+ def evaluate(model: SLLMForCausalLM, loader: DataLoader, device: torch.device, precision: str, max_batches: int):
+     model.eval()
+     losses = []
+     for batch_index, batch in enumerate(loader):
+         if batch_index >= max_batches:
+             break
+         batch = {key: value.to(device) for key, value in batch.items()}
+         with autocast_context(device, precision):
+             loss = model(**batch)["loss"]
+         losses.append(loss.detach().float().item())
+     model.train()
+     mean_loss = float(sum(losses) / max(1, len(losses)))
+     return mean_loss, math.exp(min(mean_loss, 20))
+
+
+ def save_run_config(output_dir: Path, model_config: ModelConfig, train_config: SFTConfig) -> None:
+     save_json(
+         output_dir / "run_config.json",
+         {
+             "model_config": model_config.to_dict(),
+             "train_config": train_config.to_dict(),
+         },
+     )
+
+
+ def main() -> None:
+     args = build_parser().parse_args()
+     model_config = ModelConfig.from_dict(load_json(args.model_config))
+     train_config = SFTConfig.from_dict(load_json(args.train_config))
+     if args.max_steps is not None:
+         train_config.max_steps = args.max_steps
+
+     set_seed(train_config.seed)
+     device = get_device()
+     maybe_enable_tf32(device)
+     runtime_precision, precision_warning = resolve_runtime_precision(device, train_config.precision)
+     train_config.precision = runtime_precision
+
+     output_dir = ensure_dir(train_config.output_dir)
+     checkpoint_dir = ensure_dir(train_config.checkpoint_dir)
+     logger, log_path = setup_logger("sllm.train_sft", output_dir, "train_sft")
+     metrics_path = Path(output_dir) / "logs" / f"{log_path.stem}.jsonl"
+     logger.info("SFT training started")
+     logger.info("Log file: %s", log_path)
+     logger.info("Metrics JSONL: %s", metrics_path)
+     logger.info("Arguments | model_config=%s train_config=%s max_steps_override=%s", args.model_config, args.train_config, args.max_steps)
+     if precision_warning is not None:
+         logger.warning(precision_warning)
+     logger.info("Model config | %s", model_config.to_dict())
+     logger.info("SFT config | %s", train_config.to_dict())
+     append_jsonl(
+         metrics_path,
+         {
+             "event": "run_started",
+             "timestamp": iso_timestamp(),
+             "log_path": str(log_path),
+             "metrics_path": str(metrics_path),
+             "model_config": model_config.to_dict(),
+             "train_config": train_config.to_dict(),
+             "args": {
+                 "model_config": args.model_config,
+                 "train_config": args.train_config,
+                 "max_steps_override": args.max_steps,
+             },
+         },
+     )
+     save_run_config(output_dir, model_config, train_config)
+
+     train_dataset = FixedSFTDataset(train_config.dataset_path, split="train")
+     val_dataset = FixedSFTDataset(train_config.dataset_path, split="val")
+     train_loader = DataLoader(
+         train_dataset,
+         batch_size=train_config.micro_batch_size,
+         shuffle=True,
+         num_workers=train_config.num_workers,
+         pin_memory=device.type == "cuda",
+     )
+     val_loader = DataLoader(
+         val_dataset,
+         batch_size=train_config.micro_batch_size,
+         shuffle=False,
+         num_workers=0,
+         pin_memory=device.type == "cuda",
+     )
+
+     model = SLLMForCausalLM(model_config).to(device)
+     if train_config.compile_model and hasattr(torch, "compile"):
+         model = torch.compile(model)  # type: ignore[assignment]
+
+     optimizer = build_optimizer(model, train_config, device)
+     scaler = torch.amp.GradScaler(
+         "cuda",
+         enabled=device.type == "cuda" and train_config.precision.lower() == "fp16",
+     )
+
+     start_step = 0
+     checkpoint_path = train_config.resume_from or train_config.init_from
+     if checkpoint_path:
+         payload = load_checkpoint(checkpoint_path, map_location=device)
+         model.load_state_dict(payload["model"])
+         if train_config.resume_from and payload.get("optimizer") is not None:
+             optimizer.load_state_dict(payload["optimizer"])
+             start_step = int(payload.get("step", 0))
+             logger.info("Resumed SFT | step=%s checkpoint=%s", start_step, checkpoint_path)
+             append_jsonl(
+                 metrics_path,
+                 {
+                     "event": "resumed",
+                     "timestamp": iso_timestamp(),
+                     "step": start_step,
+                     "checkpoint": checkpoint_path,
+                 },
+             )
+         else:
+             logger.info("Loaded initialization weights | checkpoint=%s", checkpoint_path)
+             append_jsonl(
+                 metrics_path,
+                 {
+                     "event": "initialized_from_checkpoint",
+                     "timestamp": iso_timestamp(),
+                     "checkpoint": checkpoint_path,
+                 },
+             )
+
+     model.train()
+     tokens_step = tokens_per_step(
+         train_config.micro_batch_size,
+         train_config.grad_accum_steps,
+         train_config.seq_len,
+     )
+     logger.info("Device summary | device=%s precision=%s compile_model=%s", device, train_config.precision, train_config.compile_model)
+     logger.info("Model summary | parameters=%s", format_number(model_parameter_count(model)))
+     logger.info(
+         "Batch summary | seq_len=%s micro_batch_size=%s grad_accum_steps=%s tokens_per_step=%s",
+         train_config.seq_len,
+         train_config.micro_batch_size,
+         train_config.grad_accum_steps,
+         f"{tokens_step:,}",
+     )
+     logger.info(
+         "Dataset summary | dataset_path=%s train_examples=%s val_examples=%s",
+         train_config.dataset_path,
+         len(train_dataset),
+         len(val_dataset),
+     )
+     append_jsonl(
+         metrics_path,
+         {
+             "event": "runtime_summary",
+             "timestamp": iso_timestamp(),
+             "device": str(device),
+             "precision": train_config.precision,
+             "compile_model": train_config.compile_model,
+             "parameters": model_parameter_count(model),
+             "seq_len": train_config.seq_len,
+             "micro_batch_size": train_config.micro_batch_size,
+             "grad_accum_steps": train_config.grad_accum_steps,
+             "tokens_per_step": tokens_step,
+             "dataset_path": train_config.dataset_path,
+             "train_examples": len(train_dataset),
+             "val_examples": len(val_dataset),
+         },
+     )
+     running_loss = 0.0
+     log_start_time = time.perf_counter()
+     train_iterator = iter(train_loader)
+     last_grad_norm = float("nan")
+
+     for step in range(start_step, train_config.max_steps):
+         lr = cosine_lr(
+             step=step,
+             warmup_steps=train_config.warmup_steps,
+             max_steps=train_config.max_steps,
+             max_lr=train_config.learning_rate,
+             min_lr=train_config.min_lr,
+         )
+         set_optimizer_lr(optimizer, lr)
+         optimizer.zero_grad(set_to_none=True)
+
+         step_loss = 0.0
+         for _ in range(train_config.grad_accum_steps):
+             try:
+                 batch = next(train_iterator)
+             except StopIteration:
+                 train_iterator = iter(train_loader)
+                 batch = next(train_iterator)
+
+             batch = {key: value.to(device, non_blocking=device.type == "cuda") for key, value in batch.items()}
+             with autocast_context(device, train_config.precision):
+                 loss = model(**batch)["loss"] / train_config.grad_accum_steps
+             step_loss += loss.detach().float().item()
+             if scaler.is_enabled():
+                 scaler.scale(loss).backward()
+             else:
+                 loss.backward()
+
+         if train_config.grad_clip and train_config.grad_clip > 0:
+             if scaler.is_enabled():
+                 scaler.unscale_(optimizer)
+             grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), train_config.grad_clip)
+             last_grad_norm = float(grad_norm)
+
+         if scaler.is_enabled():
+             scaler.step(optimizer)
+             scaler.update()
+         else:
+             optimizer.step()
+
+         running_loss += step_loss
+
+         if (step + 1) % train_config.log_interval == 0:
+             elapsed = time.perf_counter() - log_start_time
+             avg_loss = running_loss / train_config.log_interval
+             tok_per_sec = (tokens_step * train_config.log_interval) / max(elapsed, 1e-6)
+             memory = cuda_memory_snapshot(device)
+             memory_suffix = ""
+             if memory:
+                 memory_suffix = (
+                     f" mem_alloc_gb={memory['allocated_gb']:.2f}"
+                     f" mem_reserved_gb={memory['reserved_gb']:.2f}"
+                     f" max_mem_alloc_gb={memory['max_allocated_gb']:.2f}"
+                     f" max_mem_reserved_gb={memory['max_reserved_gb']:.2f}"
+                 )
+             logger.info(
+                 "Train step | step=%s loss=%.4f lr=%.6f tok_per_sec=%s grad_norm=%.4f%s",
+                 step + 1,
+                 avg_loss,
+                 lr,
+                 f"{tok_per_sec:,.0f}",
+                 last_grad_norm,
+                 memory_suffix,
+             )
+             append_jsonl(
+                 metrics_path,
+                 {
+                     "event": "train",
+                     "timestamp": iso_timestamp(),
+                     "step": step + 1,
+                     "loss": avg_loss,
+                     "lr": lr,
+                     "tok_per_sec": tok_per_sec,
+                     "grad_norm": last_grad_norm,
+                     "tokens_seen": (step + 1) * tokens_step,
+                     "elapsed_sec": elapsed,
+                     "seq_len": train_config.seq_len,
+                     "micro_batch_size": train_config.micro_batch_size,
+                     "grad_accum_steps": train_config.grad_accum_steps,
+                     **memory,
+                 },
+             )
+             running_loss = 0.0
+             log_start_time = time.perf_counter()
+
+         if (step + 1) % train_config.eval_interval == 0:
+             val_loss, val_ppl = evaluate(
+                 model=model,
+                 loader=val_loader,
+                 device=device,
+                 precision=train_config.precision,
+                 max_batches=train_config.eval_batches,
+             )
+             logger.info("Eval step | step=%s val_loss=%.4f perplexity=%.2f", step + 1, val_loss, val_ppl)
+             append_jsonl(
+                 metrics_path,
+                 {
+                     "event": "eval",
+                     "timestamp": iso_timestamp(),
+                     "step": step + 1,
+                     "val_loss": val_loss,
+                     "perplexity": val_ppl,
+                     "eval_batches": train_config.eval_batches,
+                 },
+             )
+
+         if (step + 1) % train_config.save_interval == 0 or (step + 1) == train_config.max_steps:
+             step_checkpoint_path = checkpoint_dir / f"step_{step + 1:07d}.pt"
+             last_checkpoint_path = checkpoint_dir / "last.pt"
+             save_checkpoint(
+                 step_checkpoint_path,
+                 model=model,
+                 optimizer=optimizer,
+                 step=step + 1,
+                 model_config=model_config.to_dict(),
+                 train_config=train_config.to_dict(),
+                 extra_state={"tokens_seen": (step + 1) * tokens_step},
+             )
+             save_checkpoint(
+                 last_checkpoint_path,
+                 model=model,
+                 optimizer=optimizer,
+                 step=step + 1,
+                 model_config=model_config.to_dict(),
+                 train_config=train_config.to_dict(),
+                 extra_state={"tokens_seen": (step + 1) * tokens_step},
+             )
+             logger.info(
+                 "Checkpoint saved | step=%s step_checkpoint=%s last_checkpoint=%s",
+                 step + 1,
+                 step_checkpoint_path,
+                 last_checkpoint_path,
+             )
+             append_jsonl(
+                 metrics_path,
+                 {
+                     "event": "checkpoint",
+                     "timestamp": iso_timestamp(),
+                     "step": step + 1,
+                     "step_checkpoint": str(step_checkpoint_path),
+                     "last_checkpoint": str(last_checkpoint_path),
+                     "tokens_seen": (step + 1) * tokens_step,
+                 },
+             )
+
+     append_jsonl(
+         metrics_path,
+         {
+             "event": "run_finished",
+             "timestamp": iso_timestamp(),
+             "final_step": train_config.max_steps,
+             "tokens_seen": train_config.max_steps * tokens_step,
+         },
+     )
+
+
+ if __name__ == "__main__":
+     main()
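Both trainers depend on the -100 label convention written by `prepare_sft_data.py`: the model's loss is a cross-entropy that skips positions labeled -100, which matches PyTorch's default `ignore_index`, so padding and non-assistant tokens contribute nothing to the loss or its gradients. A small self-contained illustration with toy shapes and random logits (the causal model's internal logit/label shifting is omitted here):

```python
import torch
import torch.nn.functional as F

# Toy shapes: batch=1, seq_len=4, vocab=10; values are hypothetical.
logits = torch.randn(1, 4, 10)
labels = torch.tensor([[-100, -100, 7, 2]])  # only the last two positions are trained

loss = F.cross_entropy(
    logits.view(-1, logits.size(-1)),
    labels.view(-1),
    ignore_index=-100,  # masked positions are dropped from both loss and gradients
)
print(loss)
```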
scripts/train_tokenizer.py ADDED
@@ -0,0 +1,149 @@
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import sys
+ from pathlib import Path
+ from typing import Iterator
+
+ from datasets import load_dataset
+ from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors, trainers
+
+ ROOT = Path(__file__).resolve().parents[1]
+ sys.path.append(str(ROOT / "src"))
+
+ from sllm.config import DataMixConfig, load_json, save_json
+ from sllm.utils import setup_logger
+
+
+ def build_parser() -> argparse.ArgumentParser:
+     parser = argparse.ArgumentParser(description="Train a BPE tokenizer for the sLLM pipeline.")
+     parser.add_argument("--data-config", required=True, help="Path to data mixture JSON config.")
+     parser.add_argument("--output-dir", required=True, help="Directory where tokenizer files will be stored.")
+     parser.add_argument("--vocab-size", type=int, default=49_152, help="Target tokenizer vocabulary size.")
+     parser.add_argument("--seed", type=int, default=42, help="Random seed for dataset shuffling.")
+     return parser
+
+
+ def iter_source_texts(source, seed: int, limit: int) -> Iterator[str]:
+     dataset = load_dataset(
+         path=source.path,
+         name=source.config_name,
+         data_dir=source.data_dir,
+         split=source.split,
+         revision=source.revision,
+         streaming=source.streaming,
+     )
+     if source.streaming:
+         dataset = dataset.shuffle(seed=seed, buffer_size=source.shuffle_buffer)
+
+     yielded = 0
+     for row in dataset:
+         text = row.get(source.text_field or "", None)
+         if not isinstance(text, str):
+             continue
+         text = text.strip()
+         if not text:
+             continue
+         yield text
+         yielded += 1
+         if yielded >= limit:
+             return
+
+
+ def mixed_iterator(config: DataMixConfig, seed: int, logger) -> Iterator[str]:
+     weight_map = config.normalized_weights()
+     total_docs = config.tokenizer_sample_documents
+     per_source = {
+         source.name: max(1, int(total_docs * weight_map[source.name]))
+         for source in config.sources
+     }
+
+     for index, source in enumerate(config.sources):
+         limit = source.sample_documents or per_source[source.name]
+         logger.info(
+             "Tokenizer source start | name=%s path=%s data_dir=%s split=%s text_field=%s limit_docs=%s streaming=%s",
+             source.name,
+             source.path,
+             source.data_dir,
+             source.split,
+             source.text_field,
+             f"{limit:,}",
+             source.streaming,
+         )
+         yield from iter_source_texts(source, seed + index, limit)
+
+
+ def main() -> None:
+     args = build_parser().parse_args()
+     data_config = DataMixConfig.from_dict(load_json(args.data_config))
+     output_dir = Path(args.output_dir)
+     output_dir.mkdir(parents=True, exist_ok=True)
+     logger, log_path = setup_logger("sllm.train_tokenizer", output_dir, "train_tokenizer")
+     logger.info("Tokenizer training started")
+     logger.info("Log file: %s", log_path)
+     logger.info("Arguments | data_config=%s output_dir=%s vocab_size=%s seed=%s", args.data_config, args.output_dir, args.vocab_size, args.seed)
+     logger.info("Tokenizer config | sample_documents=%s min_frequency=%s special_tokens=%s num_sources=%s", f"{data_config.tokenizer_sample_documents:,}", data_config.tokenizer_min_frequency, data_config.tokenizer_special_tokens, len(data_config.sources))
+
+     tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
+     tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
+     tokenizer.decoder = decoders.ByteLevel()
+     trainer = trainers.BpeTrainer(
+         vocab_size=args.vocab_size,
+         min_frequency=data_config.tokenizer_min_frequency,
+         special_tokens=data_config.tokenizer_special_tokens,
+         show_progress=True,
+     )
+     tokenizer.train_from_iterator(mixed_iterator(data_config, args.seed, logger), trainer=trainer)
+
+     bos_id = tokenizer.token_to_id("<bos>")
+     eos_id = tokenizer.token_to_id("<eos>")
+     pad_id = tokenizer.token_to_id("<pad>")
+     if bos_id is None or eos_id is None or pad_id is None:
+         raise RuntimeError("Tokenizer special tokens were not created correctly.")
+
+     tokenizer.post_processor = processors.TemplateProcessing(
+         single="<bos> $A <eos>",
+         pair="<bos> $A <eos> $B:1 <eos>:1",
+         special_tokens=[
+             ("<bos>", bos_id),
+             ("<eos>", eos_id),
+         ],
+     )
+
+     tokenizer_path = output_dir / "tokenizer.json"
+     tokenizer.save(str(tokenizer_path))
+
+     metadata = {
+         "vocab_size": tokenizer.get_vocab_size(),
+         "special_tokens": {
+             "pad_token": "<pad>",
+             "bos_token": "<bos>",
+             "eos_token": "<eos>",
+             "unk_token": "<unk>",
+             "pad_token_id": pad_id,
+             "bos_token_id": bos_id,
+             "eos_token_id": eos_id,
+             "unk_token_id": tokenizer.token_to_id("<unk>"),
+         },
+         "data_config": data_config.to_dict(),
+     }
+     save_json(output_dir / "tokenizer_meta.json", metadata)
+
+     with (output_dir / "tokenizer_summary.json").open("w", encoding="utf-8") as handle:
+         json.dump(metadata, handle, ensure_ascii=False, indent=2)
+
+     logger.info("Tokenizer saved | path=%s", tokenizer_path)
+     logger.info(
+         "Tokenizer summary | vocab_size=%s pad_id=%s bos_id=%s eos_id=%s unk_id=%s",
+         tokenizer.get_vocab_size(),
+         pad_id,
+         bos_id,
+         eos_id,
+         tokenizer.token_to_id("<unk>"),
+     )
+     logger.info("Tokenizer metadata saved | path=%s", output_dir / "tokenizer_meta.json")
+
+
+ if __name__ == "__main__":
+     main()
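Once trained, the saved tokenizer can be reloaded directly with the `tokenizers` library. Because the script attaches a `TemplateProcessing` post-processor before saving, plain `encode` wraps single sequences in `<bos> … <eos>` unless special tokens are disabled. A short usage sketch against the path this repo uses (`data/tokenizer/tokenizer.json`); the exact token strings printed depend on the trained vocabulary:

```python
from tokenizers import Tokenizer

tok = Tokenizer.from_file("data/tokenizer/tokenizer.json")

enc = tok.encode("hello world")            # post-processor active by default
print(enc.tokens)                          # e.g. ['<bos>', ..., '<eos>']

plain = tok.encode("hello world", add_special_tokens=False)
print(plain.ids)                           # raw BPE ids, no <bos>/<eos> added
```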
src/.DS_Store ADDED
Binary file (6.15 kB).