Add files using upload-large-folder tool
Browse files- configs/.ipynb_checkpoints/pretrain_5090_stage1-checkpoint.json +27 -0
- configs/.ipynb_checkpoints/pretrain_5090_stage2_anneal-checkpoint.json +27 -0
- configs/data_mix_10b.json +56 -0
- configs/model_70m.json +17 -0
- configs/pretrain_5090_stage1.json +27 -0
- configs/pretrain_5090_stage2_anneal.json +27 -0
- configs/pretrain_mps_dryrun.json +27 -0
- configs/sft_5090.json +26 -0
- configs/sft_data_smoltalk.json +12 -0
- data/.DS_Store +0 -0
- data/README.md +3 -0
- data/pretokenized/dataset_summary.json +198 -0
- data/pretokenized/logs/prepare_pretrain_data_20260313_091113.log +0 -0
- data/pretokenized/train/train_manifest.json +502 -0
- data/pretokenized/val/val_manifest.json +7 -0
- data/tokenizer/.DS_Store +0 -0
- data/tokenizer/logs/train_tokenizer_20260312_114030.log +11 -0
- data/tokenizer/tokenizer.json +0 -0
- data/tokenizer/tokenizer_meta.json +80 -0
- data/tokenizer/tokenizer_summary.json +80 -0
- outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140730.jsonl +2 -0
- outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140730.log +10 -0
- outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140907.jsonl +27 -0
- outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140907.log +14 -0
- outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_141224.jsonl +27 -0
- outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_141224.log +34 -0
- outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142331.jsonl +27 -0
- outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142331.log +34 -0
- outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142530.jsonl +13 -0
- outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142530.log +21 -0
- outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142559.jsonl +61 -0
- outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142559.log +69 -0
- outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_143014.jsonl +13 -0
- outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_143014.log +21 -0
- outputs/pretrain_mps_dryrun/run_config.json +46 -0
- outputs/pretrain_stage1/.ipynb_checkpoints/run_config-checkpoint.json +46 -0
- outputs/pretrain_stage1/logs/.ipynb_checkpoints/train_pretrain_20260313_152202-checkpoint.log +82 -0
- outputs/pretrain_stage1/logs/train_pretrain_20260313_152202.jsonl +0 -0
- outputs/pretrain_stage1/logs/train_pretrain_20260313_152202.log +0 -0
- outputs/pretrain_stage1/run_config.json +46 -0
- outputs/pretrain_stage2/run_config.json +46 -0
- scripts/.DS_Store +0 -0
- scripts/eval_perplexity.py +79 -0
- scripts/generate.py +81 -0
- scripts/prepare_pretrain_data.py +318 -0
- scripts/prepare_sft_data.py +221 -0
- scripts/train_pretrain.py +405 -0
- scripts/train_sft.py +394 -0
- scripts/train_tokenizer.py +149 -0
- src/.DS_Store +0 -0
configs/.ipynb_checkpoints/pretrain_5090_stage1-checkpoint.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"seed": 42,
|
| 3 |
+
"train_dir": "data/pretokenized/train",
|
| 4 |
+
"val_dir": "data/pretokenized/val",
|
| 5 |
+
"output_dir": "outputs/pretrain_stage1",
|
| 6 |
+
"checkpoint_dir": "checkpoints/pretrain_stage1",
|
| 7 |
+
"init_from": null,
|
| 8 |
+
"resume_from": null,
|
| 9 |
+
"seq_len": 2048,
|
| 10 |
+
"micro_batch_size": 8,
|
| 11 |
+
"grad_accum_steps": 32,
|
| 12 |
+
"max_steps": 20000,
|
| 13 |
+
"warmup_steps": 2000,
|
| 14 |
+
"learning_rate": 0.003,
|
| 15 |
+
"min_lr": 0.0003,
|
| 16 |
+
"weight_decay": 0.1,
|
| 17 |
+
"beta1": 0.9,
|
| 18 |
+
"beta2": 0.95,
|
| 19 |
+
"grad_clip": 1.0,
|
| 20 |
+
"precision": "bf16",
|
| 21 |
+
"num_workers": 0,
|
| 22 |
+
"log_interval": 10,
|
| 23 |
+
"eval_interval": 250,
|
| 24 |
+
"eval_batches": 50,
|
| 25 |
+
"save_interval": 100,
|
| 26 |
+
"compile_model": false
|
| 27 |
+
}
|
configs/.ipynb_checkpoints/pretrain_5090_stage2_anneal-checkpoint.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"seed": 42,
|
| 3 |
+
"train_dir": "data/pretokenized/train",
|
| 4 |
+
"val_dir": "data/pretokenized/val",
|
| 5 |
+
"output_dir": "outputs/pretrain_stage2",
|
| 6 |
+
"checkpoint_dir": "checkpoints/pretrain_stage2",
|
| 7 |
+
"init_from": "checkpoints/pretrain_stage1/last.pt",
|
| 8 |
+
"resume_from": null,
|
| 9 |
+
"seq_len": 8192,
|
| 10 |
+
"micro_batch_size": 2,
|
| 11 |
+
"grad_accum_steps": 16,
|
| 12 |
+
"max_steps": 1000,
|
| 13 |
+
"warmup_steps": 100,
|
| 14 |
+
"learning_rate": 0.001,
|
| 15 |
+
"min_lr": 0.0001,
|
| 16 |
+
"weight_decay": 0.1,
|
| 17 |
+
"beta1": 0.9,
|
| 18 |
+
"beta2": 0.95,
|
| 19 |
+
"grad_clip": 1.0,
|
| 20 |
+
"precision": "bf16",
|
| 21 |
+
"num_workers": 0,
|
| 22 |
+
"log_interval": 5,
|
| 23 |
+
"eval_interval": 100,
|
| 24 |
+
"eval_batches": 20,
|
| 25 |
+
"save_interval": 50,
|
| 26 |
+
"compile_model": false
|
| 27 |
+
}
|
configs/data_mix_10b.json
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"tokenizer_sample_documents": 2000000,
|
| 3 |
+
"tokenizer_min_frequency": 2,
|
| 4 |
+
"tokenizer_special_tokens": [
|
| 5 |
+
"<pad>",
|
| 6 |
+
"<bos>",
|
| 7 |
+
"<eos>",
|
| 8 |
+
"<unk>"
|
| 9 |
+
],
|
| 10 |
+
"train_tokens": 10000000000,
|
| 11 |
+
"val_tokens": 20000000,
|
| 12 |
+
"shard_size_tokens": 100000000,
|
| 13 |
+
"sources": [
|
| 14 |
+
{
|
| 15 |
+
"name": "fineweb_edu",
|
| 16 |
+
"path": "HuggingFaceFW/fineweb-edu",
|
| 17 |
+
"config_name": "sample-10BT",
|
| 18 |
+
"split": "train",
|
| 19 |
+
"text_field": "text",
|
| 20 |
+
"weight": 0.6,
|
| 21 |
+
"streaming": true,
|
| 22 |
+
"shuffle_buffer": 10000
|
| 23 |
+
},
|
| 24 |
+
{
|
| 25 |
+
"name": "cosmopedia_v2",
|
| 26 |
+
"path": "HuggingFaceTB/smollm-corpus",
|
| 27 |
+
"config_name": "cosmopedia-v2",
|
| 28 |
+
"split": "train",
|
| 29 |
+
"text_field": "text",
|
| 30 |
+
"weight": 0.2,
|
| 31 |
+
"streaming": true,
|
| 32 |
+
"shuffle_buffer": 10000
|
| 33 |
+
},
|
| 34 |
+
{
|
| 35 |
+
"name": "the_stack_python",
|
| 36 |
+
"path": "bigcode/the-stack-dedup",
|
| 37 |
+
"config_name": null,
|
| 38 |
+
"data_dir": "data/python",
|
| 39 |
+
"split": "train",
|
| 40 |
+
"text_field": "content",
|
| 41 |
+
"weight": 0.1,
|
| 42 |
+
"streaming": true,
|
| 43 |
+
"shuffle_buffer": 2000
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"name": "finemath",
|
| 47 |
+
"path": "HuggingFaceTB/finemath",
|
| 48 |
+
"config_name": "finemath-4plus",
|
| 49 |
+
"split": "train",
|
| 50 |
+
"text_field": "text",
|
| 51 |
+
"weight": 0.1,
|
| 52 |
+
"streaming": true,
|
| 53 |
+
"shuffle_buffer": 5000
|
| 54 |
+
}
|
| 55 |
+
]
|
| 56 |
+
}
|
configs/model_70m.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"vocab_size": 49152,
|
| 3 |
+
"max_seq_len": 8192,
|
| 4 |
+
"d_model": 384,
|
| 5 |
+
"n_layers": 32,
|
| 6 |
+
"n_heads": 6,
|
| 7 |
+
"ffn_hidden_dim": 1024,
|
| 8 |
+
"rope_theta": 10000.0,
|
| 9 |
+
"rms_norm_eps": 1e-05,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"dropout": 0.0,
|
| 12 |
+
"tie_word_embeddings": true,
|
| 13 |
+
"bias": false,
|
| 14 |
+
"pad_token_id": 0,
|
| 15 |
+
"bos_token_id": 1,
|
| 16 |
+
"eos_token_id": 2
|
| 17 |
+
}
|
configs/pretrain_5090_stage1.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"seed": 42,
|
| 3 |
+
"train_dir": "data/pretokenized/train",
|
| 4 |
+
"val_dir": "data/pretokenized/val",
|
| 5 |
+
"output_dir": "outputs/pretrain_stage1",
|
| 6 |
+
"checkpoint_dir": "checkpoints/pretrain_stage1",
|
| 7 |
+
"init_from": null,
|
| 8 |
+
"resume_from": null,
|
| 9 |
+
"seq_len": 2048,
|
| 10 |
+
"micro_batch_size": 8,
|
| 11 |
+
"grad_accum_steps": 32,
|
| 12 |
+
"max_steps": 20000,
|
| 13 |
+
"warmup_steps": 2000,
|
| 14 |
+
"learning_rate": 0.003,
|
| 15 |
+
"min_lr": 0.0003,
|
| 16 |
+
"weight_decay": 0.1,
|
| 17 |
+
"beta1": 0.9,
|
| 18 |
+
"beta2": 0.95,
|
| 19 |
+
"grad_clip": 1.0,
|
| 20 |
+
"precision": "bf16",
|
| 21 |
+
"num_workers": 0,
|
| 22 |
+
"log_interval": 10,
|
| 23 |
+
"eval_interval": 250,
|
| 24 |
+
"eval_batches": 50,
|
| 25 |
+
"save_interval": 100,
|
| 26 |
+
"compile_model": false
|
| 27 |
+
}
|
configs/pretrain_5090_stage2_anneal.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"seed": 42,
|
| 3 |
+
"train_dir": "data/pretokenized/train",
|
| 4 |
+
"val_dir": "data/pretokenized/val",
|
| 5 |
+
"output_dir": "outputs/pretrain_stage2",
|
| 6 |
+
"checkpoint_dir": "checkpoints/pretrain_stage2",
|
| 7 |
+
"init_from": "checkpoints/pretrain_stage1/last.pt",
|
| 8 |
+
"resume_from": null,
|
| 9 |
+
"seq_len": 8192,
|
| 10 |
+
"micro_batch_size": 2,
|
| 11 |
+
"grad_accum_steps": 16,
|
| 12 |
+
"max_steps": 1000,
|
| 13 |
+
"warmup_steps": 100,
|
| 14 |
+
"learning_rate": 0.001,
|
| 15 |
+
"min_lr": 0.0001,
|
| 16 |
+
"weight_decay": 0.1,
|
| 17 |
+
"beta1": 0.9,
|
| 18 |
+
"beta2": 0.95,
|
| 19 |
+
"grad_clip": 1.0,
|
| 20 |
+
"precision": "bf16",
|
| 21 |
+
"num_workers": 0,
|
| 22 |
+
"log_interval": 5,
|
| 23 |
+
"eval_interval": 100,
|
| 24 |
+
"eval_batches": 20,
|
| 25 |
+
"save_interval": 50,
|
| 26 |
+
"compile_model": false
|
| 27 |
+
}
|
configs/pretrain_mps_dryrun.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"seed": 42,
|
| 3 |
+
"train_dir": "data/pretokenized/train",
|
| 4 |
+
"val_dir": "data/pretokenized/val",
|
| 5 |
+
"output_dir": "outputs/pretrain_mps_dryrun",
|
| 6 |
+
"checkpoint_dir": "checkpoints/pretrain_mps_dryrun",
|
| 7 |
+
"init_from": null,
|
| 8 |
+
"resume_from": null,
|
| 9 |
+
"seq_len": 512,
|
| 10 |
+
"micro_batch_size": 1,
|
| 11 |
+
"grad_accum_steps": 4,
|
| 12 |
+
"max_steps": 500,
|
| 13 |
+
"warmup_steps": 50,
|
| 14 |
+
"learning_rate": 0.001,
|
| 15 |
+
"min_lr": 0.0001,
|
| 16 |
+
"weight_decay": 0.1,
|
| 17 |
+
"beta1": 0.9,
|
| 18 |
+
"beta2": 0.95,
|
| 19 |
+
"grad_clip": 1.0,
|
| 20 |
+
"precision": "fp32",
|
| 21 |
+
"num_workers": 0,
|
| 22 |
+
"log_interval": 1,
|
| 23 |
+
"eval_interval": 10,
|
| 24 |
+
"eval_batches": 2,
|
| 25 |
+
"save_interval": 10,
|
| 26 |
+
"compile_model": false
|
| 27 |
+
}
|
configs/sft_5090.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"seed": 42,
|
| 3 |
+
"dataset_path": "data/sft/processed",
|
| 4 |
+
"output_dir": "outputs/sft",
|
| 5 |
+
"checkpoint_dir": "checkpoints/sft",
|
| 6 |
+
"init_from": "checkpoints/pretrain_stage2/last.pt",
|
| 7 |
+
"resume_from": null,
|
| 8 |
+
"seq_len": 2048,
|
| 9 |
+
"micro_batch_size": 8,
|
| 10 |
+
"grad_accum_steps": 16,
|
| 11 |
+
"max_steps": 5000,
|
| 12 |
+
"warmup_steps": 200,
|
| 13 |
+
"learning_rate": 0.0005,
|
| 14 |
+
"min_lr": 5e-05,
|
| 15 |
+
"weight_decay": 0.01,
|
| 16 |
+
"beta1": 0.9,
|
| 17 |
+
"beta2": 0.95,
|
| 18 |
+
"grad_clip": 1.0,
|
| 19 |
+
"precision": "bf16",
|
| 20 |
+
"num_workers": 0,
|
| 21 |
+
"log_interval": 10,
|
| 22 |
+
"eval_interval": 100,
|
| 23 |
+
"eval_batches": 50,
|
| 24 |
+
"save_interval": 200,
|
| 25 |
+
"compile_model": false
|
| 26 |
+
}
|
configs/sft_data_smoltalk.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"path": "HuggingFaceTB/smoltalk",
|
| 3 |
+
"config_name": null,
|
| 4 |
+
"split": "train",
|
| 5 |
+
"revision": null,
|
| 6 |
+
"streaming": false,
|
| 7 |
+
"shuffle": true,
|
| 8 |
+
"format": "messages",
|
| 9 |
+
"messages_field": "messages",
|
| 10 |
+
"val_examples": 2000,
|
| 11 |
+
"max_train_examples": 200000
|
| 12 |
+
}
|
data/.DS_Store
ADDED
|
Binary file (8.2 kB). View file
|
|
|
data/README.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
---
|
data/pretokenized/dataset_summary.json
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"tokenizer": {
|
| 3 |
+
"vocab_size": 49152,
|
| 4 |
+
"special_tokens": {
|
| 5 |
+
"pad_token": "<pad>",
|
| 6 |
+
"bos_token": "<bos>",
|
| 7 |
+
"eos_token": "<eos>",
|
| 8 |
+
"unk_token": "<unk>",
|
| 9 |
+
"pad_token_id": 0,
|
| 10 |
+
"bos_token_id": 1,
|
| 11 |
+
"eos_token_id": 2,
|
| 12 |
+
"unk_token_id": 3
|
| 13 |
+
},
|
| 14 |
+
"data_config": {
|
| 15 |
+
"sources": [
|
| 16 |
+
{
|
| 17 |
+
"name": "fineweb_edu",
|
| 18 |
+
"path": "HuggingFaceFW/fineweb-edu",
|
| 19 |
+
"split": "train",
|
| 20 |
+
"weight": 0.6,
|
| 21 |
+
"text_field": "text",
|
| 22 |
+
"config_name": "sample-10BT",
|
| 23 |
+
"data_dir": null,
|
| 24 |
+
"revision": null,
|
| 25 |
+
"streaming": true,
|
| 26 |
+
"shuffle_buffer": 10000,
|
| 27 |
+
"sample_documents": null
|
| 28 |
+
},
|
| 29 |
+
{
|
| 30 |
+
"name": "cosmopedia_v2",
|
| 31 |
+
"path": "HuggingFaceTB/smollm-corpus",
|
| 32 |
+
"split": "train",
|
| 33 |
+
"weight": 0.2,
|
| 34 |
+
"text_field": "text",
|
| 35 |
+
"config_name": "cosmopedia-v2",
|
| 36 |
+
"data_dir": null,
|
| 37 |
+
"revision": null,
|
| 38 |
+
"streaming": true,
|
| 39 |
+
"shuffle_buffer": 10000,
|
| 40 |
+
"sample_documents": null
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"name": "the_stack_python",
|
| 44 |
+
"path": "bigcode/the-stack-dedup",
|
| 45 |
+
"split": "train",
|
| 46 |
+
"weight": 0.1,
|
| 47 |
+
"text_field": "content",
|
| 48 |
+
"config_name": null,
|
| 49 |
+
"data_dir": "data/python",
|
| 50 |
+
"revision": null,
|
| 51 |
+
"streaming": true,
|
| 52 |
+
"shuffle_buffer": 2000,
|
| 53 |
+
"sample_documents": null
|
| 54 |
+
},
|
| 55 |
+
{
|
| 56 |
+
"name": "finemath",
|
| 57 |
+
"path": "HuggingFaceTB/finemath",
|
| 58 |
+
"split": "train",
|
| 59 |
+
"weight": 0.1,
|
| 60 |
+
"text_field": "text",
|
| 61 |
+
"config_name": "finemath-4plus",
|
| 62 |
+
"data_dir": null,
|
| 63 |
+
"revision": null,
|
| 64 |
+
"streaming": true,
|
| 65 |
+
"shuffle_buffer": 5000,
|
| 66 |
+
"sample_documents": null
|
| 67 |
+
}
|
| 68 |
+
],
|
| 69 |
+
"tokenizer_sample_documents": 2000000,
|
| 70 |
+
"tokenizer_min_frequency": 2,
|
| 71 |
+
"tokenizer_special_tokens": [
|
| 72 |
+
"<pad>",
|
| 73 |
+
"<bos>",
|
| 74 |
+
"<eos>",
|
| 75 |
+
"<unk>"
|
| 76 |
+
],
|
| 77 |
+
"train_tokens": 10000000000,
|
| 78 |
+
"val_tokens": 20000000,
|
| 79 |
+
"shard_size_tokens": 100000000
|
| 80 |
+
}
|
| 81 |
+
},
|
| 82 |
+
"data_config": {
|
| 83 |
+
"sources": [
|
| 84 |
+
{
|
| 85 |
+
"name": "fineweb_edu",
|
| 86 |
+
"path": "HuggingFaceFW/fineweb-edu",
|
| 87 |
+
"split": "train",
|
| 88 |
+
"weight": 0.6,
|
| 89 |
+
"text_field": "text",
|
| 90 |
+
"config_name": "sample-10BT",
|
| 91 |
+
"data_dir": null,
|
| 92 |
+
"revision": null,
|
| 93 |
+
"streaming": true,
|
| 94 |
+
"shuffle_buffer": 10000,
|
| 95 |
+
"sample_documents": null
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"name": "cosmopedia_v2",
|
| 99 |
+
"path": "HuggingFaceTB/smollm-corpus",
|
| 100 |
+
"split": "train",
|
| 101 |
+
"weight": 0.2,
|
| 102 |
+
"text_field": "text",
|
| 103 |
+
"config_name": "cosmopedia-v2",
|
| 104 |
+
"data_dir": null,
|
| 105 |
+
"revision": null,
|
| 106 |
+
"streaming": true,
|
| 107 |
+
"shuffle_buffer": 10000,
|
| 108 |
+
"sample_documents": null
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"name": "the_stack_python",
|
| 112 |
+
"path": "bigcode/the-stack-dedup",
|
| 113 |
+
"split": "train",
|
| 114 |
+
"weight": 0.1,
|
| 115 |
+
"text_field": "content",
|
| 116 |
+
"config_name": null,
|
| 117 |
+
"data_dir": "data/python",
|
| 118 |
+
"revision": null,
|
| 119 |
+
"streaming": true,
|
| 120 |
+
"shuffle_buffer": 2000,
|
| 121 |
+
"sample_documents": null
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"name": "finemath",
|
| 125 |
+
"path": "HuggingFaceTB/finemath",
|
| 126 |
+
"split": "train",
|
| 127 |
+
"weight": 0.1,
|
| 128 |
+
"text_field": "text",
|
| 129 |
+
"config_name": "finemath-4plus",
|
| 130 |
+
"data_dir": null,
|
| 131 |
+
"revision": null,
|
| 132 |
+
"streaming": true,
|
| 133 |
+
"shuffle_buffer": 5000,
|
| 134 |
+
"sample_documents": null
|
| 135 |
+
}
|
| 136 |
+
],
|
| 137 |
+
"tokenizer_sample_documents": 2000000,
|
| 138 |
+
"tokenizer_min_frequency": 2,
|
| 139 |
+
"tokenizer_special_tokens": [
|
| 140 |
+
"<pad>",
|
| 141 |
+
"<bos>",
|
| 142 |
+
"<eos>",
|
| 143 |
+
"<unk>"
|
| 144 |
+
],
|
| 145 |
+
"train_tokens": 10000000000,
|
| 146 |
+
"val_tokens": 20000000,
|
| 147 |
+
"shard_size_tokens": 100000000
|
| 148 |
+
},
|
| 149 |
+
"mixing_strategy": "global_interleaving_weighted_progress_balancing",
|
| 150 |
+
"train_target_tokens": 10000000000,
|
| 151 |
+
"val_target_tokens": 20000000,
|
| 152 |
+
"train_tokens_written": 10000000000,
|
| 153 |
+
"val_tokens_written": 20000000,
|
| 154 |
+
"train_shards": 100,
|
| 155 |
+
"val_shards": 1,
|
| 156 |
+
"sources": {
|
| 157 |
+
"fineweb_edu": {
|
| 158 |
+
"path": "HuggingFaceFW/fineweb-edu",
|
| 159 |
+
"data_dir": null,
|
| 160 |
+
"split": "train",
|
| 161 |
+
"train_target_tokens": 6000000000,
|
| 162 |
+
"val_target_tokens": 12000000,
|
| 163 |
+
"train_tokens_written": 6000000000,
|
| 164 |
+
"val_tokens_written": 12000000,
|
| 165 |
+
"documents_used": 5922817
|
| 166 |
+
},
|
| 167 |
+
"cosmopedia_v2": {
|
| 168 |
+
"path": "HuggingFaceTB/smollm-corpus",
|
| 169 |
+
"data_dir": null,
|
| 170 |
+
"split": "train",
|
| 171 |
+
"train_target_tokens": 2000000000,
|
| 172 |
+
"val_target_tokens": 4000000,
|
| 173 |
+
"train_tokens_written": 2000000000,
|
| 174 |
+
"val_tokens_written": 4000000,
|
| 175 |
+
"documents_used": 2792704
|
| 176 |
+
},
|
| 177 |
+
"the_stack_python": {
|
| 178 |
+
"path": "bigcode/the-stack-dedup",
|
| 179 |
+
"data_dir": "data/python",
|
| 180 |
+
"split": "train",
|
| 181 |
+
"train_target_tokens": 1000000000,
|
| 182 |
+
"val_target_tokens": 2000000,
|
| 183 |
+
"train_tokens_written": 1000000000,
|
| 184 |
+
"val_tokens_written": 2000000,
|
| 185 |
+
"documents_used": 684540
|
| 186 |
+
},
|
| 187 |
+
"finemath": {
|
| 188 |
+
"path": "HuggingFaceTB/finemath",
|
| 189 |
+
"data_dir": null,
|
| 190 |
+
"split": "train",
|
| 191 |
+
"train_target_tokens": 1000000000,
|
| 192 |
+
"val_target_tokens": 2000000,
|
| 193 |
+
"train_tokens_written": 1000000000,
|
| 194 |
+
"val_tokens_written": 2000000,
|
| 195 |
+
"documents_used": 692367
|
| 196 |
+
}
|
| 197 |
+
}
|
| 198 |
+
}
|
data/pretokenized/logs/prepare_pretrain_data_20260313_091113.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/pretokenized/train/train_manifest.json
ADDED
|
@@ -0,0 +1,502 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"path": "train_00000.bin",
|
| 4 |
+
"num_tokens": 100000000,
|
| 5 |
+
"dtype": "uint16"
|
| 6 |
+
},
|
| 7 |
+
{
|
| 8 |
+
"path": "train_00001.bin",
|
| 9 |
+
"num_tokens": 100000000,
|
| 10 |
+
"dtype": "uint16"
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"path": "train_00002.bin",
|
| 14 |
+
"num_tokens": 100000000,
|
| 15 |
+
"dtype": "uint16"
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"path": "train_00003.bin",
|
| 19 |
+
"num_tokens": 100000000,
|
| 20 |
+
"dtype": "uint16"
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"path": "train_00004.bin",
|
| 24 |
+
"num_tokens": 100000000,
|
| 25 |
+
"dtype": "uint16"
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"path": "train_00005.bin",
|
| 29 |
+
"num_tokens": 100000000,
|
| 30 |
+
"dtype": "uint16"
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"path": "train_00006.bin",
|
| 34 |
+
"num_tokens": 100000000,
|
| 35 |
+
"dtype": "uint16"
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"path": "train_00007.bin",
|
| 39 |
+
"num_tokens": 100000000,
|
| 40 |
+
"dtype": "uint16"
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"path": "train_00008.bin",
|
| 44 |
+
"num_tokens": 100000000,
|
| 45 |
+
"dtype": "uint16"
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"path": "train_00009.bin",
|
| 49 |
+
"num_tokens": 100000000,
|
| 50 |
+
"dtype": "uint16"
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"path": "train_00010.bin",
|
| 54 |
+
"num_tokens": 100000000,
|
| 55 |
+
"dtype": "uint16"
|
| 56 |
+
},
|
| 57 |
+
{
|
| 58 |
+
"path": "train_00011.bin",
|
| 59 |
+
"num_tokens": 100000000,
|
| 60 |
+
"dtype": "uint16"
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"path": "train_00012.bin",
|
| 64 |
+
"num_tokens": 100000000,
|
| 65 |
+
"dtype": "uint16"
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"path": "train_00013.bin",
|
| 69 |
+
"num_tokens": 100000000,
|
| 70 |
+
"dtype": "uint16"
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"path": "train_00014.bin",
|
| 74 |
+
"num_tokens": 100000000,
|
| 75 |
+
"dtype": "uint16"
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"path": "train_00015.bin",
|
| 79 |
+
"num_tokens": 100000000,
|
| 80 |
+
"dtype": "uint16"
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"path": "train_00016.bin",
|
| 84 |
+
"num_tokens": 100000000,
|
| 85 |
+
"dtype": "uint16"
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
"path": "train_00017.bin",
|
| 89 |
+
"num_tokens": 100000000,
|
| 90 |
+
"dtype": "uint16"
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"path": "train_00018.bin",
|
| 94 |
+
"num_tokens": 100000000,
|
| 95 |
+
"dtype": "uint16"
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"path": "train_00019.bin",
|
| 99 |
+
"num_tokens": 100000000,
|
| 100 |
+
"dtype": "uint16"
|
| 101 |
+
},
|
| 102 |
+
{
|
| 103 |
+
"path": "train_00020.bin",
|
| 104 |
+
"num_tokens": 100000000,
|
| 105 |
+
"dtype": "uint16"
|
| 106 |
+
},
|
| 107 |
+
{
|
| 108 |
+
"path": "train_00021.bin",
|
| 109 |
+
"num_tokens": 100000000,
|
| 110 |
+
"dtype": "uint16"
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"path": "train_00022.bin",
|
| 114 |
+
"num_tokens": 100000000,
|
| 115 |
+
"dtype": "uint16"
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"path": "train_00023.bin",
|
| 119 |
+
"num_tokens": 100000000,
|
| 120 |
+
"dtype": "uint16"
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"path": "train_00024.bin",
|
| 124 |
+
"num_tokens": 100000000,
|
| 125 |
+
"dtype": "uint16"
|
| 126 |
+
},
|
| 127 |
+
{
|
| 128 |
+
"path": "train_00025.bin",
|
| 129 |
+
"num_tokens": 100000000,
|
| 130 |
+
"dtype": "uint16"
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"path": "train_00026.bin",
|
| 134 |
+
"num_tokens": 100000000,
|
| 135 |
+
"dtype": "uint16"
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"path": "train_00027.bin",
|
| 139 |
+
"num_tokens": 100000000,
|
| 140 |
+
"dtype": "uint16"
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"path": "train_00028.bin",
|
| 144 |
+
"num_tokens": 100000000,
|
| 145 |
+
"dtype": "uint16"
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"path": "train_00029.bin",
|
| 149 |
+
"num_tokens": 100000000,
|
| 150 |
+
"dtype": "uint16"
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"path": "train_00030.bin",
|
| 154 |
+
"num_tokens": 100000000,
|
| 155 |
+
"dtype": "uint16"
|
| 156 |
+
},
|
| 157 |
+
{
|
| 158 |
+
"path": "train_00031.bin",
|
| 159 |
+
"num_tokens": 100000000,
|
| 160 |
+
"dtype": "uint16"
|
| 161 |
+
},
|
| 162 |
+
{
|
| 163 |
+
"path": "train_00032.bin",
|
| 164 |
+
"num_tokens": 100000000,
|
| 165 |
+
"dtype": "uint16"
|
| 166 |
+
},
|
| 167 |
+
{
|
| 168 |
+
"path": "train_00033.bin",
|
| 169 |
+
"num_tokens": 100000000,
|
| 170 |
+
"dtype": "uint16"
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"path": "train_00034.bin",
|
| 174 |
+
"num_tokens": 100000000,
|
| 175 |
+
"dtype": "uint16"
|
| 176 |
+
},
|
| 177 |
+
{
|
| 178 |
+
"path": "train_00035.bin",
|
| 179 |
+
"num_tokens": 100000000,
|
| 180 |
+
"dtype": "uint16"
|
| 181 |
+
},
|
| 182 |
+
{
|
| 183 |
+
"path": "train_00036.bin",
|
| 184 |
+
"num_tokens": 100000000,
|
| 185 |
+
"dtype": "uint16"
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"path": "train_00037.bin",
|
| 189 |
+
"num_tokens": 100000000,
|
| 190 |
+
"dtype": "uint16"
|
| 191 |
+
},
|
| 192 |
+
{
|
| 193 |
+
"path": "train_00038.bin",
|
| 194 |
+
"num_tokens": 100000000,
|
| 195 |
+
"dtype": "uint16"
|
| 196 |
+
},
|
| 197 |
+
{
|
| 198 |
+
"path": "train_00039.bin",
|
| 199 |
+
"num_tokens": 100000000,
|
| 200 |
+
"dtype": "uint16"
|
| 201 |
+
},
|
| 202 |
+
{
|
| 203 |
+
"path": "train_00040.bin",
|
| 204 |
+
"num_tokens": 100000000,
|
| 205 |
+
"dtype": "uint16"
|
| 206 |
+
},
|
| 207 |
+
{
|
| 208 |
+
"path": "train_00041.bin",
|
| 209 |
+
"num_tokens": 100000000,
|
| 210 |
+
"dtype": "uint16"
|
| 211 |
+
},
|
| 212 |
+
{
|
| 213 |
+
"path": "train_00042.bin",
|
| 214 |
+
"num_tokens": 100000000,
|
| 215 |
+
"dtype": "uint16"
|
| 216 |
+
},
|
| 217 |
+
{
|
| 218 |
+
"path": "train_00043.bin",
|
| 219 |
+
"num_tokens": 100000000,
|
| 220 |
+
"dtype": "uint16"
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"path": "train_00044.bin",
|
| 224 |
+
"num_tokens": 100000000,
|
| 225 |
+
"dtype": "uint16"
|
| 226 |
+
},
|
| 227 |
+
{
|
| 228 |
+
"path": "train_00045.bin",
|
| 229 |
+
"num_tokens": 100000000,
|
| 230 |
+
"dtype": "uint16"
|
| 231 |
+
},
|
| 232 |
+
{
|
| 233 |
+
"path": "train_00046.bin",
|
| 234 |
+
"num_tokens": 100000000,
|
| 235 |
+
"dtype": "uint16"
|
| 236 |
+
},
|
| 237 |
+
{
|
| 238 |
+
"path": "train_00047.bin",
|
| 239 |
+
"num_tokens": 100000000,
|
| 240 |
+
"dtype": "uint16"
|
| 241 |
+
},
|
| 242 |
+
{
|
| 243 |
+
"path": "train_00048.bin",
|
| 244 |
+
"num_tokens": 100000000,
|
| 245 |
+
"dtype": "uint16"
|
| 246 |
+
},
|
| 247 |
+
{
|
| 248 |
+
"path": "train_00049.bin",
|
| 249 |
+
"num_tokens": 100000000,
|
| 250 |
+
"dtype": "uint16"
|
| 251 |
+
},
|
| 252 |
+
{
|
| 253 |
+
"path": "train_00050.bin",
|
| 254 |
+
"num_tokens": 100000000,
|
| 255 |
+
"dtype": "uint16"
|
| 256 |
+
},
|
| 257 |
+
{
|
| 258 |
+
"path": "train_00051.bin",
|
| 259 |
+
"num_tokens": 100000000,
|
| 260 |
+
"dtype": "uint16"
|
| 261 |
+
},
|
| 262 |
+
{
|
| 263 |
+
"path": "train_00052.bin",
|
| 264 |
+
"num_tokens": 100000000,
|
| 265 |
+
"dtype": "uint16"
|
| 266 |
+
},
|
| 267 |
+
{
|
| 268 |
+
"path": "train_00053.bin",
|
| 269 |
+
"num_tokens": 100000000,
|
| 270 |
+
"dtype": "uint16"
|
| 271 |
+
},
|
| 272 |
+
{
|
| 273 |
+
"path": "train_00054.bin",
|
| 274 |
+
"num_tokens": 100000000,
|
| 275 |
+
"dtype": "uint16"
|
| 276 |
+
},
|
| 277 |
+
{
|
| 278 |
+
"path": "train_00055.bin",
|
| 279 |
+
"num_tokens": 100000000,
|
| 280 |
+
"dtype": "uint16"
|
| 281 |
+
},
|
| 282 |
+
{
|
| 283 |
+
"path": "train_00056.bin",
|
| 284 |
+
"num_tokens": 100000000,
|
| 285 |
+
"dtype": "uint16"
|
| 286 |
+
},
|
| 287 |
+
{
|
| 288 |
+
"path": "train_00057.bin",
|
| 289 |
+
"num_tokens": 100000000,
|
| 290 |
+
"dtype": "uint16"
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"path": "train_00058.bin",
|
| 294 |
+
"num_tokens": 100000000,
|
| 295 |
+
"dtype": "uint16"
|
| 296 |
+
},
|
| 297 |
+
{
|
| 298 |
+
"path": "train_00059.bin",
|
| 299 |
+
"num_tokens": 100000000,
|
| 300 |
+
"dtype": "uint16"
|
| 301 |
+
},
|
| 302 |
+
{
|
| 303 |
+
"path": "train_00060.bin",
|
| 304 |
+
"num_tokens": 100000000,
|
| 305 |
+
"dtype": "uint16"
|
| 306 |
+
},
|
| 307 |
+
{
|
| 308 |
+
"path": "train_00061.bin",
|
| 309 |
+
"num_tokens": 100000000,
|
| 310 |
+
"dtype": "uint16"
|
| 311 |
+
},
|
| 312 |
+
{
|
| 313 |
+
"path": "train_00062.bin",
|
| 314 |
+
"num_tokens": 100000000,
|
| 315 |
+
"dtype": "uint16"
|
| 316 |
+
},
|
| 317 |
+
{
|
| 318 |
+
"path": "train_00063.bin",
|
| 319 |
+
"num_tokens": 100000000,
|
| 320 |
+
"dtype": "uint16"
|
| 321 |
+
},
|
| 322 |
+
{
|
| 323 |
+
"path": "train_00064.bin",
|
| 324 |
+
"num_tokens": 100000000,
|
| 325 |
+
"dtype": "uint16"
|
| 326 |
+
},
|
| 327 |
+
{
|
| 328 |
+
"path": "train_00065.bin",
|
| 329 |
+
"num_tokens": 100000000,
|
| 330 |
+
"dtype": "uint16"
|
| 331 |
+
},
|
| 332 |
+
{
|
| 333 |
+
"path": "train_00066.bin",
|
| 334 |
+
"num_tokens": 100000000,
|
| 335 |
+
"dtype": "uint16"
|
| 336 |
+
},
|
| 337 |
+
{
|
| 338 |
+
"path": "train_00067.bin",
|
| 339 |
+
"num_tokens": 100000000,
|
| 340 |
+
"dtype": "uint16"
|
| 341 |
+
},
|
| 342 |
+
{
|
| 343 |
+
"path": "train_00068.bin",
|
| 344 |
+
"num_tokens": 100000000,
|
| 345 |
+
"dtype": "uint16"
|
| 346 |
+
},
|
| 347 |
+
{
|
| 348 |
+
"path": "train_00069.bin",
|
| 349 |
+
"num_tokens": 100000000,
|
| 350 |
+
"dtype": "uint16"
|
| 351 |
+
},
|
| 352 |
+
{
|
| 353 |
+
"path": "train_00070.bin",
|
| 354 |
+
"num_tokens": 100000000,
|
| 355 |
+
"dtype": "uint16"
|
| 356 |
+
},
|
| 357 |
+
{
|
| 358 |
+
"path": "train_00071.bin",
|
| 359 |
+
"num_tokens": 100000000,
|
| 360 |
+
"dtype": "uint16"
|
| 361 |
+
},
|
| 362 |
+
{
|
| 363 |
+
"path": "train_00072.bin",
|
| 364 |
+
"num_tokens": 100000000,
|
| 365 |
+
"dtype": "uint16"
|
| 366 |
+
},
|
| 367 |
+
{
|
| 368 |
+
"path": "train_00073.bin",
|
| 369 |
+
"num_tokens": 100000000,
|
| 370 |
+
"dtype": "uint16"
|
| 371 |
+
},
|
| 372 |
+
{
|
| 373 |
+
"path": "train_00074.bin",
|
| 374 |
+
"num_tokens": 100000000,
|
| 375 |
+
"dtype": "uint16"
|
| 376 |
+
},
|
| 377 |
+
{
|
| 378 |
+
"path": "train_00075.bin",
|
| 379 |
+
"num_tokens": 100000000,
|
| 380 |
+
"dtype": "uint16"
|
| 381 |
+
},
|
| 382 |
+
{
|
| 383 |
+
"path": "train_00076.bin",
|
| 384 |
+
"num_tokens": 100000000,
|
| 385 |
+
"dtype": "uint16"
|
| 386 |
+
},
|
| 387 |
+
{
|
| 388 |
+
"path": "train_00077.bin",
|
| 389 |
+
"num_tokens": 100000000,
|
| 390 |
+
"dtype": "uint16"
|
| 391 |
+
},
|
| 392 |
+
{
|
| 393 |
+
"path": "train_00078.bin",
|
| 394 |
+
"num_tokens": 100000000,
|
| 395 |
+
"dtype": "uint16"
|
| 396 |
+
},
|
| 397 |
+
{
|
| 398 |
+
"path": "train_00079.bin",
|
| 399 |
+
"num_tokens": 100000000,
|
| 400 |
+
"dtype": "uint16"
|
| 401 |
+
},
|
| 402 |
+
{
|
| 403 |
+
"path": "train_00080.bin",
|
| 404 |
+
"num_tokens": 100000000,
|
| 405 |
+
"dtype": "uint16"
|
| 406 |
+
},
|
| 407 |
+
{
|
| 408 |
+
"path": "train_00081.bin",
|
| 409 |
+
"num_tokens": 100000000,
|
| 410 |
+
"dtype": "uint16"
|
| 411 |
+
},
|
| 412 |
+
{
|
| 413 |
+
"path": "train_00082.bin",
|
| 414 |
+
"num_tokens": 100000000,
|
| 415 |
+
"dtype": "uint16"
|
| 416 |
+
},
|
| 417 |
+
{
|
| 418 |
+
"path": "train_00083.bin",
|
| 419 |
+
"num_tokens": 100000000,
|
| 420 |
+
"dtype": "uint16"
|
| 421 |
+
},
|
| 422 |
+
{
|
| 423 |
+
"path": "train_00084.bin",
|
| 424 |
+
"num_tokens": 100000000,
|
| 425 |
+
"dtype": "uint16"
|
| 426 |
+
},
|
| 427 |
+
{
|
| 428 |
+
"path": "train_00085.bin",
|
| 429 |
+
"num_tokens": 100000000,
|
| 430 |
+
"dtype": "uint16"
|
| 431 |
+
},
|
| 432 |
+
{
|
| 433 |
+
"path": "train_00086.bin",
|
| 434 |
+
"num_tokens": 100000000,
|
| 435 |
+
"dtype": "uint16"
|
| 436 |
+
},
|
| 437 |
+
{
|
| 438 |
+
"path": "train_00087.bin",
|
| 439 |
+
"num_tokens": 100000000,
|
| 440 |
+
"dtype": "uint16"
|
| 441 |
+
},
|
| 442 |
+
{
|
| 443 |
+
"path": "train_00088.bin",
|
| 444 |
+
"num_tokens": 100000000,
|
| 445 |
+
"dtype": "uint16"
|
| 446 |
+
},
|
| 447 |
+
{
|
| 448 |
+
"path": "train_00089.bin",
|
| 449 |
+
"num_tokens": 100000000,
|
| 450 |
+
"dtype": "uint16"
|
| 451 |
+
},
|
| 452 |
+
{
|
| 453 |
+
"path": "train_00090.bin",
|
| 454 |
+
"num_tokens": 100000000,
|
| 455 |
+
"dtype": "uint16"
|
| 456 |
+
},
|
| 457 |
+
{
|
| 458 |
+
"path": "train_00091.bin",
|
| 459 |
+
"num_tokens": 100000000,
|
| 460 |
+
"dtype": "uint16"
|
| 461 |
+
},
|
| 462 |
+
{
|
| 463 |
+
"path": "train_00092.bin",
|
| 464 |
+
"num_tokens": 100000000,
|
| 465 |
+
"dtype": "uint16"
|
| 466 |
+
},
|
| 467 |
+
{
|
| 468 |
+
"path": "train_00093.bin",
|
| 469 |
+
"num_tokens": 100000000,
|
| 470 |
+
"dtype": "uint16"
|
| 471 |
+
},
|
| 472 |
+
{
|
| 473 |
+
"path": "train_00094.bin",
|
| 474 |
+
"num_tokens": 100000000,
|
| 475 |
+
"dtype": "uint16"
|
| 476 |
+
},
|
| 477 |
+
{
|
| 478 |
+
"path": "train_00095.bin",
|
| 479 |
+
"num_tokens": 100000000,
|
| 480 |
+
"dtype": "uint16"
|
| 481 |
+
},
|
| 482 |
+
{
|
| 483 |
+
"path": "train_00096.bin",
|
| 484 |
+
"num_tokens": 100000000,
|
| 485 |
+
"dtype": "uint16"
|
| 486 |
+
},
|
| 487 |
+
{
|
| 488 |
+
"path": "train_00097.bin",
|
| 489 |
+
"num_tokens": 100000000,
|
| 490 |
+
"dtype": "uint16"
|
| 491 |
+
},
|
| 492 |
+
{
|
| 493 |
+
"path": "train_00098.bin",
|
| 494 |
+
"num_tokens": 100000000,
|
| 495 |
+
"dtype": "uint16"
|
| 496 |
+
},
|
| 497 |
+
{
|
| 498 |
+
"path": "train_00099.bin",
|
| 499 |
+
"num_tokens": 100000000,
|
| 500 |
+
"dtype": "uint16"
|
| 501 |
+
}
|
| 502 |
+
]
|
data/pretokenized/val/val_manifest.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"path": "val_00000.bin",
|
| 4 |
+
"num_tokens": 20000000,
|
| 5 |
+
"dtype": "uint16"
|
| 6 |
+
}
|
| 7 |
+
]
|
data/tokenizer/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
data/tokenizer/logs/train_tokenizer_20260312_114030.log
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-03-12 11:40:30,043 | INFO | Tokenizer training started
|
| 2 |
+
2026-03-12 11:40:30,044 | INFO | Log file: data/tokenizer/logs/train_tokenizer_20260312_114030.log
|
| 3 |
+
2026-03-12 11:40:30,044 | INFO | Arguments | data_config=configs/data_mix_10b.json output_dir=data/tokenizer vocab_size=49152 seed=42
|
| 4 |
+
2026-03-12 11:40:30,044 | INFO | Tokenizer config | sample_documents=2,000,000 min_frequency=2 special_tokens=['<pad>', '<bos>', '<eos>', '<unk>'] num_sources=4
|
| 5 |
+
2026-03-12 11:40:30,044 | INFO | Tokenizer source start | name=fineweb_edu path=HuggingFaceFW/fineweb-edu data_dir=None split=train text_field=text limit_docs=1,200,000 streaming=True
|
| 6 |
+
2026-03-12 11:51:35,669 | INFO | Tokenizer source start | name=cosmopedia_v2 path=HuggingFaceTB/smollm-corpus data_dir=None split=train text_field=text limit_docs=400,000 streaming=True
|
| 7 |
+
2026-03-12 11:55:58,013 | INFO | Tokenizer source start | name=the_stack_python path=bigcode/the-stack-dedup data_dir=data/python split=train text_field=content limit_docs=200,000 streaming=True
|
| 8 |
+
2026-03-12 12:00:03,620 | INFO | Tokenizer source start | name=finemath path=HuggingFaceTB/finemath data_dir=None split=train text_field=text limit_docs=200,000 streaming=True
|
| 9 |
+
2026-03-12 12:08:46,619 | INFO | Tokenizer saved | path=data/tokenizer/tokenizer.json
|
| 10 |
+
2026-03-12 12:08:46,630 | INFO | Tokenizer summary | vocab_size=49152 pad_id=0 bos_id=1 eos_id=2 unk_id=3
|
| 11 |
+
2026-03-12 12:08:46,630 | INFO | Tokenizer metadata saved | path=data/tokenizer/tokenizer_meta.json
|
data/tokenizer/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/tokenizer/tokenizer_meta.json
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"vocab_size": 49152,
|
| 3 |
+
"special_tokens": {
|
| 4 |
+
"pad_token": "<pad>",
|
| 5 |
+
"bos_token": "<bos>",
|
| 6 |
+
"eos_token": "<eos>",
|
| 7 |
+
"unk_token": "<unk>",
|
| 8 |
+
"pad_token_id": 0,
|
| 9 |
+
"bos_token_id": 1,
|
| 10 |
+
"eos_token_id": 2,
|
| 11 |
+
"unk_token_id": 3
|
| 12 |
+
},
|
| 13 |
+
"data_config": {
|
| 14 |
+
"sources": [
|
| 15 |
+
{
|
| 16 |
+
"name": "fineweb_edu",
|
| 17 |
+
"path": "HuggingFaceFW/fineweb-edu",
|
| 18 |
+
"split": "train",
|
| 19 |
+
"weight": 0.6,
|
| 20 |
+
"text_field": "text",
|
| 21 |
+
"config_name": "sample-10BT",
|
| 22 |
+
"data_dir": null,
|
| 23 |
+
"revision": null,
|
| 24 |
+
"streaming": true,
|
| 25 |
+
"shuffle_buffer": 10000,
|
| 26 |
+
"sample_documents": null
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"name": "cosmopedia_v2",
|
| 30 |
+
"path": "HuggingFaceTB/smollm-corpus",
|
| 31 |
+
"split": "train",
|
| 32 |
+
"weight": 0.2,
|
| 33 |
+
"text_field": "text",
|
| 34 |
+
"config_name": "cosmopedia-v2",
|
| 35 |
+
"data_dir": null,
|
| 36 |
+
"revision": null,
|
| 37 |
+
"streaming": true,
|
| 38 |
+
"shuffle_buffer": 10000,
|
| 39 |
+
"sample_documents": null
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"name": "the_stack_python",
|
| 43 |
+
"path": "bigcode/the-stack-dedup",
|
| 44 |
+
"split": "train",
|
| 45 |
+
"weight": 0.1,
|
| 46 |
+
"text_field": "content",
|
| 47 |
+
"config_name": null,
|
| 48 |
+
"data_dir": "data/python",
|
| 49 |
+
"revision": null,
|
| 50 |
+
"streaming": true,
|
| 51 |
+
"shuffle_buffer": 2000,
|
| 52 |
+
"sample_documents": null
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"name": "finemath",
|
| 56 |
+
"path": "HuggingFaceTB/finemath",
|
| 57 |
+
"split": "train",
|
| 58 |
+
"weight": 0.1,
|
| 59 |
+
"text_field": "text",
|
| 60 |
+
"config_name": "finemath-4plus",
|
| 61 |
+
"data_dir": null,
|
| 62 |
+
"revision": null,
|
| 63 |
+
"streaming": true,
|
| 64 |
+
"shuffle_buffer": 5000,
|
| 65 |
+
"sample_documents": null
|
| 66 |
+
}
|
| 67 |
+
],
|
| 68 |
+
"tokenizer_sample_documents": 2000000,
|
| 69 |
+
"tokenizer_min_frequency": 2,
|
| 70 |
+
"tokenizer_special_tokens": [
|
| 71 |
+
"<pad>",
|
| 72 |
+
"<bos>",
|
| 73 |
+
"<eos>",
|
| 74 |
+
"<unk>"
|
| 75 |
+
],
|
| 76 |
+
"train_tokens": 10000000000,
|
| 77 |
+
"val_tokens": 20000000,
|
| 78 |
+
"shard_size_tokens": 100000000
|
| 79 |
+
}
|
| 80 |
+
}
|
data/tokenizer/tokenizer_summary.json
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"vocab_size": 49152,
|
| 3 |
+
"special_tokens": {
|
| 4 |
+
"pad_token": "<pad>",
|
| 5 |
+
"bos_token": "<bos>",
|
| 6 |
+
"eos_token": "<eos>",
|
| 7 |
+
"unk_token": "<unk>",
|
| 8 |
+
"pad_token_id": 0,
|
| 9 |
+
"bos_token_id": 1,
|
| 10 |
+
"eos_token_id": 2,
|
| 11 |
+
"unk_token_id": 3
|
| 12 |
+
},
|
| 13 |
+
"data_config": {
|
| 14 |
+
"sources": [
|
| 15 |
+
{
|
| 16 |
+
"name": "fineweb_edu",
|
| 17 |
+
"path": "HuggingFaceFW/fineweb-edu",
|
| 18 |
+
"split": "train",
|
| 19 |
+
"weight": 0.6,
|
| 20 |
+
"text_field": "text",
|
| 21 |
+
"config_name": "sample-10BT",
|
| 22 |
+
"data_dir": null,
|
| 23 |
+
"revision": null,
|
| 24 |
+
"streaming": true,
|
| 25 |
+
"shuffle_buffer": 10000,
|
| 26 |
+
"sample_documents": null
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"name": "cosmopedia_v2",
|
| 30 |
+
"path": "HuggingFaceTB/smollm-corpus",
|
| 31 |
+
"split": "train",
|
| 32 |
+
"weight": 0.2,
|
| 33 |
+
"text_field": "text",
|
| 34 |
+
"config_name": "cosmopedia-v2",
|
| 35 |
+
"data_dir": null,
|
| 36 |
+
"revision": null,
|
| 37 |
+
"streaming": true,
|
| 38 |
+
"shuffle_buffer": 10000,
|
| 39 |
+
"sample_documents": null
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"name": "the_stack_python",
|
| 43 |
+
"path": "bigcode/the-stack-dedup",
|
| 44 |
+
"split": "train",
|
| 45 |
+
"weight": 0.1,
|
| 46 |
+
"text_field": "content",
|
| 47 |
+
"config_name": null,
|
| 48 |
+
"data_dir": "data/python",
|
| 49 |
+
"revision": null,
|
| 50 |
+
"streaming": true,
|
| 51 |
+
"shuffle_buffer": 2000,
|
| 52 |
+
"sample_documents": null
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"name": "finemath",
|
| 56 |
+
"path": "HuggingFaceTB/finemath",
|
| 57 |
+
"split": "train",
|
| 58 |
+
"weight": 0.1,
|
| 59 |
+
"text_field": "text",
|
| 60 |
+
"config_name": "finemath-4plus",
|
| 61 |
+
"data_dir": null,
|
| 62 |
+
"revision": null,
|
| 63 |
+
"streaming": true,
|
| 64 |
+
"shuffle_buffer": 5000,
|
| 65 |
+
"sample_documents": null
|
| 66 |
+
}
|
| 67 |
+
],
|
| 68 |
+
"tokenizer_sample_documents": 2000000,
|
| 69 |
+
"tokenizer_min_frequency": 2,
|
| 70 |
+
"tokenizer_special_tokens": [
|
| 71 |
+
"<pad>",
|
| 72 |
+
"<bos>",
|
| 73 |
+
"<eos>",
|
| 74 |
+
"<unk>"
|
| 75 |
+
],
|
| 76 |
+
"train_tokens": 10000000000,
|
| 77 |
+
"val_tokens": 20000000,
|
| 78 |
+
"shard_size_tokens": 100000000
|
| 79 |
+
}
|
| 80 |
+
}
|
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140730.jsonl
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"event": "run_started", "timestamp": "2026-03-13T14:07:30", "log_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140730.log", "metrics_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140730.jsonl", "model_config": {"vocab_size": 49152, "max_seq_len": 8192, "d_model": 384, "n_layers": 32, "n_heads": 6, "ffn_hidden_dim": 1024, "rope_theta": 10000.0, "rms_norm_eps": 1e-05, "initializer_range": 0.02, "dropout": 0.0, "tie_word_embeddings": true, "bias": false, "pad_token_id": 0, "bos_token_id": 1, "eos_token_id": 2}, "train_config": {"seed": 42, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val", "output_dir": "outputs/pretrain_mps_dryrun", "checkpoint_dir": "checkpoints/pretrain_mps_dryrun", "init_from": null, "resume_from": null, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "max_steps": 20, "warmup_steps": 5, "learning_rate": 0.001, "min_lr": 0.0001, "weight_decay": 0.1, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "precision": "fp32", "num_workers": 0, "log_interval": 1, "eval_interval": 10, "eval_batches": 2, "save_interval": 10, "compile_model": false}, "args": {"model_config": "configs/model_70m.json", "train_config": "configs/pretrain_mps_dryrun.json", "max_steps_override": null}}
|
| 2 |
+
{"event": "runtime_summary", "timestamp": "2026-03-13T14:07:34", "device": "mps", "precision": "fp32", "compile_model": false, "parameters": 75571584, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "tokens_per_step": 2048, "num_train_shards": 100, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val"}
|
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140730.log
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-03-13 14:07:30,831 | INFO | Pretraining started
|
| 2 |
+
2026-03-13 14:07:30,832 | INFO | Log file: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140730.log
|
| 3 |
+
2026-03-13 14:07:30,832 | INFO | Metrics JSONL: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140730.jsonl
|
| 4 |
+
2026-03-13 14:07:30,832 | INFO | Arguments | model_config=configs/model_70m.json train_config=configs/pretrain_mps_dryrun.json max_steps_override=None
|
| 5 |
+
2026-03-13 14:07:30,832 | INFO | Model config | {'vocab_size': 49152, 'max_seq_len': 8192, 'd_model': 384, 'n_layers': 32, 'n_heads': 6, 'ffn_hidden_dim': 1024, 'rope_theta': 10000.0, 'rms_norm_eps': 1e-05, 'initializer_range': 0.02, 'dropout': 0.0, 'tie_word_embeddings': True, 'bias': False, 'pad_token_id': 0, 'bos_token_id': 1, 'eos_token_id': 2}
|
| 6 |
+
2026-03-13 14:07:30,832 | INFO | Train config | {'seed': 42, 'train_dir': 'data/pretokenized/train', 'val_dir': 'data/pretokenized/val', 'output_dir': 'outputs/pretrain_mps_dryrun', 'checkpoint_dir': 'checkpoints/pretrain_mps_dryrun', 'init_from': None, 'resume_from': None, 'seq_len': 512, 'micro_batch_size': 1, 'grad_accum_steps': 4, 'max_steps': 20, 'warmup_steps': 5, 'learning_rate': 0.001, 'min_lr': 0.0001, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'precision': 'fp32', 'num_workers': 0, 'log_interval': 1, 'eval_interval': 10, 'eval_batches': 2, 'save_interval': 10, 'compile_model': False}
|
| 7 |
+
2026-03-13 14:07:34,596 | INFO | Device summary | device=mps precision=fp32 compile_model=False
|
| 8 |
+
2026-03-13 14:07:34,597 | INFO | Model summary | parameters=75.57M
|
| 9 |
+
2026-03-13 14:07:34,597 | INFO | Batch summary | seq_len=512 micro_batch_size=1 grad_accum_steps=4 tokens_per_step=2,048
|
| 10 |
+
2026-03-13 14:07:34,597 | INFO | Dataset summary | train_dir=data/pretokenized/train val_dir=data/pretokenized/val num_train_shards=100
|
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140907.jsonl
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"event": "run_started", "timestamp": "2026-03-13T14:09:07", "log_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140907.log", "metrics_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140907.jsonl", "model_config": {"vocab_size": 49152, "max_seq_len": 8192, "d_model": 384, "n_layers": 32, "n_heads": 6, "ffn_hidden_dim": 1024, "rope_theta": 10000.0, "rms_norm_eps": 1e-05, "initializer_range": 0.02, "dropout": 0.0, "tie_word_embeddings": true, "bias": false, "pad_token_id": 0, "bos_token_id": 1, "eos_token_id": 2}, "train_config": {"seed": 42, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val", "output_dir": "outputs/pretrain_mps_dryrun", "checkpoint_dir": "checkpoints/pretrain_mps_dryrun", "init_from": null, "resume_from": null, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "max_steps": 20, "warmup_steps": 5, "learning_rate": 0.001, "min_lr": 0.0001, "weight_decay": 0.1, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "precision": "fp32", "num_workers": 0, "log_interval": 1, "eval_interval": 10, "eval_batches": 2, "save_interval": 10, "compile_model": false}, "args": {"model_config": "configs/model_70m.json", "train_config": "configs/pretrain_mps_dryrun.json", "max_steps_override": null}}
|
| 2 |
+
{"event": "runtime_summary", "timestamp": "2026-03-13T14:09:10", "device": "mps", "precision": "fp32", "compile_model": false, "parameters": 75571584, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "tokens_per_step": 2048, "num_train_shards": 100, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val"}
|
| 3 |
+
{"event": "train", "timestamp": "2026-03-13T14:09:12", "step": 1, "loss": 10.848917245864868, "lr": 0.0002, "tok_per_sec": 961.8014053409653, "grad_norm": 5.573695659637451, "tokens_seen": 2048, "elapsed_sec": 2.1293377080000937, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 4 |
+
{"event": "train", "timestamp": "2026-03-13T14:09:13", "step": 2, "loss": 10.763139724731445, "lr": 0.0004, "tok_per_sec": 1605.8989070685525, "grad_norm": 8.322466850280762, "tokens_seen": 4096, "elapsed_sec": 1.275298208987806, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 5 |
+
{"event": "train", "timestamp": "2026-03-13T14:09:14", "step": 3, "loss": 10.356749057769775, "lr": 0.0006000000000000001, "tok_per_sec": 2736.6722939747565, "grad_norm": 2.6283912658691406, "tokens_seen": 6144, "elapsed_sec": 0.7483541250112467, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 6 |
+
{"event": "train", "timestamp": "2026-03-13T14:09:14", "step": 4, "loss": 10.376826286315918, "lr": 0.0008, "tok_per_sec": 2756.3866613090086, "grad_norm": 2.217130184173584, "tokens_seen": 8192, "elapsed_sec": 0.7430017089936882, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 7 |
+
{"event": "train", "timestamp": "2026-03-13T14:09:15", "step": 5, "loss": 10.231549263000488, "lr": 0.001, "tok_per_sec": 2715.4393876891622, "grad_norm": 12.64534854888916, "tokens_seen": 10240, "elapsed_sec": 0.754205750010442, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 8 |
+
{"event": "train", "timestamp": "2026-03-13T14:09:16", "step": 6, "loss": 9.938905477523804, "lr": 0.001, "tok_per_sec": 2725.141290121042, "grad_norm": 1.7282862663269043, "tokens_seen": 12288, "elapsed_sec": 0.7515206669922918, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 9 |
+
{"event": "train", "timestamp": "2026-03-13T14:09:17", "step": 7, "loss": 9.66855764389038, "lr": 0.0009901664203302125, "tok_per_sec": 2738.185267024283, "grad_norm": 1.9499105215072632, "tokens_seen": 14336, "elapsed_sec": 0.7479406250058673, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 10 |
+
{"event": "train", "timestamp": "2026-03-13T14:09:17", "step": 8, "loss": 9.335453271865845, "lr": 0.0009610954559391703, "tok_per_sec": 2751.8910854624123, "grad_norm": 1.7210659980773926, "tokens_seen": 16384, "elapsed_sec": 0.7442154999589548, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 11 |
+
{"event": "train", "timestamp": "2026-03-13T14:09:18", "step": 9, "loss": 9.267512798309326, "lr": 0.0009140576474687263, "tok_per_sec": 2708.5891084687337, "grad_norm": 1.762829065322876, "tokens_seen": 18432, "elapsed_sec": 0.7561132080154493, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 12 |
+
{"event": "train", "timestamp": "2026-03-13T14:09:19", "step": 10, "loss": 8.833673238754272, "lr": 0.0008511087728614862, "tok_per_sec": 2765.4777296002535, "grad_norm": 1.700391173362732, "tokens_seen": 20480, "elapsed_sec": 0.7405592090217397, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 13 |
+
{"event": "eval", "timestamp": "2026-03-13T14:09:19", "step": 10, "val_loss": 9.096094608306885, "perplexity": 8920.386982370957, "eval_batches": 2}
|
| 14 |
+
{"event": "checkpoint", "timestamp": "2026-03-13T14:09:33", "step": 10, "step_checkpoint": "checkpoints/pretrain_mps_dryrun/step_0000010.pt", "last_checkpoint": "checkpoints/pretrain_mps_dryrun/last.pt", "tokens_seen": 20480}
|
| 15 |
+
{"event": "train", "timestamp": "2026-03-13T14:09:34", "step": 11, "loss": 8.795855522155762, "lr": 0.0007750000000000001, "tok_per_sec": 135.3521374189279, "grad_norm": 1.4899625778198242, "tokens_seen": 22528, "elapsed_sec": 15.130902540986426, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 16 |
+
{"event": "train", "timestamp": "2026-03-13T14:09:35", "step": 12, "loss": 8.558577060699463, "lr": 0.0006890576474687264, "tok_per_sec": 2659.660815260197, "grad_norm": 1.5879555940628052, "tokens_seen": 24576, "elapsed_sec": 0.7700230000191368, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 17 |
+
{"event": "train", "timestamp": "2026-03-13T14:09:36", "step": 13, "loss": 8.595118284225464, "lr": 0.0005970378084704442, "tok_per_sec": 2709.453151326185, "grad_norm": 1.3136154413223267, "tokens_seen": 26624, "elapsed_sec": 0.7558720840024762, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 18 |
+
{"event": "train", "timestamp": "2026-03-13T14:09:36", "step": 14, "loss": 8.341074705123901, "lr": 0.000502962191529556, "tok_per_sec": 2569.4064364370934, "grad_norm": 1.2977045774459839, "tokens_seen": 28672, "elapsed_sec": 0.7970712499809451, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 19 |
+
{"event": "train", "timestamp": "2026-03-13T14:09:37", "step": 15, "loss": 8.268006086349487, "lr": 0.0004109423525312737, "tok_per_sec": 2272.894269158833, "grad_norm": 1.197304368019104, "tokens_seen": 30720, "elapsed_sec": 0.9010537919821218, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 20 |
+
{"event": "train", "timestamp": "2026-03-13T14:09:38", "step": 16, "loss": 8.30242395401001, "lr": 0.0003250000000000001, "tok_per_sec": 2636.6892030166064, "grad_norm": 1.1259286403656006, "tokens_seen": 32768, "elapsed_sec": 0.7767316669924185, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 21 |
+
{"event": "train", "timestamp": "2026-03-13T14:09:39", "step": 17, "loss": 8.6144118309021, "lr": 0.00024889122713851394, "tok_per_sec": 2628.5409836443887, "grad_norm": 0.9170812368392944, "tokens_seen": 34816, "elapsed_sec": 0.7791394590167329, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 22 |
+
{"event": "train", "timestamp": "2026-03-13T14:09:40", "step": 18, "loss": 8.439870119094849, "lr": 0.00018594235253127368, "tok_per_sec": 2699.6840414438493, "grad_norm": 1.6393400430679321, "tokens_seen": 36864, "elapsed_sec": 0.758607292023953, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 23 |
+
{"event": "train", "timestamp": "2026-03-13T14:09:40", "step": 19, "loss": 7.915311574935913, "lr": 0.00013890454406082956, "tok_per_sec": 2709.5957985032933, "grad_norm": 1.111694097518921, "tokens_seen": 38912, "elapsed_sec": 0.755832290975377, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 24 |
+
{"event": "train", "timestamp": "2026-03-13T14:09:41", "step": 20, "loss": 7.964773654937744, "lr": 0.00010983357966978745, "tok_per_sec": 2689.7526879403435, "grad_norm": 1.00663423538208, "tokens_seen": 40960, "elapsed_sec": 0.7614082919899374, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 25 |
+
{"event": "eval", "timestamp": "2026-03-13T14:09:41", "step": 20, "val_loss": 8.758275032043457, "perplexity": 6363.125917448135, "eval_batches": 2}
|
| 26 |
+
{"event": "checkpoint", "timestamp": "2026-03-13T14:09:54", "step": 20, "step_checkpoint": "checkpoints/pretrain_mps_dryrun/step_0000020.pt", "last_checkpoint": "checkpoints/pretrain_mps_dryrun/last.pt", "tokens_seen": 40960}
|
| 27 |
+
{"event": "run_finished", "timestamp": "2026-03-13T14:09:54", "final_step": 20, "tokens_seen": 40960}
|
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140907.log
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-03-13 14:09:07,112 | INFO | Pretraining started
|
| 2 |
+
2026-03-13 14:09:07,112 | INFO | Log file: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140907.log
|
| 3 |
+
2026-03-13 14:09:07,112 | INFO | Metrics JSONL: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_140907.jsonl
|
| 4 |
+
2026-03-13 14:09:07,112 | INFO | Arguments | model_config=configs/model_70m.json train_config=configs/pretrain_mps_dryrun.json max_steps_override=None
|
| 5 |
+
2026-03-13 14:09:07,112 | INFO | Model config | {'vocab_size': 49152, 'max_seq_len': 8192, 'd_model': 384, 'n_layers': 32, 'n_heads': 6, 'ffn_hidden_dim': 1024, 'rope_theta': 10000.0, 'rms_norm_eps': 1e-05, 'initializer_range': 0.02, 'dropout': 0.0, 'tie_word_embeddings': True, 'bias': False, 'pad_token_id': 0, 'bos_token_id': 1, 'eos_token_id': 2}
|
| 6 |
+
2026-03-13 14:09:07,112 | INFO | Train config | {'seed': 42, 'train_dir': 'data/pretokenized/train', 'val_dir': 'data/pretokenized/val', 'output_dir': 'outputs/pretrain_mps_dryrun', 'checkpoint_dir': 'checkpoints/pretrain_mps_dryrun', 'init_from': None, 'resume_from': None, 'seq_len': 512, 'micro_batch_size': 1, 'grad_accum_steps': 4, 'max_steps': 20, 'warmup_steps': 5, 'learning_rate': 0.001, 'min_lr': 0.0001, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'precision': 'fp32', 'num_workers': 0, 'log_interval': 1, 'eval_interval': 10, 'eval_batches': 2, 'save_interval': 10, 'compile_model': False}
|
| 7 |
+
2026-03-13 14:09:10,064 | INFO | Device summary | device=mps precision=fp32 compile_model=False
|
| 8 |
+
2026-03-13 14:09:10,065 | INFO | Model summary | parameters=75.57M
|
| 9 |
+
2026-03-13 14:09:10,065 | INFO | Batch summary | seq_len=512 micro_batch_size=1 grad_accum_steps=4 tokens_per_step=2,048
|
| 10 |
+
2026-03-13 14:09:10,065 | INFO | Dataset summary | train_dir=data/pretokenized/train val_dir=data/pretokenized/val num_train_shards=100
|
| 11 |
+
2026-03-13 14:09:19,703 | INFO | Eval step | step=10 val_loss=9.0961 perplexity=8920.39
|
| 12 |
+
2026-03-13 14:09:33,612 | INFO | Checkpoint saved | step=10 step_checkpoint=checkpoints/pretrain_mps_dryrun/step_0000010.pt last_checkpoint=checkpoints/pretrain_mps_dryrun/last.pt
|
| 13 |
+
2026-03-13 14:09:41,833 | INFO | Eval step | step=20 val_loss=8.7583 perplexity=6363.13
|
| 14 |
+
2026-03-13 14:09:54,172 | INFO | Checkpoint saved | step=20 step_checkpoint=checkpoints/pretrain_mps_dryrun/step_0000020.pt last_checkpoint=checkpoints/pretrain_mps_dryrun/last.pt
|
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_141224.jsonl
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"event": "run_started", "timestamp": "2026-03-13T14:12:24", "log_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_141224.log", "metrics_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_141224.jsonl", "model_config": {"vocab_size": 49152, "max_seq_len": 8192, "d_model": 384, "n_layers": 32, "n_heads": 6, "ffn_hidden_dim": 1024, "rope_theta": 10000.0, "rms_norm_eps": 1e-05, "initializer_range": 0.02, "dropout": 0.0, "tie_word_embeddings": true, "bias": false, "pad_token_id": 0, "bos_token_id": 1, "eos_token_id": 2}, "train_config": {"seed": 42, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val", "output_dir": "outputs/pretrain_mps_dryrun", "checkpoint_dir": "checkpoints/pretrain_mps_dryrun", "init_from": null, "resume_from": null, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "max_steps": 20, "warmup_steps": 5, "learning_rate": 0.001, "min_lr": 0.0001, "weight_decay": 0.1, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "precision": "fp32", "num_workers": 0, "log_interval": 1, "eval_interval": 10, "eval_batches": 2, "save_interval": 10, "compile_model": false}, "args": {"model_config": "configs/model_70m.json", "train_config": "configs/pretrain_mps_dryrun.json", "max_steps_override": null}}
|
| 2 |
+
{"event": "runtime_summary", "timestamp": "2026-03-13T14:12:27", "device": "mps", "precision": "fp32", "compile_model": false, "parameters": 75571584, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "tokens_per_step": 2048, "num_train_shards": 100, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val"}
|
| 3 |
+
{"event": "train", "timestamp": "2026-03-13T14:12:28", "step": 1, "loss": 10.848917245864868, "lr": 0.0002, "tok_per_sec": 1572.3567196374945, "grad_norm": 5.573695659637451, "tokens_seen": 2048, "elapsed_sec": 1.3025034169550054, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 4 |
+
{"event": "train", "timestamp": "2026-03-13T14:12:29", "step": 2, "loss": 10.763139724731445, "lr": 0.0004, "tok_per_sec": 1953.4684072997784, "grad_norm": 8.322466850280762, "tokens_seen": 4096, "elapsed_sec": 1.0483916670200415, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 5 |
+
{"event": "train", "timestamp": "2026-03-13T14:12:30", "step": 3, "loss": 10.356749057769775, "lr": 0.0006000000000000001, "tok_per_sec": 2746.91589368826, "grad_norm": 2.6283912658691406, "tokens_seen": 6144, "elapsed_sec": 0.7455634170328267, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 6 |
+
{"event": "train", "timestamp": "2026-03-13T14:12:31", "step": 4, "loss": 10.376826524734497, "lr": 0.0008, "tok_per_sec": 2765.8319145818245, "grad_norm": 2.217130184173584, "tokens_seen": 8192, "elapsed_sec": 0.7404643750051036, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 7 |
+
{"event": "train", "timestamp": "2026-03-13T14:12:32", "step": 5, "loss": 10.231549263000488, "lr": 0.001, "tok_per_sec": 2769.4683023376706, "grad_norm": 12.645360946655273, "tokens_seen": 10240, "elapsed_sec": 0.7394921249942854, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 8 |
+
{"event": "train", "timestamp": "2026-03-13T14:12:32", "step": 6, "loss": 9.938905715942383, "lr": 0.001, "tok_per_sec": 2767.895009558497, "grad_norm": 1.7282859086990356, "tokens_seen": 12288, "elapsed_sec": 0.7399124579969794, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 9 |
+
{"event": "train", "timestamp": "2026-03-13T14:12:33", "step": 7, "loss": 9.66855764389038, "lr": 0.0009901664203302125, "tok_per_sec": 2751.478083715354, "grad_norm": 1.9499105215072632, "tokens_seen": 14336, "elapsed_sec": 0.7443272080272436, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 10 |
+
{"event": "train", "timestamp": "2026-03-13T14:12:34", "step": 8, "loss": 9.335453271865845, "lr": 0.0009610954559391703, "tok_per_sec": 2761.168994935579, "grad_norm": 1.7210659980773926, "tokens_seen": 16384, "elapsed_sec": 0.7417148330132477, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 11 |
+
{"event": "train", "timestamp": "2026-03-13T14:12:34", "step": 9, "loss": 9.267512798309326, "lr": 0.0009140576474687263, "tok_per_sec": 2775.2087673169717, "grad_norm": 1.762829065322876, "tokens_seen": 18432, "elapsed_sec": 0.737962500017602, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 12 |
+
{"event": "train", "timestamp": "2026-03-13T14:12:35", "step": 10, "loss": 8.833673000335693, "lr": 0.0008511087728614862, "tok_per_sec": 2743.4623249730666, "grad_norm": 1.700391173362732, "tokens_seen": 20480, "elapsed_sec": 0.74650195898721, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 13 |
+
{"event": "eval", "timestamp": "2026-03-13T14:12:35", "step": 10, "val_loss": 9.096094131469727, "perplexity": 8920.382728799992, "eval_batches": 2}
|
| 14 |
+
{"event": "checkpoint", "timestamp": "2026-03-13T14:12:53", "step": 10, "step_checkpoint": "checkpoints/pretrain_mps_dryrun/step_0000010.pt", "last_checkpoint": "checkpoints/pretrain_mps_dryrun/last.pt", "tokens_seen": 20480}
|
| 15 |
+
{"event": "train", "timestamp": "2026-03-13T14:12:55", "step": 11, "loss": 8.79585576057434, "lr": 0.0007750000000000001, "tok_per_sec": 106.21275007147676, "grad_norm": 1.4899623394012451, "tokens_seen": 22528, "elapsed_sec": 19.282054166018497, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 16 |
+
{"event": "train", "timestamp": "2026-03-13T14:12:55", "step": 12, "loss": 8.558577299118042, "lr": 0.0006890576474687264, "tok_per_sec": 2617.334622430896, "grad_norm": 1.5879555940628052, "tokens_seen": 24576, "elapsed_sec": 0.7824754169560038, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 17 |
+
{"event": "train", "timestamp": "2026-03-13T14:12:56", "step": 13, "loss": 8.595118045806885, "lr": 0.0005970378084704442, "tok_per_sec": 2577.6688468954194, "grad_norm": 1.3136155605316162, "tokens_seen": 26624, "elapsed_sec": 0.7945163330296054, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 18 |
+
{"event": "train", "timestamp": "2026-03-13T14:12:57", "step": 14, "loss": 8.341074705123901, "lr": 0.000502962191529556, "tok_per_sec": 2652.6070721156225, "grad_norm": 1.2977045774459839, "tokens_seen": 28672, "elapsed_sec": 0.7720706249820068, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 19 |
+
{"event": "train", "timestamp": "2026-03-13T14:12:58", "step": 15, "loss": 8.268006086349487, "lr": 0.0004109423525312737, "tok_per_sec": 2697.354489927494, "grad_norm": 1.1973044872283936, "tokens_seen": 30720, "elapsed_sec": 0.7592624579556286, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 20 |
+
{"event": "train", "timestamp": "2026-03-13T14:12:58", "step": 16, "loss": 8.302424192428589, "lr": 0.0003250000000000001, "tok_per_sec": 2578.0945986776087, "grad_norm": 1.1259286403656006, "tokens_seen": 32768, "elapsed_sec": 0.794385124987457, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 21 |
+
{"event": "train", "timestamp": "2026-03-13T14:12:59", "step": 17, "loss": 8.61441159248352, "lr": 0.00024889122713851394, "tok_per_sec": 2190.790230266318, "grad_norm": 0.9170812368392944, "tokens_seen": 34816, "elapsed_sec": 0.9348224999848753, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 22 |
+
{"event": "train", "timestamp": "2026-03-13T14:13:00", "step": 18, "loss": 8.439870119094849, "lr": 0.00018594235253127368, "tok_per_sec": 2557.2677193427544, "grad_norm": 1.6393400430679321, "tokens_seen": 36864, "elapsed_sec": 0.8008547499775887, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 23 |
+
{"event": "train", "timestamp": "2026-03-13T14:13:01", "step": 19, "loss": 7.915311813354492, "lr": 0.00013890454406082956, "tok_per_sec": 2489.488983600405, "grad_norm": 1.111694097518921, "tokens_seen": 38912, "elapsed_sec": 0.8226587920216843, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 24 |
+
{"event": "train", "timestamp": "2026-03-13T14:13:02", "step": 20, "loss": 7.964773654937744, "lr": 0.00010983357966978745, "tok_per_sec": 2517.705047649911, "grad_norm": 1.00663423538208, "tokens_seen": 40960, "elapsed_sec": 0.8134392080246471, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 25 |
+
{"event": "eval", "timestamp": "2026-03-13T14:13:02", "step": 20, "val_loss": 8.758275032043457, "perplexity": 6363.125917448135, "eval_batches": 2}
|
| 26 |
+
{"event": "checkpoint", "timestamp": "2026-03-13T14:13:17", "step": 20, "step_checkpoint": "checkpoints/pretrain_mps_dryrun/step_0000020.pt", "last_checkpoint": "checkpoints/pretrain_mps_dryrun/last.pt", "tokens_seen": 40960}
|
| 27 |
+
{"event": "run_finished", "timestamp": "2026-03-13T14:13:17", "final_step": 20, "tokens_seen": 40960}
|
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_141224.log
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-03-13 14:12:24,605 | INFO | Pretraining started
|
| 2 |
+
2026-03-13 14:12:24,605 | INFO | Log file: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_141224.log
|
| 3 |
+
2026-03-13 14:12:24,605 | INFO | Metrics JSONL: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_141224.jsonl
|
| 4 |
+
2026-03-13 14:12:24,605 | INFO | Arguments | model_config=configs/model_70m.json train_config=configs/pretrain_mps_dryrun.json max_steps_override=None
|
| 5 |
+
2026-03-13 14:12:24,605 | INFO | Model config | {'vocab_size': 49152, 'max_seq_len': 8192, 'd_model': 384, 'n_layers': 32, 'n_heads': 6, 'ffn_hidden_dim': 1024, 'rope_theta': 10000.0, 'rms_norm_eps': 1e-05, 'initializer_range': 0.02, 'dropout': 0.0, 'tie_word_embeddings': True, 'bias': False, 'pad_token_id': 0, 'bos_token_id': 1, 'eos_token_id': 2}
|
| 6 |
+
2026-03-13 14:12:24,605 | INFO | Train config | {'seed': 42, 'train_dir': 'data/pretokenized/train', 'val_dir': 'data/pretokenized/val', 'output_dir': 'outputs/pretrain_mps_dryrun', 'checkpoint_dir': 'checkpoints/pretrain_mps_dryrun', 'init_from': None, 'resume_from': None, 'seq_len': 512, 'micro_batch_size': 1, 'grad_accum_steps': 4, 'max_steps': 20, 'warmup_steps': 5, 'learning_rate': 0.001, 'min_lr': 0.0001, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'precision': 'fp32', 'num_workers': 0, 'log_interval': 1, 'eval_interval': 10, 'eval_batches': 2, 'save_interval': 10, 'compile_model': False}
|
| 7 |
+
2026-03-13 14:12:27,439 | INFO | Device summary | device=mps precision=fp32 compile_model=False
|
| 8 |
+
2026-03-13 14:12:27,440 | INFO | Model summary | parameters=75.57M
|
| 9 |
+
2026-03-13 14:12:27,440 | INFO | Batch summary | seq_len=512 micro_batch_size=1 grad_accum_steps=4 tokens_per_step=2,048
|
| 10 |
+
2026-03-13 14:12:27,440 | INFO | Dataset summary | train_dir=data/pretokenized/train val_dir=data/pretokenized/val num_train_shards=100
|
| 11 |
+
2026-03-13 14:12:28,743 | INFO | Train step | step=1 loss=10.8489 lr=0.000200 tok_per_sec=1,572 grad_norm=5.5737 tokens_seen=2.05K
|
| 12 |
+
2026-03-13 14:12:29,792 | INFO | Train step | step=2 loss=10.7631 lr=0.000400 tok_per_sec=1,953 grad_norm=8.3225 tokens_seen=4.10K
|
| 13 |
+
2026-03-13 14:12:30,538 | INFO | Train step | step=3 loss=10.3567 lr=0.000600 tok_per_sec=2,747 grad_norm=2.6284 tokens_seen=6.14K
|
| 14 |
+
2026-03-13 14:12:31,280 | INFO | Train step | step=4 loss=10.3768 lr=0.000800 tok_per_sec=2,766 grad_norm=2.2171 tokens_seen=8.19K
|
| 15 |
+
2026-03-13 14:12:32,020 | INFO | Train step | step=5 loss=10.2315 lr=0.001000 tok_per_sec=2,769 grad_norm=12.6454 tokens_seen=10.24K
|
| 16 |
+
2026-03-13 14:12:32,760 | INFO | Train step | step=6 loss=9.9389 lr=0.001000 tok_per_sec=2,768 grad_norm=1.7283 tokens_seen=12.29K
|
| 17 |
+
2026-03-13 14:12:33,505 | INFO | Train step | step=7 loss=9.6686 lr=0.000990 tok_per_sec=2,751 grad_norm=1.9499 tokens_seen=14.34K
|
| 18 |
+
2026-03-13 14:12:34,247 | INFO | Train step | step=8 loss=9.3355 lr=0.000961 tok_per_sec=2,761 grad_norm=1.7211 tokens_seen=16.38K
|
| 19 |
+
2026-03-13 14:12:34,986 | INFO | Train step | step=9 loss=9.2675 lr=0.000914 tok_per_sec=2,775 grad_norm=1.7628 tokens_seen=18.43K
|
| 20 |
+
2026-03-13 14:12:35,733 | INFO | Train step | step=10 loss=8.8337 lr=0.000851 tok_per_sec=2,743 grad_norm=1.7004 tokens_seen=20.48K
|
| 21 |
+
2026-03-13 14:12:35,903 | INFO | Eval step | step=10 val_loss=9.0961 perplexity=8920.38
|
| 22 |
+
2026-03-13 14:12:53,990 | INFO | Checkpoint saved | step=10 step_checkpoint=checkpoints/pretrain_mps_dryrun/step_0000010.pt last_checkpoint=checkpoints/pretrain_mps_dryrun/last.pt
|
| 23 |
+
2026-03-13 14:12:55,016 | INFO | Train step | step=11 loss=8.7959 lr=0.000775 tok_per_sec=106 grad_norm=1.4900 tokens_seen=22.53K
|
| 24 |
+
2026-03-13 14:12:55,798 | INFO | Train step | step=12 loss=8.5586 lr=0.000689 tok_per_sec=2,617 grad_norm=1.5880 tokens_seen=24.58K
|
| 25 |
+
2026-03-13 14:12:56,593 | INFO | Train step | step=13 loss=8.5951 lr=0.000597 tok_per_sec=2,578 grad_norm=1.3136 tokens_seen=26.62K
|
| 26 |
+
2026-03-13 14:12:57,366 | INFO | Train step | step=14 loss=8.3411 lr=0.000503 tok_per_sec=2,653 grad_norm=1.2977 tokens_seen=28.67K
|
| 27 |
+
2026-03-13 14:12:58,126 | INFO | Train step | step=15 loss=8.2680 lr=0.000411 tok_per_sec=2,697 grad_norm=1.1973 tokens_seen=30.72K
|
| 28 |
+
2026-03-13 14:12:58,921 | INFO | Train step | step=16 loss=8.3024 lr=0.000325 tok_per_sec=2,578 grad_norm=1.1259 tokens_seen=32.77K
|
| 29 |
+
2026-03-13 14:12:59,857 | INFO | Train step | step=17 loss=8.6144 lr=0.000249 tok_per_sec=2,191 grad_norm=0.9171 tokens_seen=34.82K
|
| 30 |
+
2026-03-13 14:13:00,660 | INFO | Train step | step=18 loss=8.4399 lr=0.000186 tok_per_sec=2,557 grad_norm=1.6393 tokens_seen=36.86K
|
| 31 |
+
2026-03-13 14:13:01,483 | INFO | Train step | step=19 loss=7.9153 lr=0.000139 tok_per_sec=2,489 grad_norm=1.1117 tokens_seen=38.91K
|
| 32 |
+
2026-03-13 14:13:02,297 | INFO | Train step | step=20 loss=7.9648 lr=0.000110 tok_per_sec=2,518 grad_norm=1.0066 tokens_seen=40.96K
|
| 33 |
+
2026-03-13 14:13:02,479 | INFO | Eval step | step=20 val_loss=8.7583 perplexity=6363.13
|
| 34 |
+
2026-03-13 14:13:17,338 | INFO | Checkpoint saved | step=20 step_checkpoint=checkpoints/pretrain_mps_dryrun/step_0000020.pt last_checkpoint=checkpoints/pretrain_mps_dryrun/last.pt
|
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142331.jsonl
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"event": "run_started", "timestamp": "2026-03-13T14:23:31", "log_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142331.log", "metrics_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142331.jsonl", "model_config": {"vocab_size": 49152, "max_seq_len": 8192, "d_model": 384, "n_layers": 32, "n_heads": 6, "ffn_hidden_dim": 1024, "rope_theta": 10000.0, "rms_norm_eps": 1e-05, "initializer_range": 0.02, "dropout": 0.0, "tie_word_embeddings": true, "bias": false, "pad_token_id": 0, "bos_token_id": 1, "eos_token_id": 2}, "train_config": {"seed": 42, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val", "output_dir": "outputs/pretrain_mps_dryrun", "checkpoint_dir": "checkpoints/pretrain_mps_dryrun", "init_from": null, "resume_from": null, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "max_steps": 20, "warmup_steps": 5, "learning_rate": 0.001, "min_lr": 0.0001, "weight_decay": 0.1, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "precision": "fp32", "num_workers": 0, "log_interval": 1, "eval_interval": 10, "eval_batches": 2, "save_interval": 10, "compile_model": false}, "args": {"model_config": "configs/model_70m.json", "train_config": "configs/pretrain_mps_dryrun.json", "max_steps_override": null}}
|
| 2 |
+
{"event": "runtime_summary", "timestamp": "2026-03-13T14:23:34", "device": "mps", "precision": "fp32", "compile_model": false, "parameters": 75571584, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "tokens_per_step": 2048, "num_train_shards": 100, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val"}
|
| 3 |
+
{"event": "train", "timestamp": "2026-03-13T14:23:36", "step": 1, "loss": 10.848917245864868, "lr": 0.0002, "tok_per_sec": 1528.4547833159693, "grad_norm": 5.573695659637451, "tokens_seen": 2048, "elapsed_sec": 1.3399153330246918, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 4 |
+
{"event": "train", "timestamp": "2026-03-13T14:23:37", "step": 2, "loss": 10.763139486312866, "lr": 0.0004, "tok_per_sec": 1823.3959146519999, "grad_norm": 8.322466850280762, "tokens_seen": 4096, "elapsed_sec": 1.1231789999874309, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 5 |
+
{"event": "train", "timestamp": "2026-03-13T14:23:37", "step": 3, "loss": 10.356749296188354, "lr": 0.0006000000000000001, "tok_per_sec": 2760.206551850419, "grad_norm": 2.6283912658691406, "tokens_seen": 6144, "elapsed_sec": 0.7419734579743817, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 6 |
+
{"event": "train", "timestamp": "2026-03-13T14:23:38", "step": 4, "loss": 10.376826524734497, "lr": 0.0008, "tok_per_sec": 2758.5098402671138, "grad_norm": 2.217130184173584, "tokens_seen": 8192, "elapsed_sec": 0.7424298329860903, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 7 |
+
{"event": "train", "timestamp": "2026-03-13T14:23:39", "step": 5, "loss": 10.231549263000488, "lr": 0.001, "tok_per_sec": 2743.529400521807, "grad_norm": 12.645355224609375, "tokens_seen": 10240, "elapsed_sec": 0.7464837080333382, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 8 |
+
{"event": "train", "timestamp": "2026-03-13T14:23:40", "step": 6, "loss": 9.938905715942383, "lr": 0.001, "tok_per_sec": 2787.348798075642, "grad_norm": 1.7282867431640625, "tokens_seen": 12288, "elapsed_sec": 0.7347483750199899, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 9 |
+
{"event": "train", "timestamp": "2026-03-13T14:23:40", "step": 7, "loss": 9.66855764389038, "lr": 0.0009901664203302125, "tok_per_sec": 2768.0064610428144, "grad_norm": 1.9499105215072632, "tokens_seen": 14336, "elapsed_sec": 0.7398826660355553, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 10 |
+
{"event": "train", "timestamp": "2026-03-13T14:23:41", "step": 8, "loss": 9.335453271865845, "lr": 0.0009610954559391703, "tok_per_sec": 2765.3453249466884, "grad_norm": 1.7210659980773926, "tokens_seen": 16384, "elapsed_sec": 0.7405946669750847, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 11 |
+
{"event": "train", "timestamp": "2026-03-13T14:23:42", "step": 9, "loss": 9.267512559890747, "lr": 0.0009140576474687263, "tok_per_sec": 2776.3259680860633, "grad_norm": 1.762829065322876, "tokens_seen": 18432, "elapsed_sec": 0.7376655419939198, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 12 |
+
{"event": "train", "timestamp": "2026-03-13T14:23:43", "step": 10, "loss": 8.833673000335693, "lr": 0.0008511087728614862, "tok_per_sec": 2775.777209987709, "grad_norm": 1.700391173362732, "tokens_seen": 20480, "elapsed_sec": 0.7378113750019111, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 13 |
+
{"event": "eval", "timestamp": "2026-03-13T14:23:43", "step": 10, "val_loss": 9.096094131469727, "perplexity": 8920.382728799992, "eval_batches": 2}
|
| 14 |
+
{"event": "checkpoint", "timestamp": "2026-03-13T14:24:04", "step": 10, "step_checkpoint": "checkpoints/pretrain_mps_dryrun/step_0000010.pt", "last_checkpoint": "checkpoints/pretrain_mps_dryrun/last.pt", "tokens_seen": 20480}
|
| 15 |
+
{"event": "train", "timestamp": "2026-03-13T14:24:04", "step": 11, "loss": 8.795855522155762, "lr": 0.0007750000000000001, "tok_per_sec": 93.63913751097624, "grad_norm": 1.4899623394012451, "tokens_seen": 22528, "elapsed_sec": 21.871196749969386, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 16 |
+
{"event": "train", "timestamp": "2026-03-13T14:24:05", "step": 12, "loss": 8.558577060699463, "lr": 0.0006890576474687264, "tok_per_sec": 2713.0807773384895, "grad_norm": 1.5879555940628052, "tokens_seen": 24576, "elapsed_sec": 0.7548614169936627, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 17 |
+
{"event": "train", "timestamp": "2026-03-13T14:24:06", "step": 13, "loss": 8.595118045806885, "lr": 0.0005970378084704442, "tok_per_sec": 2623.969926754092, "grad_norm": 1.3136155605316162, "tokens_seen": 26624, "elapsed_sec": 0.7804967500269413, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 18 |
+
{"event": "train", "timestamp": "2026-03-13T14:24:07", "step": 14, "loss": 8.341074705123901, "lr": 0.000502962191529556, "tok_per_sec": 2640.2210539544203, "grad_norm": 1.2977045774459839, "tokens_seen": 28672, "elapsed_sec": 0.7756926250294782, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 19 |
+
{"event": "train", "timestamp": "2026-03-13T14:24:08", "step": 15, "loss": 8.268006086349487, "lr": 0.0004109423525312737, "tok_per_sec": 2716.846221941928, "grad_norm": 1.1973044872283936, "tokens_seen": 30720, "elapsed_sec": 0.7538152080378495, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 20 |
+
{"event": "train", "timestamp": "2026-03-13T14:24:08", "step": 16, "loss": 8.302424192428589, "lr": 0.0003250000000000001, "tok_per_sec": 2732.9290763944346, "grad_norm": 1.1259286403656006, "tokens_seen": 32768, "elapsed_sec": 0.7493791250162758, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 21 |
+
{"event": "train", "timestamp": "2026-03-13T14:24:09", "step": 17, "loss": 8.6144118309021, "lr": 0.00024889122713851394, "tok_per_sec": 2724.595059218572, "grad_norm": 0.9170812368392944, "tokens_seen": 34816, "elapsed_sec": 0.7516713329823688, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 22 |
+
{"event": "train", "timestamp": "2026-03-13T14:24:10", "step": 18, "loss": 8.439870119094849, "lr": 0.00018594235253127368, "tok_per_sec": 2735.215026577713, "grad_norm": 1.6393400430679321, "tokens_seen": 36864, "elapsed_sec": 0.7487528329947963, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 23 |
+
{"event": "train", "timestamp": "2026-03-13T14:24:11", "step": 19, "loss": 7.915311813354492, "lr": 0.00013890454406082956, "tok_per_sec": 2730.9775428733506, "grad_norm": 1.111694097518921, "tokens_seen": 38912, "elapsed_sec": 0.7499146250193007, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 24 |
+
{"event": "train", "timestamp": "2026-03-13T14:24:11", "step": 20, "loss": 7.964773654937744, "lr": 0.00010983357966978745, "tok_per_sec": 2738.180386915235, "grad_norm": 1.00663423538208, "tokens_seen": 40960, "elapsed_sec": 0.7479419580195099, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 25 |
+
{"event": "eval", "timestamp": "2026-03-13T14:24:11", "step": 20, "val_loss": 8.758275032043457, "perplexity": 6363.125917448135, "eval_batches": 2}
|
| 26 |
+
{"event": "checkpoint", "timestamp": "2026-03-13T14:24:24", "step": 20, "step_checkpoint": "checkpoints/pretrain_mps_dryrun/step_0000020.pt", "last_checkpoint": "checkpoints/pretrain_mps_dryrun/last.pt", "tokens_seen": 40960}
|
| 27 |
+
{"event": "run_finished", "timestamp": "2026-03-13T14:24:24", "final_step": 20, "tokens_seen": 40960}
|
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142331.log
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-03-13 14:23:31,892 | INFO | Pretraining started
|
| 2 |
+
2026-03-13 14:23:31,892 | INFO | Log file: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142331.log
|
| 3 |
+
2026-03-13 14:23:31,892 | INFO | Metrics JSONL: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142331.jsonl
|
| 4 |
+
2026-03-13 14:23:31,892 | INFO | Arguments | model_config=configs/model_70m.json train_config=configs/pretrain_mps_dryrun.json max_steps_override=None
|
| 5 |
+
2026-03-13 14:23:31,892 | INFO | Model config | {'vocab_size': 49152, 'max_seq_len': 8192, 'd_model': 384, 'n_layers': 32, 'n_heads': 6, 'ffn_hidden_dim': 1024, 'rope_theta': 10000.0, 'rms_norm_eps': 1e-05, 'initializer_range': 0.02, 'dropout': 0.0, 'tie_word_embeddings': True, 'bias': False, 'pad_token_id': 0, 'bos_token_id': 1, 'eos_token_id': 2}
|
| 6 |
+
2026-03-13 14:23:31,892 | INFO | Train config | {'seed': 42, 'train_dir': 'data/pretokenized/train', 'val_dir': 'data/pretokenized/val', 'output_dir': 'outputs/pretrain_mps_dryrun', 'checkpoint_dir': 'checkpoints/pretrain_mps_dryrun', 'init_from': None, 'resume_from': None, 'seq_len': 512, 'micro_batch_size': 1, 'grad_accum_steps': 4, 'max_steps': 20, 'warmup_steps': 5, 'learning_rate': 0.001, 'min_lr': 0.0001, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'precision': 'fp32', 'num_workers': 0, 'log_interval': 1, 'eval_interval': 10, 'eval_batches': 2, 'save_interval': 10, 'compile_model': False}
|
| 7 |
+
2026-03-13 14:23:34,726 | INFO | Device summary | device=mps precision=fp32 compile_model=False
|
| 8 |
+
2026-03-13 14:23:34,727 | INFO | Model summary | parameters=75.57M
|
| 9 |
+
2026-03-13 14:23:34,727 | INFO | Batch summary | seq_len=512 micro_batch_size=1 grad_accum_steps=4 tokens_per_step=2,048
|
| 10 |
+
2026-03-13 14:23:34,727 | INFO | Dataset summary | train_dir=data/pretokenized/train val_dir=data/pretokenized/val num_train_shards=100
|
| 11 |
+
2026-03-13 14:23:36,068 | INFO | Train step | step=1 loss=10.8489 lr=0.000200 tok_per_sec=1,528 grad_norm=5.5737 tokens_seen=2.05K
|
| 12 |
+
2026-03-13 14:23:37,192 | INFO | Train step | step=2 loss=10.7631 lr=0.000400 tok_per_sec=1,823 grad_norm=8.3225 tokens_seen=4.10K
|
| 13 |
+
2026-03-13 14:23:37,934 | INFO | Train step | step=3 loss=10.3567 lr=0.000600 tok_per_sec=2,760 grad_norm=2.6284 tokens_seen=6.14K
|
| 14 |
+
2026-03-13 14:23:38,678 | INFO | Train step | step=4 loss=10.3768 lr=0.000800 tok_per_sec=2,759 grad_norm=2.2171 tokens_seen=8.19K
|
| 15 |
+
2026-03-13 14:23:39,425 | INFO | Train step | step=5 loss=10.2315 lr=0.001000 tok_per_sec=2,744 grad_norm=12.6454 tokens_seen=10.24K
|
| 16 |
+
2026-03-13 14:23:40,160 | INFO | Train step | step=6 loss=9.9389 lr=0.001000 tok_per_sec=2,787 grad_norm=1.7283 tokens_seen=12.29K
|
| 17 |
+
2026-03-13 14:23:40,900 | INFO | Train step | step=7 loss=9.6686 lr=0.000990 tok_per_sec=2,768 grad_norm=1.9499 tokens_seen=14.34K
|
| 18 |
+
2026-03-13 14:23:41,641 | INFO | Train step | step=8 loss=9.3355 lr=0.000961 tok_per_sec=2,765 grad_norm=1.7211 tokens_seen=16.38K
|
| 19 |
+
2026-03-13 14:23:42,380 | INFO | Train step | step=9 loss=9.2675 lr=0.000914 tok_per_sec=2,776 grad_norm=1.7628 tokens_seen=18.43K
|
| 20 |
+
2026-03-13 14:23:43,118 | INFO | Train step | step=10 loss=8.8337 lr=0.000851 tok_per_sec=2,776 grad_norm=1.7004 tokens_seen=20.48K
|
| 21 |
+
2026-03-13 14:23:43,296 | INFO | Eval step | step=10 val_loss=9.0961 perplexity=8920.38
|
| 22 |
+
2026-03-13 14:24:04,120 | INFO | Checkpoint saved | step=10 step_checkpoint=checkpoints/pretrain_mps_dryrun/step_0000010.pt last_checkpoint=checkpoints/pretrain_mps_dryrun/last.pt
|
| 23 |
+
2026-03-13 14:24:04,990 | INFO | Train step | step=11 loss=8.7959 lr=0.000775 tok_per_sec=94 grad_norm=1.4900 tokens_seen=22.53K
|
| 24 |
+
2026-03-13 14:24:05,745 | INFO | Train step | step=12 loss=8.5586 lr=0.000689 tok_per_sec=2,713 grad_norm=1.5880 tokens_seen=24.58K
|
| 25 |
+
2026-03-13 14:24:06,526 | INFO | Train step | step=13 loss=8.5951 lr=0.000597 tok_per_sec=2,624 grad_norm=1.3136 tokens_seen=26.62K
|
| 26 |
+
2026-03-13 14:24:07,302 | INFO | Train step | step=14 loss=8.3411 lr=0.000503 tok_per_sec=2,640 grad_norm=1.2977 tokens_seen=28.67K
|
| 27 |
+
2026-03-13 14:24:08,057 | INFO | Train step | step=15 loss=8.2680 lr=0.000411 tok_per_sec=2,717 grad_norm=1.1973 tokens_seen=30.72K
|
| 28 |
+
2026-03-13 14:24:08,806 | INFO | Train step | step=16 loss=8.3024 lr=0.000325 tok_per_sec=2,733 grad_norm=1.1259 tokens_seen=32.77K
|
| 29 |
+
2026-03-13 14:24:09,559 | INFO | Train step | step=17 loss=8.6144 lr=0.000249 tok_per_sec=2,725 grad_norm=0.9171 tokens_seen=34.82K
|
| 30 |
+
2026-03-13 14:24:10,308 | INFO | Train step | step=18 loss=8.4399 lr=0.000186 tok_per_sec=2,735 grad_norm=1.6393 tokens_seen=36.86K
|
| 31 |
+
2026-03-13 14:24:11,058 | INFO | Train step | step=19 loss=7.9153 lr=0.000139 tok_per_sec=2,731 grad_norm=1.1117 tokens_seen=38.91K
|
| 32 |
+
2026-03-13 14:24:11,807 | INFO | Train step | step=20 loss=7.9648 lr=0.000110 tok_per_sec=2,738 grad_norm=1.0066 tokens_seen=40.96K
|
| 33 |
+
2026-03-13 14:24:11,966 | INFO | Eval step | step=20 val_loss=8.7583 perplexity=6363.13
|
| 34 |
+
2026-03-13 14:24:24,399 | INFO | Checkpoint saved | step=20 step_checkpoint=checkpoints/pretrain_mps_dryrun/step_0000020.pt last_checkpoint=checkpoints/pretrain_mps_dryrun/last.pt
|
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142530.jsonl
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"event": "run_started", "timestamp": "2026-03-13T14:25:30", "log_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142530.log", "metrics_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142530.jsonl", "model_config": {"vocab_size": 49152, "max_seq_len": 8192, "d_model": 384, "n_layers": 32, "n_heads": 6, "ffn_hidden_dim": 1024, "rope_theta": 10000.0, "rms_norm_eps": 1e-05, "initializer_range": 0.02, "dropout": 0.0, "tie_word_embeddings": true, "bias": false, "pad_token_id": 0, "bos_token_id": 1, "eos_token_id": 2}, "train_config": {"seed": 42, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val", "output_dir": "outputs/pretrain_mps_dryrun", "checkpoint_dir": "checkpoints/pretrain_mps_dryrun", "init_from": null, "resume_from": null, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "max_steps": 20, "warmup_steps": 5, "learning_rate": 0.001, "min_lr": 0.0001, "weight_decay": 0.1, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "precision": "fp32", "num_workers": 0, "log_interval": 1, "eval_interval": 10, "eval_batches": 2, "save_interval": 10, "compile_model": false}, "args": {"model_config": "configs/model_70m.json", "train_config": "configs/pretrain_mps_dryrun.json", "max_steps_override": null}}
|
| 2 |
+
{"event": "runtime_summary", "timestamp": "2026-03-13T14:25:33", "device": "mps", "precision": "fp32", "compile_model": false, "parameters": 75571584, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "tokens_per_step": 2048, "num_train_shards": 100, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val"}
|
| 3 |
+
{"event": "train", "timestamp": "2026-03-13T14:25:35", "step": 1, "loss": 10.848917245864868, "lr": 0.0002, "tok_per_sec": 1738.338776128348, "grad_norm": 5.573695659637451, "tokens_seen": 2048, "elapsed_sec": 1.1781362920301035, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 4 |
+
{"event": "train", "timestamp": "2026-03-13T14:25:35", "step": 2, "loss": 10.763139486312866, "lr": 0.0004, "tok_per_sec": 2202.432846935467, "grad_norm": 8.322466850280762, "tokens_seen": 4096, "elapsed_sec": 0.9298807919840328, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 5 |
+
{"event": "train", "timestamp": "2026-03-13T14:25:36", "step": 3, "loss": 10.356749296188354, "lr": 0.0006000000000000001, "tok_per_sec": 2756.6912099758943, "grad_norm": 2.6283912658691406, "tokens_seen": 6144, "elapsed_sec": 0.7429196250159293, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 6 |
+
{"event": "train", "timestamp": "2026-03-13T14:25:37", "step": 4, "loss": 10.376826524734497, "lr": 0.0008, "tok_per_sec": 2738.8579811402797, "grad_norm": 2.217130184173584, "tokens_seen": 8192, "elapsed_sec": 0.7477569169714116, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 7 |
+
{"event": "train", "timestamp": "2026-03-13T14:25:38", "step": 5, "loss": 10.231549263000488, "lr": 0.001, "tok_per_sec": 2710.2101486388783, "grad_norm": 12.645347595214844, "tokens_seen": 10240, "elapsed_sec": 0.755660958995577, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 8 |
+
{"event": "train", "timestamp": "2026-03-13T14:25:39", "step": 6, "loss": 9.938905477523804, "lr": 0.001, "tok_per_sec": 2643.8368942648644, "grad_norm": 1.7282862663269043, "tokens_seen": 12288, "elapsed_sec": 0.7746317499550059, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 9 |
+
{"event": "train", "timestamp": "2026-03-13T14:25:39", "step": 7, "loss": 9.66855764389038, "lr": 0.0009901664203302125, "tok_per_sec": 2696.598737188916, "grad_norm": 1.9499105215072632, "tokens_seen": 14336, "elapsed_sec": 0.7594752499717288, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 10 |
+
{"event": "train", "timestamp": "2026-03-13T14:25:40", "step": 8, "loss": 9.335453271865845, "lr": 0.0009610954559391703, "tok_per_sec": 2697.2926156977223, "grad_norm": 1.7210659980773926, "tokens_seen": 16384, "elapsed_sec": 0.7592798749683425, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 11 |
+
{"event": "train", "timestamp": "2026-03-13T14:25:41", "step": 9, "loss": 9.267513036727905, "lr": 0.0009140576474687263, "tok_per_sec": 2700.6039931851096, "grad_norm": 1.762829065322876, "tokens_seen": 18432, "elapsed_sec": 0.7583488749805838, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 12 |
+
{"event": "train", "timestamp": "2026-03-13T14:25:42", "step": 10, "loss": 8.833672761917114, "lr": 0.0008511087728614862, "tok_per_sec": 2564.5051143205906, "grad_norm": 1.700391173362732, "tokens_seen": 20480, "elapsed_sec": 0.7985946249682456, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 13 |
+
{"event": "eval", "timestamp": "2026-03-13T14:25:42", "step": 10, "val_loss": 9.096094131469727, "perplexity": 8920.382728799992, "eval_batches": 2}
|
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142530.log
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-03-13 14:25:30,936 | INFO | Pretraining started
|
| 2 |
+
2026-03-13 14:25:30,936 | INFO | Log file: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142530.log
|
| 3 |
+
2026-03-13 14:25:30,936 | INFO | Metrics JSONL: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142530.jsonl
|
| 4 |
+
2026-03-13 14:25:30,936 | INFO | Arguments | model_config=configs/model_70m.json train_config=configs/pretrain_mps_dryrun.json max_steps_override=None
|
| 5 |
+
2026-03-13 14:25:30,936 | INFO | Model config | {'vocab_size': 49152, 'max_seq_len': 8192, 'd_model': 384, 'n_layers': 32, 'n_heads': 6, 'ffn_hidden_dim': 1024, 'rope_theta': 10000.0, 'rms_norm_eps': 1e-05, 'initializer_range': 0.02, 'dropout': 0.0, 'tie_word_embeddings': True, 'bias': False, 'pad_token_id': 0, 'bos_token_id': 1, 'eos_token_id': 2}
|
| 6 |
+
2026-03-13 14:25:30,937 | INFO | Train config | {'seed': 42, 'train_dir': 'data/pretokenized/train', 'val_dir': 'data/pretokenized/val', 'output_dir': 'outputs/pretrain_mps_dryrun', 'checkpoint_dir': 'checkpoints/pretrain_mps_dryrun', 'init_from': None, 'resume_from': None, 'seq_len': 512, 'micro_batch_size': 1, 'grad_accum_steps': 4, 'max_steps': 20, 'warmup_steps': 5, 'learning_rate': 0.001, 'min_lr': 0.0001, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'precision': 'fp32', 'num_workers': 0, 'log_interval': 1, 'eval_interval': 10, 'eval_batches': 2, 'save_interval': 10, 'compile_model': False}
|
| 7 |
+
2026-03-13 14:25:33,870 | INFO | Device summary | device=mps precision=fp32 compile_model=False
|
| 8 |
+
2026-03-13 14:25:33,870 | INFO | Model summary | parameters=75.57M
|
| 9 |
+
2026-03-13 14:25:33,871 | INFO | Batch summary | seq_len=512 micro_batch_size=1 grad_accum_steps=4 tokens_per_step=2,048
|
| 10 |
+
2026-03-13 14:25:33,871 | INFO | Dataset summary | train_dir=data/pretokenized/train val_dir=data/pretokenized/val num_train_shards=100
|
| 11 |
+
2026-03-13 14:25:35,050 | INFO | Train step | step=1 loss=10.8489 lr=0.000200 tok_per_sec=1,738 grad_norm=5.5737 tokens_seen=2.05K
|
| 12 |
+
2026-03-13 14:25:35,980 | INFO | Train step | step=2 loss=10.7631 lr=0.000400 tok_per_sec=2,202 grad_norm=8.3225 tokens_seen=4.10K
|
| 13 |
+
2026-03-13 14:25:36,724 | INFO | Train step | step=3 loss=10.3567 lr=0.000600 tok_per_sec=2,757 grad_norm=2.6284 tokens_seen=6.14K
|
| 14 |
+
2026-03-13 14:25:37,472 | INFO | Train step | step=4 loss=10.3768 lr=0.000800 tok_per_sec=2,739 grad_norm=2.2171 tokens_seen=8.19K
|
| 15 |
+
2026-03-13 14:25:38,228 | INFO | Train step | step=5 loss=10.2315 lr=0.001000 tok_per_sec=2,710 grad_norm=12.6453 tokens_seen=10.24K
|
| 16 |
+
2026-03-13 14:25:39,004 | INFO | Train step | step=6 loss=9.9389 lr=0.001000 tok_per_sec=2,644 grad_norm=1.7283 tokens_seen=12.29K
|
| 17 |
+
2026-03-13 14:25:39,764 | INFO | Train step | step=7 loss=9.6686 lr=0.000990 tok_per_sec=2,697 grad_norm=1.9499 tokens_seen=14.34K
|
| 18 |
+
2026-03-13 14:25:40,524 | INFO | Train step | step=8 loss=9.3355 lr=0.000961 tok_per_sec=2,697 grad_norm=1.7211 tokens_seen=16.38K
|
| 19 |
+
2026-03-13 14:25:41,283 | INFO | Train step | step=9 loss=9.2675 lr=0.000914 tok_per_sec=2,701 grad_norm=1.7628 tokens_seen=18.43K
|
| 20 |
+
2026-03-13 14:25:42,082 | INFO | Train step | step=10 loss=8.8337 lr=0.000851 tok_per_sec=2,565 grad_norm=1.7004 tokens_seen=20.48K
|
| 21 |
+
2026-03-13 14:25:42,254 | INFO | Eval step | step=10 val_loss=9.0961 perplexity=8920.38
|
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142559.jsonl
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"event": "run_started", "timestamp": "2026-03-13T14:25:59", "log_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142559.log", "metrics_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142559.jsonl", "model_config": {"vocab_size": 49152, "max_seq_len": 8192, "d_model": 384, "n_layers": 32, "n_heads": 6, "ffn_hidden_dim": 1024, "rope_theta": 10000.0, "rms_norm_eps": 1e-05, "initializer_range": 0.02, "dropout": 0.0, "tie_word_embeddings": true, "bias": false, "pad_token_id": 0, "bos_token_id": 1, "eos_token_id": 2}, "train_config": {"seed": 42, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val", "output_dir": "outputs/pretrain_mps_dryrun", "checkpoint_dir": "checkpoints/pretrain_mps_dryrun", "init_from": null, "resume_from": null, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "max_steps": 500, "warmup_steps": 50, "learning_rate": 0.001, "min_lr": 0.0001, "weight_decay": 0.1, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "precision": "fp32", "num_workers": 0, "log_interval": 1, "eval_interval": 10, "eval_batches": 2, "save_interval": 10, "compile_model": false}, "args": {"model_config": "configs/model_70m.json", "train_config": "configs/pretrain_mps_dryrun.json", "max_steps_override": null}}
|
| 2 |
+
{"event": "runtime_summary", "timestamp": "2026-03-13T14:26:03", "device": "mps", "precision": "fp32", "compile_model": false, "parameters": 75571584, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "tokens_per_step": 2048, "num_train_shards": 100, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val"}
|
| 3 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:04", "step": 1, "loss": 10.848917245864868, "lr": 2e-05, "tok_per_sec": 1704.8695679494026, "grad_norm": 5.573695659637451, "tokens_seen": 2048, "elapsed_sec": 1.2012649169773795, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 4 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:05", "step": 2, "loss": 10.897652626037598, "lr": 4e-05, "tok_per_sec": 2132.5760089557198, "grad_norm": 5.0279011726379395, "tokens_seen": 4096, "elapsed_sec": 0.9603409169940278, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 5 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:05", "step": 3, "loss": 10.785077571868896, "lr": 6e-05, "tok_per_sec": 2721.481065658625, "grad_norm": 5.114167213439941, "tokens_seen": 6144, "elapsed_sec": 0.7525314160156995, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 6 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:06", "step": 4, "loss": 10.634832620620728, "lr": 8e-05, "tok_per_sec": 2755.0287727887576, "grad_norm": 6.422860622406006, "tokens_seen": 8192, "elapsed_sec": 0.7433679169625975, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 7 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:07", "step": 5, "loss": 10.747493743896484, "lr": 0.0001, "tok_per_sec": 2752.2115918295694, "grad_norm": 6.580272197723389, "tokens_seen": 10240, "elapsed_sec": 0.7441288330010138, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 8 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:08", "step": 6, "loss": 10.633646488189697, "lr": 0.00012, "tok_per_sec": 2738.0719353969716, "grad_norm": 6.525123119354248, "tokens_seen": 12288, "elapsed_sec": 0.7479715830413625, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 9 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:08", "step": 7, "loss": 10.477944374084473, "lr": 0.00014000000000000001, "tok_per_sec": 2749.743315053932, "grad_norm": 5.189582824707031, "tokens_seen": 14336, "elapsed_sec": 0.7447967920452356, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 10 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:09", "step": 8, "loss": 10.35365605354309, "lr": 0.00016, "tok_per_sec": 2753.8837425710026, "grad_norm": 2.357203960418701, "tokens_seen": 16384, "elapsed_sec": 0.743676999991294, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 11 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:10", "step": 9, "loss": 10.339627742767334, "lr": 0.00018, "tok_per_sec": 2752.9484182072647, "grad_norm": 4.98753547668457, "tokens_seen": 18432, "elapsed_sec": 0.7439296669908799, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 12 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:11", "step": 10, "loss": 10.207262754440308, "lr": 0.0002, "tok_per_sec": 2755.2380338330304, "grad_norm": 5.554019927978516, "tokens_seen": 20480, "elapsed_sec": 0.7433114579762332, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 13 |
+
{"event": "eval", "timestamp": "2026-03-13T14:26:11", "step": 10, "val_loss": 10.418237686157227, "perplexity": 33464.40736092908, "eval_batches": 2}
|
| 14 |
+
{"event": "checkpoint", "timestamp": "2026-03-13T14:26:27", "step": 10, "step_checkpoint": "checkpoints/pretrain_mps_dryrun/step_0000010.pt", "last_checkpoint": "checkpoints/pretrain_mps_dryrun/last.pt", "tokens_seen": 20480}
|
| 15 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:28", "step": 11, "loss": 10.161789417266846, "lr": 0.00021999999999999998, "tok_per_sec": 119.14029052794488, "grad_norm": 2.177887201309204, "tokens_seen": 22528, "elapsed_sec": 17.189818750019185, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 16 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:29", "step": 12, "loss": 10.047882318496704, "lr": 0.00024, "tok_per_sec": 2680.171783579867, "grad_norm": 1.9737660884857178, "tokens_seen": 24576, "elapsed_sec": 0.7641301249968819, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 17 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:29", "step": 13, "loss": 10.105542421340942, "lr": 0.00026000000000000003, "tok_per_sec": 2702.3713362104313, "grad_norm": 4.096495151519775, "tokens_seen": 26624, "elapsed_sec": 0.7578529170132242, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 18 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:30", "step": 14, "loss": 9.940982818603516, "lr": 0.00028000000000000003, "tok_per_sec": 2722.8033630518785, "grad_norm": 1.8798285722732544, "tokens_seen": 28672, "elapsed_sec": 0.7521659579942934, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 19 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:31", "step": 15, "loss": 9.852107524871826, "lr": 0.0003, "tok_per_sec": 2738.483668417333, "grad_norm": 3.7492053508758545, "tokens_seen": 30720, "elapsed_sec": 0.7478591249673627, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 20 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:32", "step": 16, "loss": 9.768622398376465, "lr": 0.00032, "tok_per_sec": 2756.126999840051, "grad_norm": 1.8649290800094604, "tokens_seen": 32768, "elapsed_sec": 0.7430717090028338, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 21 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:32", "step": 17, "loss": 9.880046606063843, "lr": 0.00034, "tok_per_sec": 2762.6129070979177, "grad_norm": 1.8526010513305664, "tokens_seen": 34816, "elapsed_sec": 0.7413271670229733, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 22 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:33", "step": 18, "loss": 9.652766704559326, "lr": 0.00036, "tok_per_sec": 2762.0740561225734, "grad_norm": 2.2104318141937256, "tokens_seen": 36864, "elapsed_sec": 0.74147179198917, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 23 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:34", "step": 19, "loss": 9.371065139770508, "lr": 0.00037999999999999997, "tok_per_sec": 2746.3599790187623, "grad_norm": 2.0031697750091553, "tokens_seen": 38912, "elapsed_sec": 0.7457143330248073, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 24 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:35", "step": 20, "loss": 9.290096044540405, "lr": 0.0004, "tok_per_sec": 2753.183730427524, "grad_norm": 1.9113200902938843, "tokens_seen": 40960, "elapsed_sec": 0.7438660839688964, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 25 |
+
{"event": "eval", "timestamp": "2026-03-13T14:26:35", "step": 20, "val_loss": 9.618017196655273, "perplexity": 15033.212463302863, "eval_batches": 2}
|
| 26 |
+
{"event": "checkpoint", "timestamp": "2026-03-13T14:26:47", "step": 20, "step_checkpoint": "checkpoints/pretrain_mps_dryrun/step_0000020.pt", "last_checkpoint": "checkpoints/pretrain_mps_dryrun/last.pt", "tokens_seen": 40960}
|
| 27 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:48", "step": 21, "loss": 9.240976810455322, "lr": 0.00042, "tok_per_sec": 152.54166876251324, "grad_norm": 1.8867971897125244, "tokens_seen": 43008, "elapsed_sec": 13.425839749979787, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 28 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:49", "step": 22, "loss": 9.12671947479248, "lr": 0.00043999999999999996, "tok_per_sec": 2712.778906376721, "grad_norm": 1.8414427042007446, "tokens_seen": 45056, "elapsed_sec": 0.7549454160034657, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 29 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:50", "step": 23, "loss": 9.102352619171143, "lr": 0.00046, "tok_per_sec": 2708.941555033957, "grad_norm": 1.6314030885696411, "tokens_seen": 47104, "elapsed_sec": 0.7560148339834996, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 30 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:50", "step": 24, "loss": 8.811159133911133, "lr": 0.00048, "tok_per_sec": 2721.638538039314, "grad_norm": 1.8162541389465332, "tokens_seen": 49152, "elapsed_sec": 0.7524878749973141, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 31 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:51", "step": 25, "loss": 8.63177752494812, "lr": 0.0005, "tok_per_sec": 2726.8041484896708, "grad_norm": 1.7629377841949463, "tokens_seen": 51200, "elapsed_sec": 0.7510623750276864, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 32 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:52", "step": 26, "loss": 8.722702026367188, "lr": 0.0005200000000000001, "tok_per_sec": 2722.9481650192897, "grad_norm": 1.66167414188385, "tokens_seen": 53248, "elapsed_sec": 0.7521259590284899, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 33 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:53", "step": 27, "loss": 8.609044075012207, "lr": 0.00054, "tok_per_sec": 2753.0232007676027, "grad_norm": 1.4688063859939575, "tokens_seen": 55296, "elapsed_sec": 0.7439094590372406, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 34 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:53", "step": 28, "loss": 8.890318632125854, "lr": 0.0005600000000000001, "tok_per_sec": 2748.212475197479, "grad_norm": 1.3299572467803955, "tokens_seen": 57344, "elapsed_sec": 0.745211667031981, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 35 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:54", "step": 29, "loss": 8.266993045806885, "lr": 0.00058, "tok_per_sec": 2729.828001611049, "grad_norm": 1.4132530689239502, "tokens_seen": 59392, "elapsed_sec": 0.7502304170047864, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 36 |
+
{"event": "train", "timestamp": "2026-03-13T14:26:55", "step": 30, "loss": 8.5481858253479, "lr": 0.0006, "tok_per_sec": 2732.970411338659, "grad_norm": 1.6723191738128662, "tokens_seen": 61440, "elapsed_sec": 0.7493677909951657, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 37 |
+
{"event": "eval", "timestamp": "2026-03-13T14:26:55", "step": 30, "val_loss": 8.843989849090576, "perplexity": 6932.597552057813, "eval_batches": 2}
|
| 38 |
+
{"event": "checkpoint", "timestamp": "2026-03-13T14:27:06", "step": 30, "step_checkpoint": "checkpoints/pretrain_mps_dryrun/step_0000030.pt", "last_checkpoint": "checkpoints/pretrain_mps_dryrun/last.pt", "tokens_seen": 61440}
|
| 39 |
+
{"event": "train", "timestamp": "2026-03-13T14:27:07", "step": 31, "loss": 8.099784016609192, "lr": 0.00062, "tok_per_sec": 168.2314308371399, "grad_norm": 1.3632475137710571, "tokens_seen": 63488, "elapsed_sec": 12.173706125002354, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 40 |
+
{"event": "train", "timestamp": "2026-03-13T14:27:08", "step": 32, "loss": 8.278108835220337, "lr": 0.00064, "tok_per_sec": 2688.7908394379037, "grad_norm": 1.1354058980941772, "tokens_seen": 65536, "elapsed_sec": 0.7616806669975631, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 41 |
+
{"event": "train", "timestamp": "2026-03-13T14:27:08", "step": 33, "loss": 8.04857063293457, "lr": 0.00066, "tok_per_sec": 2695.7795378460673, "grad_norm": 0.8978219032287598, "tokens_seen": 67584, "elapsed_sec": 0.7597060409607366, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 42 |
+
{"event": "train", "timestamp": "2026-03-13T14:27:09", "step": 34, "loss": 8.70958948135376, "lr": 0.00068, "tok_per_sec": 2707.39823225129, "grad_norm": 1.751259446144104, "tokens_seen": 69632, "elapsed_sec": 0.756445791979786, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 43 |
+
{"event": "train", "timestamp": "2026-03-13T14:27:10", "step": 35, "loss": 8.077706575393677, "lr": 0.0007000000000000001, "tok_per_sec": 2689.526034181471, "grad_norm": 0.9328188896179199, "tokens_seen": 71680, "elapsed_sec": 0.761472457961645, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 44 |
+
{"event": "train", "timestamp": "2026-03-13T14:27:11", "step": 36, "loss": 8.057007431983948, "lr": 0.00072, "tok_per_sec": 2729.9771946058904, "grad_norm": 0.7004730701446533, "tokens_seen": 73728, "elapsed_sec": 0.7501894169836305, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 45 |
+
{"event": "train", "timestamp": "2026-03-13T14:27:11", "step": 37, "loss": 8.280940413475037, "lr": 0.00074, "tok_per_sec": 2712.4312913681806, "grad_norm": 0.8498008251190186, "tokens_seen": 75776, "elapsed_sec": 0.7550421669729985, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 46 |
+
{"event": "train", "timestamp": "2026-03-13T14:27:12", "step": 38, "loss": 8.420085191726685, "lr": 0.0007599999999999999, "tok_per_sec": 2728.334548153266, "grad_norm": 0.9405263662338257, "tokens_seen": 77824, "elapsed_sec": 0.7506410829955712, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 47 |
+
{"event": "train", "timestamp": "2026-03-13T14:27:13", "step": 39, "loss": 8.040002822875977, "lr": 0.00078, "tok_per_sec": 2734.281540014069, "grad_norm": 0.8642140030860901, "tokens_seen": 79872, "elapsed_sec": 0.7490084579912946, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 48 |
+
{"event": "train", "timestamp": "2026-03-13T14:27:14", "step": 40, "loss": 8.193370580673218, "lr": 0.0008, "tok_per_sec": 2746.819492491367, "grad_norm": 0.9126524329185486, "tokens_seen": 81920, "elapsed_sec": 0.7455895830062218, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 49 |
+
{"event": "eval", "timestamp": "2026-03-13T14:27:14", "step": 40, "val_loss": 8.36504077911377, "perplexity": 4294.2868516794715, "eval_batches": 2}
|
| 50 |
+
{"event": "checkpoint", "timestamp": "2026-03-13T14:27:26", "step": 40, "step_checkpoint": "checkpoints/pretrain_mps_dryrun/step_0000040.pt", "last_checkpoint": "checkpoints/pretrain_mps_dryrun/last.pt", "tokens_seen": 81920}
|
| 51 |
+
{"event": "train", "timestamp": "2026-03-13T14:27:27", "step": 41, "loss": 7.95119035243988, "lr": 0.00082, "tok_per_sec": 156.35387795373205, "grad_norm": 1.0787891149520874, "tokens_seen": 83968, "elapsed_sec": 13.098491874989122, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 52 |
+
{"event": "train", "timestamp": "2026-03-13T14:27:28", "step": 42, "loss": 7.986739635467529, "lr": 0.00084, "tok_per_sec": 2722.8155804265352, "grad_norm": 0.9517979621887207, "tokens_seen": 86016, "elapsed_sec": 0.7521625829976983, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 53 |
+
{"event": "train", "timestamp": "2026-03-13T14:27:28", "step": 43, "loss": 7.984379172325134, "lr": 0.0008600000000000001, "tok_per_sec": 2734.609061278845, "grad_norm": 0.9767814874649048, "tokens_seen": 88064, "elapsed_sec": 0.7489187500323169, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 54 |
+
{"event": "train", "timestamp": "2026-03-13T14:27:29", "step": 44, "loss": 7.784951090812683, "lr": 0.0008799999999999999, "tok_per_sec": 2551.6733064235736, "grad_norm": 0.9357463717460632, "tokens_seen": 90112, "elapsed_sec": 0.8026105829630978, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 55 |
+
{"event": "train", "timestamp": "2026-03-13T14:27:30", "step": 45, "loss": 8.117401361465454, "lr": 0.0009, "tok_per_sec": 2731.190447061073, "grad_norm": 0.7716737985610962, "tokens_seen": 92160, "elapsed_sec": 0.7498561670072377, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 56 |
+
{"event": "train", "timestamp": "2026-03-13T14:27:31", "step": 46, "loss": 8.220598220825195, "lr": 0.00092, "tok_per_sec": 2729.7315806190586, "grad_norm": 0.7731218338012695, "tokens_seen": 94208, "elapsed_sec": 0.7502569170319475, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 57 |
+
{"event": "train", "timestamp": "2026-03-13T14:27:31", "step": 47, "loss": 8.069997072219849, "lr": 0.00094, "tok_per_sec": 2718.9204771868167, "grad_norm": 1.0135213136672974, "tokens_seen": 96256, "elapsed_sec": 0.7532401249627583, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 58 |
+
{"event": "train", "timestamp": "2026-03-13T14:27:32", "step": 48, "loss": 7.909337043762207, "lr": 0.00096, "tok_per_sec": 2717.9027900424403, "grad_norm": 1.088800072669983, "tokens_seen": 98304, "elapsed_sec": 0.7535221669822931, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 59 |
+
{"event": "train", "timestamp": "2026-03-13T14:27:33", "step": 49, "loss": 7.95028281211853, "lr": 0.00098, "tok_per_sec": 2721.736196494166, "grad_norm": 1.794154167175293, "tokens_seen": 100352, "elapsed_sec": 0.7524608750245534, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 60 |
+
{"event": "train", "timestamp": "2026-03-13T14:27:34", "step": 50, "loss": 8.580274820327759, "lr": 0.001, "tok_per_sec": 2714.6346388455563, "grad_norm": 1.2600723505020142, "tokens_seen": 102400, "elapsed_sec": 0.7544293330283836, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 61 |
+
{"event": "eval", "timestamp": "2026-03-13T14:27:34", "step": 50, "val_loss": 8.607748031616211, "perplexity": 5473.907720171149, "eval_batches": 2}
|
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142559.log
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-03-13 14:25:59,997 | INFO | Pretraining started
|
| 2 |
+
2026-03-13 14:25:59,997 | INFO | Log file: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142559.log
|
| 3 |
+
2026-03-13 14:25:59,997 | INFO | Metrics JSONL: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_142559.jsonl
|
| 4 |
+
2026-03-13 14:25:59,997 | INFO | Arguments | model_config=configs/model_70m.json train_config=configs/pretrain_mps_dryrun.json max_steps_override=None
|
| 5 |
+
2026-03-13 14:25:59,997 | INFO | Model config | {'vocab_size': 49152, 'max_seq_len': 8192, 'd_model': 384, 'n_layers': 32, 'n_heads': 6, 'ffn_hidden_dim': 1024, 'rope_theta': 10000.0, 'rms_norm_eps': 1e-05, 'initializer_range': 0.02, 'dropout': 0.0, 'tie_word_embeddings': True, 'bias': False, 'pad_token_id': 0, 'bos_token_id': 1, 'eos_token_id': 2}
|
| 6 |
+
2026-03-13 14:25:59,997 | INFO | Train config | {'seed': 42, 'train_dir': 'data/pretokenized/train', 'val_dir': 'data/pretokenized/val', 'output_dir': 'outputs/pretrain_mps_dryrun', 'checkpoint_dir': 'checkpoints/pretrain_mps_dryrun', 'init_from': None, 'resume_from': None, 'seq_len': 512, 'micro_batch_size': 1, 'grad_accum_steps': 4, 'max_steps': 500, 'warmup_steps': 50, 'learning_rate': 0.001, 'min_lr': 0.0001, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'precision': 'fp32', 'num_workers': 0, 'log_interval': 1, 'eval_interval': 10, 'eval_batches': 2, 'save_interval': 10, 'compile_model': False}
|
| 7 |
+
2026-03-13 14:26:03,019 | INFO | Device summary | device=mps precision=fp32 compile_model=False
|
| 8 |
+
2026-03-13 14:26:03,020 | INFO | Model summary | parameters=75.57M
|
| 9 |
+
2026-03-13 14:26:03,020 | INFO | Batch summary | seq_len=512 micro_batch_size=1 grad_accum_steps=4 tokens_per_step=2,048
|
| 10 |
+
2026-03-13 14:26:03,020 | INFO | Dataset summary | train_dir=data/pretokenized/train val_dir=data/pretokenized/val num_train_shards=100
|
| 11 |
+
2026-03-13 14:26:04,222 | INFO | Train step | step=1 loss=10.8489 lr=0.000020 tok_per_sec=1,705 grad_norm=5.5737 tokens_seen=2.05K
|
| 12 |
+
2026-03-13 14:26:05,183 | INFO | Train step | step=2 loss=10.8977 lr=0.000040 tok_per_sec=2,133 grad_norm=5.0279 tokens_seen=4.10K
|
| 13 |
+
2026-03-13 14:26:05,936 | INFO | Train step | step=3 loss=10.7851 lr=0.000060 tok_per_sec=2,721 grad_norm=5.1142 tokens_seen=6.14K
|
| 14 |
+
2026-03-13 14:26:06,680 | INFO | Train step | step=4 loss=10.6348 lr=0.000080 tok_per_sec=2,755 grad_norm=6.4229 tokens_seen=8.19K
|
| 15 |
+
2026-03-13 14:26:07,424 | INFO | Train step | step=5 loss=10.7475 lr=0.000100 tok_per_sec=2,752 grad_norm=6.5803 tokens_seen=10.24K
|
| 16 |
+
2026-03-13 14:26:08,173 | INFO | Train step | step=6 loss=10.6336 lr=0.000120 tok_per_sec=2,738 grad_norm=6.5251 tokens_seen=12.29K
|
| 17 |
+
2026-03-13 14:26:08,918 | INFO | Train step | step=7 loss=10.4779 lr=0.000140 tok_per_sec=2,750 grad_norm=5.1896 tokens_seen=14.34K
|
| 18 |
+
2026-03-13 14:26:09,663 | INFO | Train step | step=8 loss=10.3537 lr=0.000160 tok_per_sec=2,754 grad_norm=2.3572 tokens_seen=16.38K
|
| 19 |
+
2026-03-13 14:26:10,408 | INFO | Train step | step=9 loss=10.3396 lr=0.000180 tok_per_sec=2,753 grad_norm=4.9875 tokens_seen=18.43K
|
| 20 |
+
2026-03-13 14:26:11,152 | INFO | Train step | step=10 loss=10.2073 lr=0.000200 tok_per_sec=2,755 grad_norm=5.5540 tokens_seen=20.48K
|
| 21 |
+
2026-03-13 14:26:11,326 | INFO | Eval step | step=10 val_loss=10.4182 perplexity=33464.41
|
| 22 |
+
2026-03-13 14:26:27,213 | INFO | Checkpoint saved | step=10 step_checkpoint=checkpoints/pretrain_mps_dryrun/step_0000010.pt last_checkpoint=checkpoints/pretrain_mps_dryrun/last.pt
|
| 23 |
+
2026-03-13 14:26:28,342 | INFO | Train step | step=11 loss=10.1618 lr=0.000220 tok_per_sec=119 grad_norm=2.1779 tokens_seen=22.53K
|
| 24 |
+
2026-03-13 14:26:29,107 | INFO | Train step | step=12 loss=10.0479 lr=0.000240 tok_per_sec=2,680 grad_norm=1.9738 tokens_seen=24.58K
|
| 25 |
+
2026-03-13 14:26:29,865 | INFO | Train step | step=13 loss=10.1055 lr=0.000260 tok_per_sec=2,702 grad_norm=4.0965 tokens_seen=26.62K
|
| 26 |
+
2026-03-13 14:26:30,617 | INFO | Train step | step=14 loss=9.9410 lr=0.000280 tok_per_sec=2,723 grad_norm=1.8798 tokens_seen=28.67K
|
| 27 |
+
2026-03-13 14:26:31,366 | INFO | Train step | step=15 loss=9.8521 lr=0.000300 tok_per_sec=2,738 grad_norm=3.7492 tokens_seen=30.72K
|
| 28 |
+
2026-03-13 14:26:32,109 | INFO | Train step | step=16 loss=9.7686 lr=0.000320 tok_per_sec=2,756 grad_norm=1.8649 tokens_seen=32.77K
|
| 29 |
+
2026-03-13 14:26:32,851 | INFO | Train step | step=17 loss=9.8800 lr=0.000340 tok_per_sec=2,763 grad_norm=1.8526 tokens_seen=34.82K
|
| 30 |
+
2026-03-13 14:26:33,593 | INFO | Train step | step=18 loss=9.6528 lr=0.000360 tok_per_sec=2,762 grad_norm=2.2104 tokens_seen=36.86K
|
| 31 |
+
2026-03-13 14:26:34,339 | INFO | Train step | step=19 loss=9.3711 lr=0.000380 tok_per_sec=2,746 grad_norm=2.0032 tokens_seen=38.91K
|
| 32 |
+
2026-03-13 14:26:35,084 | INFO | Train step | step=20 loss=9.2901 lr=0.000400 tok_per_sec=2,753 grad_norm=1.9113 tokens_seen=40.96K
|
| 33 |
+
2026-03-13 14:26:35,243 | INFO | Eval step | step=20 val_loss=9.6180 perplexity=15033.21
|
| 34 |
+
2026-03-13 14:26:47,586 | INFO | Checkpoint saved | step=20 step_checkpoint=checkpoints/pretrain_mps_dryrun/step_0000020.pt last_checkpoint=checkpoints/pretrain_mps_dryrun/last.pt
|
| 35 |
+
2026-03-13 14:26:48,510 | INFO | Train step | step=21 loss=9.2410 lr=0.000420 tok_per_sec=153 grad_norm=1.8868 tokens_seen=43.01K
|
| 36 |
+
2026-03-13 14:26:49,266 | INFO | Train step | step=22 loss=9.1267 lr=0.000440 tok_per_sec=2,713 grad_norm=1.8414 tokens_seen=45.06K
|
| 37 |
+
2026-03-13 14:26:50,022 | INFO | Train step | step=23 loss=9.1024 lr=0.000460 tok_per_sec=2,709 grad_norm=1.6314 tokens_seen=47.10K
|
| 38 |
+
2026-03-13 14:26:50,775 | INFO | Train step | step=24 loss=8.8112 lr=0.000480 tok_per_sec=2,722 grad_norm=1.8163 tokens_seen=49.15K
|
| 39 |
+
2026-03-13 14:26:51,527 | INFO | Train step | step=25 loss=8.6318 lr=0.000500 tok_per_sec=2,727 grad_norm=1.7629 tokens_seen=51.20K
|
| 40 |
+
2026-03-13 14:26:52,279 | INFO | Train step | step=26 loss=8.7227 lr=0.000520 tok_per_sec=2,723 grad_norm=1.6617 tokens_seen=53.25K
|
| 41 |
+
2026-03-13 14:26:53,024 | INFO | Train step | step=27 loss=8.6090 lr=0.000540 tok_per_sec=2,753 grad_norm=1.4688 tokens_seen=55.30K
|
| 42 |
+
2026-03-13 14:26:53,770 | INFO | Train step | step=28 loss=8.8903 lr=0.000560 tok_per_sec=2,748 grad_norm=1.3300 tokens_seen=57.34K
|
| 43 |
+
2026-03-13 14:26:54,520 | INFO | Train step | step=29 loss=8.2670 lr=0.000580 tok_per_sec=2,730 grad_norm=1.4133 tokens_seen=59.39K
|
| 44 |
+
2026-03-13 14:26:55,270 | INFO | Train step | step=30 loss=8.5482 lr=0.000600 tok_per_sec=2,733 grad_norm=1.6723 tokens_seen=61.44K
|
| 45 |
+
2026-03-13 14:26:55,430 | INFO | Eval step | step=30 val_loss=8.8440 perplexity=6932.60
|
| 46 |
+
2026-03-13 14:27:06,544 | INFO | Checkpoint saved | step=30 step_checkpoint=checkpoints/pretrain_mps_dryrun/step_0000030.pt last_checkpoint=checkpoints/pretrain_mps_dryrun/last.pt
|
| 47 |
+
2026-03-13 14:27:07,445 | INFO | Train step | step=31 loss=8.0998 lr=0.000620 tok_per_sec=168 grad_norm=1.3632 tokens_seen=63.49K
|
| 48 |
+
2026-03-13 14:27:08,207 | INFO | Train step | step=32 loss=8.2781 lr=0.000640 tok_per_sec=2,689 grad_norm=1.1354 tokens_seen=65.54K
|
| 49 |
+
2026-03-13 14:27:08,967 | INFO | Train step | step=33 loss=8.0486 lr=0.000660 tok_per_sec=2,696 grad_norm=0.8978 tokens_seen=67.58K
|
| 50 |
+
2026-03-13 14:27:09,724 | INFO | Train step | step=34 loss=8.7096 lr=0.000680 tok_per_sec=2,707 grad_norm=1.7513 tokens_seen=69.63K
|
| 51 |
+
2026-03-13 14:27:10,486 | INFO | Train step | step=35 loss=8.0777 lr=0.000700 tok_per_sec=2,690 grad_norm=0.9328 tokens_seen=71.68K
|
| 52 |
+
2026-03-13 14:27:11,237 | INFO | Train step | step=36 loss=8.0570 lr=0.000720 tok_per_sec=2,730 grad_norm=0.7005 tokens_seen=73.73K
|
| 53 |
+
2026-03-13 14:27:11,993 | INFO | Train step | step=37 loss=8.2809 lr=0.000740 tok_per_sec=2,712 grad_norm=0.8498 tokens_seen=75.78K
|
| 54 |
+
2026-03-13 14:27:12,744 | INFO | Train step | step=38 loss=8.4201 lr=0.000760 tok_per_sec=2,728 grad_norm=0.9405 tokens_seen=77.82K
|
| 55 |
+
2026-03-13 14:27:13,493 | INFO | Train step | step=39 loss=8.0400 lr=0.000780 tok_per_sec=2,734 grad_norm=0.8642 tokens_seen=79.87K
|
| 56 |
+
2026-03-13 14:27:14,239 | INFO | Train step | step=40 loss=8.1934 lr=0.000800 tok_per_sec=2,747 grad_norm=0.9127 tokens_seen=81.92K
|
| 57 |
+
2026-03-13 14:27:14,399 | INFO | Eval step | step=40 val_loss=8.3650 perplexity=4294.29
|
| 58 |
+
2026-03-13 14:27:26,408 | INFO | Checkpoint saved | step=40 step_checkpoint=checkpoints/pretrain_mps_dryrun/step_0000040.pt last_checkpoint=checkpoints/pretrain_mps_dryrun/last.pt
|
| 59 |
+
2026-03-13 14:27:27,338 | INFO | Train step | step=41 loss=7.9512 lr=0.000820 tok_per_sec=156 grad_norm=1.0788 tokens_seen=83.97K
|
| 60 |
+
2026-03-13 14:27:28,091 | INFO | Train step | step=42 loss=7.9867 lr=0.000840 tok_per_sec=2,723 grad_norm=0.9518 tokens_seen=86.02K
|
| 61 |
+
2026-03-13 14:27:28,841 | INFO | Train step | step=43 loss=7.9844 lr=0.000860 tok_per_sec=2,735 grad_norm=0.9768 tokens_seen=88.06K
|
| 62 |
+
2026-03-13 14:27:29,644 | INFO | Train step | step=44 loss=7.7850 lr=0.000880 tok_per_sec=2,552 grad_norm=0.9357 tokens_seen=90.11K
|
| 63 |
+
2026-03-13 14:27:30,394 | INFO | Train step | step=45 loss=8.1174 lr=0.000900 tok_per_sec=2,731 grad_norm=0.7717 tokens_seen=92.16K
|
| 64 |
+
2026-03-13 14:27:31,145 | INFO | Train step | step=46 loss=8.2206 lr=0.000920 tok_per_sec=2,730 grad_norm=0.7731 tokens_seen=94.21K
|
| 65 |
+
2026-03-13 14:27:31,898 | INFO | Train step | step=47 loss=8.0700 lr=0.000940 tok_per_sec=2,719 grad_norm=1.0135 tokens_seen=96.26K
|
| 66 |
+
2026-03-13 14:27:32,652 | INFO | Train step | step=48 loss=7.9093 lr=0.000960 tok_per_sec=2,718 grad_norm=1.0888 tokens_seen=98.30K
|
| 67 |
+
2026-03-13 14:27:33,406 | INFO | Train step | step=49 loss=7.9503 lr=0.000980 tok_per_sec=2,722 grad_norm=1.7942 tokens_seen=100.35K
|
| 68 |
+
2026-03-13 14:27:34,161 | INFO | Train step | step=50 loss=8.5803 lr=0.001000 tok_per_sec=2,715 grad_norm=1.2601 tokens_seen=102.40K
|
| 69 |
+
2026-03-13 14:27:34,323 | INFO | Eval step | step=50 val_loss=8.6077 perplexity=5473.91
|
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_143014.jsonl
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"event": "run_started", "timestamp": "2026-03-13T14:30:14", "log_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_143014.log", "metrics_path": "outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_143014.jsonl", "model_config": {"vocab_size": 49152, "max_seq_len": 8192, "d_model": 384, "n_layers": 32, "n_heads": 6, "ffn_hidden_dim": 1024, "rope_theta": 10000.0, "rms_norm_eps": 1e-05, "initializer_range": 0.02, "dropout": 0.0, "tie_word_embeddings": true, "bias": false, "pad_token_id": 0, "bos_token_id": 1, "eos_token_id": 2}, "train_config": {"seed": 42, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val", "output_dir": "outputs/pretrain_mps_dryrun", "checkpoint_dir": "checkpoints/pretrain_mps_dryrun", "init_from": null, "resume_from": null, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "max_steps": 500, "warmup_steps": 50, "learning_rate": 0.001, "min_lr": 0.0001, "weight_decay": 0.1, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "precision": "fp32", "num_workers": 0, "log_interval": 1, "eval_interval": 10, "eval_batches": 2, "save_interval": 10, "compile_model": false}, "args": {"model_config": "configs/model_70m.json", "train_config": "configs/pretrain_mps_dryrun.json", "max_steps_override": null}}
|
| 2 |
+
{"event": "runtime_summary", "timestamp": "2026-03-13T14:30:17", "device": "mps", "precision": "fp32", "compile_model": false, "parameters": 75571584, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4, "tokens_per_step": 2048, "num_train_shards": 100, "train_dir": "data/pretokenized/train", "val_dir": "data/pretokenized/val"}
|
| 3 |
+
{"event": "train", "timestamp": "2026-03-13T14:30:18", "step": 1, "loss": 10.848917245864868, "lr": 2e-05, "tok_per_sec": 1662.7320321814002, "grad_norm": 5.573695659637451, "tokens_seen": 2048, "elapsed_sec": 1.2317077919724397, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 4 |
+
{"event": "train", "timestamp": "2026-03-13T14:30:19", "step": 2, "loss": 10.897652626037598, "lr": 4e-05, "tok_per_sec": 2559.899335289762, "grad_norm": 5.0279011726379395, "tokens_seen": 4096, "elapsed_sec": 0.8000314589589834, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 5 |
+
{"event": "train", "timestamp": "2026-03-13T14:30:19", "step": 3, "loss": 10.785077333450317, "lr": 6e-05, "tok_per_sec": 2701.710782349432, "grad_norm": 5.1141676902771, "tokens_seen": 6144, "elapsed_sec": 0.7580382080050185, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 6 |
+
{"event": "train", "timestamp": "2026-03-13T14:30:20", "step": 4, "loss": 10.634832859039307, "lr": 8e-05, "tok_per_sec": 2712.1050200194168, "grad_norm": 6.422860622406006, "tokens_seen": 8192, "elapsed_sec": 0.7551329999696463, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 7 |
+
{"event": "train", "timestamp": "2026-03-13T14:30:21", "step": 5, "loss": 10.747493743896484, "lr": 0.0001, "tok_per_sec": 2709.8966635877678, "grad_norm": 6.580272674560547, "tokens_seen": 10240, "elapsed_sec": 0.7557483750279061, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 8 |
+
{"event": "train", "timestamp": "2026-03-13T14:30:22", "step": 6, "loss": 10.633646488189697, "lr": 0.00012, "tok_per_sec": 2711.349354437794, "grad_norm": 6.525122165679932, "tokens_seen": 12288, "elapsed_sec": 0.7553434590226971, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 9 |
+
{"event": "train", "timestamp": "2026-03-13T14:30:22", "step": 7, "loss": 10.477944374084473, "lr": 0.00014000000000000001, "tok_per_sec": 2702.4625638908824, "grad_norm": 5.189583778381348, "tokens_seen": 14336, "elapsed_sec": 0.7578273339895532, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 10 |
+
{"event": "train", "timestamp": "2026-03-13T14:30:23", "step": 8, "loss": 10.35365605354309, "lr": 0.00016, "tok_per_sec": 2709.4527031511175, "grad_norm": 2.357203960418701, "tokens_seen": 16384, "elapsed_sec": 0.7558722090325318, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 11 |
+
{"event": "train", "timestamp": "2026-03-13T14:30:24", "step": 9, "loss": 10.339627742767334, "lr": 0.00018, "tok_per_sec": 2711.198605030451, "grad_norm": 4.987534999847412, "tokens_seen": 18432, "elapsed_sec": 0.7553854580037296, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 12 |
+
{"event": "train", "timestamp": "2026-03-13T14:30:25", "step": 10, "loss": 10.207262754440308, "lr": 0.0002, "tok_per_sec": 2718.8463334608755, "grad_norm": 5.554019451141357, "tokens_seen": 20480, "elapsed_sec": 0.753260666038841, "seq_len": 512, "micro_batch_size": 1, "grad_accum_steps": 4}
|
| 13 |
+
{"event": "eval", "timestamp": "2026-03-13T14:30:25", "step": 10, "val_loss": 10.418238162994385, "perplexity": 33464.423318005785, "eval_batches": 2}
|
outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_143014.log
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-03-13 14:30:14,410 | INFO | Pretraining started
|
| 2 |
+
2026-03-13 14:30:14,410 | INFO | Log file: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_143014.log
|
| 3 |
+
2026-03-13 14:30:14,410 | INFO | Metrics JSONL: outputs/pretrain_mps_dryrun/logs/train_pretrain_20260313_143014.jsonl
|
| 4 |
+
2026-03-13 14:30:14,410 | INFO | Arguments | model_config=configs/model_70m.json train_config=configs/pretrain_mps_dryrun.json max_steps_override=None
|
| 5 |
+
2026-03-13 14:30:14,410 | INFO | Model config | {'vocab_size': 49152, 'max_seq_len': 8192, 'd_model': 384, 'n_layers': 32, 'n_heads': 6, 'ffn_hidden_dim': 1024, 'rope_theta': 10000.0, 'rms_norm_eps': 1e-05, 'initializer_range': 0.02, 'dropout': 0.0, 'tie_word_embeddings': True, 'bias': False, 'pad_token_id': 0, 'bos_token_id': 1, 'eos_token_id': 2}
|
| 6 |
+
2026-03-13 14:30:14,410 | INFO | Train config | {'seed': 42, 'train_dir': 'data/pretokenized/train', 'val_dir': 'data/pretokenized/val', 'output_dir': 'outputs/pretrain_mps_dryrun', 'checkpoint_dir': 'checkpoints/pretrain_mps_dryrun', 'init_from': None, 'resume_from': None, 'seq_len': 512, 'micro_batch_size': 1, 'grad_accum_steps': 4, 'max_steps': 500, 'warmup_steps': 50, 'learning_rate': 0.001, 'min_lr': 0.0001, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'precision': 'fp32', 'num_workers': 0, 'log_interval': 1, 'eval_interval': 10, 'eval_batches': 2, 'save_interval': 10, 'compile_model': False}
|
| 7 |
+
2026-03-13 14:30:17,140 | INFO | Device summary | device=mps precision=fp32 compile_model=False
|
| 8 |
+
2026-03-13 14:30:17,141 | INFO | Model summary | parameters=75.57M
|
| 9 |
+
2026-03-13 14:30:17,141 | INFO | Batch summary | seq_len=512 micro_batch_size=1 grad_accum_steps=4 tokens_per_step=2,048
|
| 10 |
+
2026-03-13 14:30:17,141 | INFO | Dataset summary | train_dir=data/pretokenized/train val_dir=data/pretokenized/val num_train_shards=100
|
| 11 |
+
2026-03-13 14:30:18,374 | INFO | Train step | step=1 loss=10.8489 lr=0.000020 tok_per_sec=1,663 grad_norm=5.5737 tokens_seen=2.05K
|
| 12 |
+
2026-03-13 14:30:19,174 | INFO | Train step | step=2 loss=10.8977 lr=0.000040 tok_per_sec=2,560 grad_norm=5.0279 tokens_seen=4.10K
|
| 13 |
+
2026-03-13 14:30:19,933 | INFO | Train step | step=3 loss=10.7851 lr=0.000060 tok_per_sec=2,702 grad_norm=5.1142 tokens_seen=6.14K
|
| 14 |
+
2026-03-13 14:30:20,688 | INFO | Train step | step=4 loss=10.6348 lr=0.000080 tok_per_sec=2,712 grad_norm=6.4229 tokens_seen=8.19K
|
| 15 |
+
2026-03-13 14:30:21,445 | INFO | Train step | step=5 loss=10.7475 lr=0.000100 tok_per_sec=2,710 grad_norm=6.5803 tokens_seen=10.24K
|
| 16 |
+
2026-03-13 14:30:22,201 | INFO | Train step | step=6 loss=10.6336 lr=0.000120 tok_per_sec=2,711 grad_norm=6.5251 tokens_seen=12.29K
|
| 17 |
+
2026-03-13 14:30:22,959 | INFO | Train step | step=7 loss=10.4779 lr=0.000140 tok_per_sec=2,702 grad_norm=5.1896 tokens_seen=14.34K
|
| 18 |
+
2026-03-13 14:30:23,715 | INFO | Train step | step=8 loss=10.3537 lr=0.000160 tok_per_sec=2,709 grad_norm=2.3572 tokens_seen=16.38K
|
| 19 |
+
2026-03-13 14:30:24,471 | INFO | Train step | step=9 loss=10.3396 lr=0.000180 tok_per_sec=2,711 grad_norm=4.9875 tokens_seen=18.43K
|
| 20 |
+
2026-03-13 14:30:25,225 | INFO | Train step | step=10 loss=10.2073 lr=0.000200 tok_per_sec=2,719 grad_norm=5.5540 tokens_seen=20.48K
|
| 21 |
+
2026-03-13 14:30:25,394 | INFO | Eval step | step=10 val_loss=10.4182 perplexity=33464.42
|
outputs/pretrain_mps_dryrun/run_config.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_config": {
|
| 3 |
+
"vocab_size": 49152,
|
| 4 |
+
"max_seq_len": 8192,
|
| 5 |
+
"d_model": 384,
|
| 6 |
+
"n_layers": 32,
|
| 7 |
+
"n_heads": 6,
|
| 8 |
+
"ffn_hidden_dim": 1024,
|
| 9 |
+
"rope_theta": 10000.0,
|
| 10 |
+
"rms_norm_eps": 1e-05,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"dropout": 0.0,
|
| 13 |
+
"tie_word_embeddings": true,
|
| 14 |
+
"bias": false,
|
| 15 |
+
"pad_token_id": 0,
|
| 16 |
+
"bos_token_id": 1,
|
| 17 |
+
"eos_token_id": 2
|
| 18 |
+
},
|
| 19 |
+
"train_config": {
|
| 20 |
+
"seed": 42,
|
| 21 |
+
"train_dir": "data/pretokenized/train",
|
| 22 |
+
"val_dir": "data/pretokenized/val",
|
| 23 |
+
"output_dir": "outputs/pretrain_mps_dryrun",
|
| 24 |
+
"checkpoint_dir": "checkpoints/pretrain_mps_dryrun",
|
| 25 |
+
"init_from": null,
|
| 26 |
+
"resume_from": null,
|
| 27 |
+
"seq_len": 512,
|
| 28 |
+
"micro_batch_size": 1,
|
| 29 |
+
"grad_accum_steps": 4,
|
| 30 |
+
"max_steps": 500,
|
| 31 |
+
"warmup_steps": 50,
|
| 32 |
+
"learning_rate": 0.001,
|
| 33 |
+
"min_lr": 0.0001,
|
| 34 |
+
"weight_decay": 0.1,
|
| 35 |
+
"beta1": 0.9,
|
| 36 |
+
"beta2": 0.95,
|
| 37 |
+
"grad_clip": 1.0,
|
| 38 |
+
"precision": "fp32",
|
| 39 |
+
"num_workers": 0,
|
| 40 |
+
"log_interval": 1,
|
| 41 |
+
"eval_interval": 10,
|
| 42 |
+
"eval_batches": 2,
|
| 43 |
+
"save_interval": 10,
|
| 44 |
+
"compile_model": false
|
| 45 |
+
}
|
| 46 |
+
}
|
outputs/pretrain_stage1/.ipynb_checkpoints/run_config-checkpoint.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_config": {
|
| 3 |
+
"vocab_size": 49152,
|
| 4 |
+
"max_seq_len": 8192,
|
| 5 |
+
"d_model": 384,
|
| 6 |
+
"n_layers": 32,
|
| 7 |
+
"n_heads": 6,
|
| 8 |
+
"ffn_hidden_dim": 1024,
|
| 9 |
+
"rope_theta": 10000.0,
|
| 10 |
+
"rms_norm_eps": 1e-05,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"dropout": 0.0,
|
| 13 |
+
"tie_word_embeddings": true,
|
| 14 |
+
"bias": false,
|
| 15 |
+
"pad_token_id": 0,
|
| 16 |
+
"bos_token_id": 1,
|
| 17 |
+
"eos_token_id": 2
|
| 18 |
+
},
|
| 19 |
+
"train_config": {
|
| 20 |
+
"seed": 42,
|
| 21 |
+
"train_dir": "data/pretokenized/train",
|
| 22 |
+
"val_dir": "data/pretokenized/val",
|
| 23 |
+
"output_dir": "outputs/pretrain_stage1",
|
| 24 |
+
"checkpoint_dir": "checkpoints/pretrain_stage1",
|
| 25 |
+
"init_from": null,
|
| 26 |
+
"resume_from": null,
|
| 27 |
+
"seq_len": 2048,
|
| 28 |
+
"micro_batch_size": 8,
|
| 29 |
+
"grad_accum_steps": 32,
|
| 30 |
+
"max_steps": 20000,
|
| 31 |
+
"warmup_steps": 2000,
|
| 32 |
+
"learning_rate": 0.003,
|
| 33 |
+
"min_lr": 0.0003,
|
| 34 |
+
"weight_decay": 0.1,
|
| 35 |
+
"beta1": 0.9,
|
| 36 |
+
"beta2": 0.95,
|
| 37 |
+
"grad_clip": 1.0,
|
| 38 |
+
"precision": "bf16",
|
| 39 |
+
"num_workers": 0,
|
| 40 |
+
"log_interval": 10,
|
| 41 |
+
"eval_interval": 250,
|
| 42 |
+
"eval_batches": 50,
|
| 43 |
+
"save_interval": 100,
|
| 44 |
+
"compile_model": false
|
| 45 |
+
}
|
| 46 |
+
}
|
outputs/pretrain_stage1/logs/.ipynb_checkpoints/train_pretrain_20260313_152202-checkpoint.log
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-03-13 15:22:02,275 | INFO | Pretraining started
|
| 2 |
+
2026-03-13 15:22:02,276 | INFO | Log file: outputs/pretrain_stage1/logs/train_pretrain_20260313_152202.log
|
| 3 |
+
2026-03-13 15:22:02,276 | INFO | Metrics JSONL: outputs/pretrain_stage1/logs/train_pretrain_20260313_152202.jsonl
|
| 4 |
+
2026-03-13 15:22:02,276 | INFO | Arguments | model_config=configs/model_70m.json train_config=configs/pretrain_5090_stage1.json max_steps_override=None
|
| 5 |
+
2026-03-13 15:22:02,276 | INFO | Model config | {'vocab_size': 49152, 'max_seq_len': 8192, 'd_model': 384, 'n_layers': 32, 'n_heads': 6, 'ffn_hidden_dim': 1024, 'rope_theta': 10000.0, 'rms_norm_eps': 1e-05, 'initializer_range': 0.02, 'dropout': 0.0, 'tie_word_embeddings': True, 'bias': False, 'pad_token_id': 0, 'bos_token_id': 1, 'eos_token_id': 2}
|
| 6 |
+
2026-03-13 15:22:02,276 | INFO | Train config | {'seed': 42, 'train_dir': 'data/pretokenized/train', 'val_dir': 'data/pretokenized/val', 'output_dir': 'outputs/pretrain_stage1', 'checkpoint_dir': 'checkpoints/pretrain_stage1', 'init_from': None, 'resume_from': None, 'seq_len': 2048, 'micro_batch_size': 8, 'grad_accum_steps': 32, 'max_steps': 20000, 'warmup_steps': 2000, 'learning_rate': 0.003, 'min_lr': 0.0003, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'precision': 'bf16', 'num_workers': 0, 'log_interval': 10, 'eval_interval': 250, 'eval_batches': 50, 'save_interval': 100, 'compile_model': False}
|
| 7 |
+
2026-03-13 15:22:05,635 | INFO | Device summary | device=cuda precision=bf16 compile_model=False
|
| 8 |
+
2026-03-13 15:22:05,636 | INFO | Model summary | parameters=75.57M
|
| 9 |
+
2026-03-13 15:22:05,636 | INFO | Batch summary | seq_len=2048 micro_batch_size=8 grad_accum_steps=32 tokens_per_step=524,288
|
| 10 |
+
2026-03-13 15:22:05,636 | INFO | Dataset summary | train_dir=data/pretokenized/train val_dir=data/pretokenized/val num_train_shards=100
|
| 11 |
+
2026-03-13 15:22:48,364 | INFO | Train step | step=10 loss=10.7962 lr=0.000015 tok_per_sec=122,709 grad_norm=2.2954 tokens_seen=5.24M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 12 |
+
2026-03-13 15:23:29,710 | INFO | Train step | step=20 loss=10.3929 lr=0.000030 tok_per_sec=126,809 grad_norm=1.6374 tokens_seen=10.49M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 13 |
+
2026-03-13 15:24:11,004 | INFO | Train step | step=30 loss=10.1422 lr=0.000045 tok_per_sec=126,967 grad_norm=1.6471 tokens_seen=15.73M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 14 |
+
2026-03-13 15:24:53,082 | INFO | Train step | step=40 loss=9.9494 lr=0.000060 tok_per_sec=124,605 grad_norm=1.5930 tokens_seen=20.97M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 15 |
+
2026-03-13 15:25:36,401 | INFO | Train step | step=50 loss=9.6967 lr=0.000075 tok_per_sec=121,032 grad_norm=1.5725 tokens_seen=26.21M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 16 |
+
2026-03-13 15:26:19,167 | INFO | Train step | step=60 loss=9.3897 lr=0.000090 tok_per_sec=122,597 grad_norm=1.5564 tokens_seen=31.46M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 17 |
+
2026-03-13 15:27:01,620 | INFO | Train step | step=70 loss=9.0575 lr=0.000105 tok_per_sec=123,501 grad_norm=1.5012 tokens_seen=36.70M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 18 |
+
2026-03-13 15:27:43,580 | INFO | Train step | step=80 loss=8.6948 lr=0.000120 tok_per_sec=124,954 grad_norm=1.5047 tokens_seen=41.94M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 19 |
+
2026-03-13 15:28:26,150 | INFO | Train step | step=90 loss=8.3511 lr=0.000135 tok_per_sec=123,163 grad_norm=1.2600 tokens_seen=47.19M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 20 |
+
2026-03-13 15:29:09,001 | INFO | Train step | step=100 loss=8.0548 lr=0.000150 tok_per_sec=122,354 grad_norm=0.9670 tokens_seen=52.43M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 21 |
+
2026-03-13 15:29:12,440 | INFO | Checkpoint saved | step=100 step_checkpoint=checkpoints/pretrain_stage1/step_0000100.pt last_checkpoint=checkpoints/pretrain_stage1/last.pt
|
| 22 |
+
2026-03-13 15:29:54,815 | INFO | Train step | step=110 loss=7.8111 lr=0.000165 tok_per_sec=114,442 grad_norm=0.7305 tokens_seen=57.67M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 23 |
+
2026-03-13 15:30:37,782 | INFO | Train step | step=120 loss=7.6241 lr=0.000180 tok_per_sec=122,024 grad_norm=0.5833 tokens_seen=62.91M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 24 |
+
2026-03-13 15:31:19,995 | INFO | Train step | step=130 loss=7.4835 lr=0.000195 tok_per_sec=124,205 grad_norm=1.0428 tokens_seen=68.16M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 25 |
+
2026-03-13 15:32:03,842 | INFO | Train step | step=140 loss=7.3397 lr=0.000210 tok_per_sec=119,576 grad_norm=0.6136 tokens_seen=73.40M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 26 |
+
2026-03-13 15:32:45,816 | INFO | Train step | step=150 loss=7.1952 lr=0.000225 tok_per_sec=124,911 grad_norm=1.2209 tokens_seen=78.64M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 27 |
+
2026-03-13 15:33:27,516 | INFO | Train step | step=160 loss=7.0569 lr=0.000240 tok_per_sec=125,732 grad_norm=0.9325 tokens_seen=83.89M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 28 |
+
2026-03-13 15:34:09,763 | INFO | Train step | step=170 loss=6.9308 lr=0.000255 tok_per_sec=124,102 grad_norm=1.1994 tokens_seen=89.13M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 29 |
+
2026-03-13 15:34:51,580 | INFO | Train step | step=180 loss=6.7975 lr=0.000270 tok_per_sec=125,380 grad_norm=1.2646 tokens_seen=94.37M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 30 |
+
2026-03-13 15:35:33,306 | INFO | Train step | step=190 loss=6.6834 lr=0.000285 tok_per_sec=125,653 grad_norm=0.9549 tokens_seen=99.61M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 31 |
+
2026-03-13 15:36:16,369 | INFO | Train step | step=200 loss=6.5762 lr=0.000300 tok_per_sec=121,752 grad_norm=1.5983 tokens_seen=104.86M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 32 |
+
2026-03-13 15:36:18,831 | INFO | Checkpoint saved | step=200 step_checkpoint=checkpoints/pretrain_stage1/step_0000200.pt last_checkpoint=checkpoints/pretrain_stage1/last.pt
|
| 33 |
+
2026-03-13 15:37:01,458 | INFO | Train step | step=210 loss=6.4800 lr=0.000315 tok_per_sec=116,281 grad_norm=0.9575 tokens_seen=110.10M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 34 |
+
2026-03-13 15:37:43,418 | INFO | Train step | step=220 loss=6.3799 lr=0.000330 tok_per_sec=124,955 grad_norm=1.0189 tokens_seen=115.34M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 35 |
+
2026-03-13 15:38:25,012 | INFO | Train step | step=230 loss=6.3007 lr=0.000345 tok_per_sec=126,050 grad_norm=1.4322 tokens_seen=120.59M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 36 |
+
2026-03-13 15:39:07,059 | INFO | Train step | step=240 loss=6.2100 lr=0.000360 tok_per_sec=124,696 grad_norm=1.4284 tokens_seen=125.83M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 37 |
+
2026-03-13 15:39:48,188 | INFO | Train step | step=250 loss=6.1378 lr=0.000375 tok_per_sec=127,476 grad_norm=0.8502 tokens_seen=131.07M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 38 |
+
2026-03-13 15:39:50,336 | INFO | Eval step | step=250 val_loss=6.1302 perplexity=459.54
|
| 39 |
+
2026-03-13 15:40:32,241 | INFO | Train step | step=260 loss=6.0712 lr=0.000390 tok_per_sec=119,017 grad_norm=1.5691 tokens_seen=136.31M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 40 |
+
2026-03-13 15:41:15,172 | INFO | Train step | step=270 loss=6.0020 lr=0.000405 tok_per_sec=122,129 grad_norm=1.3161 tokens_seen=141.56M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 41 |
+
2026-03-13 15:41:57,319 | INFO | Train step | step=280 loss=5.9392 lr=0.000420 tok_per_sec=124,398 grad_norm=1.3891 tokens_seen=146.80M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 42 |
+
2026-03-13 15:42:39,253 | INFO | Train step | step=290 loss=5.8713 lr=0.000435 tok_per_sec=125,030 grad_norm=1.1325 tokens_seen=152.04M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 43 |
+
2026-03-13 15:43:21,127 | INFO | Train step | step=300 loss=5.8109 lr=0.000450 tok_per_sec=125,209 grad_norm=1.0078 tokens_seen=157.29M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 44 |
+
2026-03-13 15:43:23,493 | INFO | Checkpoint saved | step=300 step_checkpoint=checkpoints/pretrain_stage1/step_0000300.pt last_checkpoint=checkpoints/pretrain_stage1/last.pt
|
| 45 |
+
2026-03-13 15:44:04,852 | INFO | Train step | step=310 loss=5.7384 lr=0.000465 tok_per_sec=119,907 grad_norm=1.2581 tokens_seen=162.53M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 46 |
+
2026-03-13 15:44:46,724 | INFO | Train step | step=320 loss=5.6798 lr=0.000480 tok_per_sec=125,216 grad_norm=0.9680 tokens_seen=167.77M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 47 |
+
2026-03-13 15:45:29,877 | INFO | Train step | step=330 loss=5.6204 lr=0.000495 tok_per_sec=121,497 grad_norm=1.4606 tokens_seen=173.02M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 48 |
+
2026-03-13 15:46:12,195 | INFO | Train step | step=340 loss=5.5678 lr=0.000510 tok_per_sec=123,896 grad_norm=1.2717 tokens_seen=178.26M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 49 |
+
2026-03-13 15:46:54,104 | INFO | Train step | step=350 loss=5.5266 lr=0.000525 tok_per_sec=125,105 grad_norm=1.6313 tokens_seen=183.50M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 50 |
+
2026-03-13 15:47:36,611 | INFO | Train step | step=360 loss=5.4781 lr=0.000540 tok_per_sec=123,343 grad_norm=1.0196 tokens_seen=188.74M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 51 |
+
2026-03-13 15:48:18,069 | INFO | Train step | step=370 loss=5.4230 lr=0.000555 tok_per_sec=126,468 grad_norm=1.0206 tokens_seen=193.99M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 52 |
+
2026-03-13 15:49:00,176 | INFO | Train step | step=380 loss=5.3519 lr=0.000570 tok_per_sec=124,516 grad_norm=0.7121 tokens_seen=199.23M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 53 |
+
2026-03-13 15:49:42,492 | INFO | Train step | step=390 loss=5.3026 lr=0.000585 tok_per_sec=123,899 grad_norm=1.0407 tokens_seen=204.47M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 54 |
+
2026-03-13 15:50:24,343 | INFO | Train step | step=400 loss=5.2721 lr=0.000600 tok_per_sec=125,278 grad_norm=0.7830 tokens_seen=209.72M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 55 |
+
2026-03-13 15:50:27,093 | INFO | Checkpoint saved | step=400 step_checkpoint=checkpoints/pretrain_stage1/step_0000400.pt last_checkpoint=checkpoints/pretrain_stage1/last.pt
|
| 56 |
+
2026-03-13 15:51:08,698 | INFO | Train step | step=410 loss=5.2136 lr=0.000615 tok_per_sec=118,206 grad_norm=0.6625 tokens_seen=214.96M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 57 |
+
2026-03-13 15:51:50,429 | INFO | Train step | step=420 loss=5.1839 lr=0.000630 tok_per_sec=125,640 grad_norm=1.1878 tokens_seen=220.20M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 58 |
+
2026-03-13 15:52:31,919 | INFO | Train step | step=430 loss=5.1433 lr=0.000645 tok_per_sec=126,367 grad_norm=1.0909 tokens_seen=225.44M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 59 |
+
2026-03-13 15:53:14,261 | INFO | Train step | step=440 loss=5.0811 lr=0.000660 tok_per_sec=123,827 grad_norm=1.0818 tokens_seen=230.69M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 60 |
+
2026-03-13 15:53:55,426 | INFO | Train step | step=450 loss=5.0691 lr=0.000675 tok_per_sec=127,367 grad_norm=0.8735 tokens_seen=235.93M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 61 |
+
2026-03-13 15:54:36,815 | INFO | Train step | step=460 loss=5.0245 lr=0.000690 tok_per_sec=126,676 grad_norm=0.7781 tokens_seen=241.17M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 62 |
+
2026-03-13 15:55:18,411 | INFO | Train step | step=470 loss=4.9740 lr=0.000705 tok_per_sec=126,046 grad_norm=0.8157 tokens_seen=246.42M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 63 |
+
2026-03-13 15:55:59,935 | INFO | Train step | step=480 loss=4.9158 lr=0.000720 tok_per_sec=126,265 grad_norm=0.4327 tokens_seen=251.66M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 64 |
+
2026-03-13 15:56:41,577 | INFO | Train step | step=490 loss=4.8794 lr=0.000735 tok_per_sec=125,907 grad_norm=0.9491 tokens_seen=256.90M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 65 |
+
2026-03-13 15:57:23,706 | INFO | Train step | step=500 loss=4.8574 lr=0.000750 tok_per_sec=124,451 grad_norm=0.7693 tokens_seen=262.14M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 66 |
+
2026-03-13 15:57:26,171 | INFO | Eval step | step=500 val_loss=4.8718 perplexity=130.55
|
| 67 |
+
2026-03-13 15:57:30,367 | INFO | Checkpoint saved | step=500 step_checkpoint=checkpoints/pretrain_stage1/step_0000500.pt last_checkpoint=checkpoints/pretrain_stage1/last.pt
|
| 68 |
+
2026-03-13 15:58:13,490 | INFO | Train step | step=510 loss=4.8269 lr=0.000765 tok_per_sec=105,314 grad_norm=0.8741 tokens_seen=267.39M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 69 |
+
2026-03-13 15:58:56,422 | INFO | Train step | step=520 loss=4.7787 lr=0.000780 tok_per_sec=122,122 grad_norm=0.5603 tokens_seen=272.63M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 70 |
+
2026-03-13 15:59:39,461 | INFO | Train step | step=530 loss=4.7081 lr=0.000795 tok_per_sec=121,823 grad_norm=0.7208 tokens_seen=277.87M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 71 |
+
2026-03-13 16:00:23,234 | INFO | Train step | step=540 loss=4.6785 lr=0.000810 tok_per_sec=119,776 grad_norm=0.6182 tokens_seen=283.12M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 72 |
+
2026-03-13 16:01:05,239 | INFO | Train step | step=550 loss=4.6483 lr=0.000825 tok_per_sec=124,821 grad_norm=0.8779 tokens_seen=288.36M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 73 |
+
2026-03-13 16:01:47,156 | INFO | Train step | step=560 loss=4.6100 lr=0.000840 tok_per_sec=125,080 grad_norm=0.7765 tokens_seen=293.60M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 74 |
+
2026-03-13 16:02:30,946 | INFO | Train step | step=570 loss=4.5568 lr=0.000855 tok_per_sec=119,733 grad_norm=0.5192 tokens_seen=298.84M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 75 |
+
2026-03-13 16:03:13,505 | INFO | Train step | step=580 loss=4.5020 lr=0.000870 tok_per_sec=123,194 grad_norm=0.4420 tokens_seen=304.09M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 76 |
+
2026-03-13 16:03:55,388 | INFO | Train step | step=590 loss=4.4536 lr=0.000885 tok_per_sec=125,182 grad_norm=0.4726 tokens_seen=309.33M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 77 |
+
2026-03-13 16:04:37,233 | INFO | Train step | step=600 loss=4.4008 lr=0.000900 tok_per_sec=125,295 grad_norm=0.5401 tokens_seen=314.57M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 78 |
+
2026-03-13 16:04:40,532 | INFO | Checkpoint saved | step=600 step_checkpoint=checkpoints/pretrain_stage1/step_0000600.pt last_checkpoint=checkpoints/pretrain_stage1/last.pt
|
| 79 |
+
2026-03-13 16:05:22,263 | INFO | Train step | step=610 loss=4.3697 lr=0.000915 tok_per_sec=116,433 grad_norm=0.5282 tokens_seen=319.82M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 80 |
+
2026-03-13 16:06:04,579 | INFO | Train step | step=620 loss=4.3184 lr=0.000930 tok_per_sec=123,903 grad_norm=0.8301 tokens_seen=325.06M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 81 |
+
2026-03-13 16:06:47,513 | INFO | Train step | step=630 loss=4.3098 lr=0.000945 tok_per_sec=122,117 grad_norm=0.4351 tokens_seen=330.30M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
| 82 |
+
2026-03-13 16:07:29,171 | INFO | Train step | step=640 loss=4.2368 lr=0.000960 tok_per_sec=125,858 grad_norm=0.4222 tokens_seen=335.54M mem_alloc_gb=1.28 mem_reserved_gb=23.53 max_mem_alloc_gb=19.63 max_mem_reserved_gb=23.53
|
outputs/pretrain_stage1/logs/train_pretrain_20260313_152202.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
outputs/pretrain_stage1/logs/train_pretrain_20260313_152202.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
outputs/pretrain_stage1/run_config.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_config": {
|
| 3 |
+
"vocab_size": 49152,
|
| 4 |
+
"max_seq_len": 8192,
|
| 5 |
+
"d_model": 384,
|
| 6 |
+
"n_layers": 32,
|
| 7 |
+
"n_heads": 6,
|
| 8 |
+
"ffn_hidden_dim": 1024,
|
| 9 |
+
"rope_theta": 10000.0,
|
| 10 |
+
"rms_norm_eps": 1e-05,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"dropout": 0.0,
|
| 13 |
+
"tie_word_embeddings": true,
|
| 14 |
+
"bias": false,
|
| 15 |
+
"pad_token_id": 0,
|
| 16 |
+
"bos_token_id": 1,
|
| 17 |
+
"eos_token_id": 2
|
| 18 |
+
},
|
| 19 |
+
"train_config": {
|
| 20 |
+
"seed": 42,
|
| 21 |
+
"train_dir": "data/pretokenized/train",
|
| 22 |
+
"val_dir": "data/pretokenized/val",
|
| 23 |
+
"output_dir": "outputs/pretrain_stage1",
|
| 24 |
+
"checkpoint_dir": "checkpoints/pretrain_stage1",
|
| 25 |
+
"init_from": null,
|
| 26 |
+
"resume_from": null,
|
| 27 |
+
"seq_len": 2048,
|
| 28 |
+
"micro_batch_size": 8,
|
| 29 |
+
"grad_accum_steps": 32,
|
| 30 |
+
"max_steps": 20000,
|
| 31 |
+
"warmup_steps": 2000,
|
| 32 |
+
"learning_rate": 0.003,
|
| 33 |
+
"min_lr": 0.0003,
|
| 34 |
+
"weight_decay": 0.1,
|
| 35 |
+
"beta1": 0.9,
|
| 36 |
+
"beta2": 0.95,
|
| 37 |
+
"grad_clip": 1.0,
|
| 38 |
+
"precision": "bf16",
|
| 39 |
+
"num_workers": 0,
|
| 40 |
+
"log_interval": 10,
|
| 41 |
+
"eval_interval": 250,
|
| 42 |
+
"eval_batches": 50,
|
| 43 |
+
"save_interval": 100,
|
| 44 |
+
"compile_model": false
|
| 45 |
+
}
|
| 46 |
+
}
|
outputs/pretrain_stage2/run_config.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_config": {
|
| 3 |
+
"vocab_size": 49152,
|
| 4 |
+
"max_seq_len": 8192,
|
| 5 |
+
"d_model": 384,
|
| 6 |
+
"n_layers": 32,
|
| 7 |
+
"n_heads": 6,
|
| 8 |
+
"ffn_hidden_dim": 1024,
|
| 9 |
+
"rope_theta": 10000.0,
|
| 10 |
+
"rms_norm_eps": 1e-05,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"dropout": 0.0,
|
| 13 |
+
"tie_word_embeddings": true,
|
| 14 |
+
"bias": false,
|
| 15 |
+
"pad_token_id": 0,
|
| 16 |
+
"bos_token_id": 1,
|
| 17 |
+
"eos_token_id": 2
|
| 18 |
+
},
|
| 19 |
+
"train_config": {
|
| 20 |
+
"seed": 42,
|
| 21 |
+
"train_dir": "data/pretokenized/train",
|
| 22 |
+
"val_dir": "data/pretokenized/val",
|
| 23 |
+
"output_dir": "outputs/pretrain_stage2",
|
| 24 |
+
"checkpoint_dir": "checkpoints/pretrain_stage2",
|
| 25 |
+
"init_from": "checkpoints/pretrain_stage1/last.pt",
|
| 26 |
+
"resume_from": null,
|
| 27 |
+
"seq_len": 8192,
|
| 28 |
+
"micro_batch_size": 2,
|
| 29 |
+
"grad_accum_steps": 16,
|
| 30 |
+
"max_steps": 1000,
|
| 31 |
+
"warmup_steps": 100,
|
| 32 |
+
"learning_rate": 0.001,
|
| 33 |
+
"min_lr": 0.0001,
|
| 34 |
+
"weight_decay": 0.1,
|
| 35 |
+
"beta1": 0.9,
|
| 36 |
+
"beta2": 0.95,
|
| 37 |
+
"grad_clip": 1.0,
|
| 38 |
+
"precision": "bf16",
|
| 39 |
+
"num_workers": 0,
|
| 40 |
+
"log_interval": 5,
|
| 41 |
+
"eval_interval": 100,
|
| 42 |
+
"eval_batches": 20,
|
| 43 |
+
"save_interval": 50,
|
| 44 |
+
"compile_model": false
|
| 45 |
+
}
|
| 46 |
+
}
|
scripts/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
scripts/eval_perplexity.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import math
|
| 5 |
+
import sys
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
from torch.utils.data import DataLoader
|
| 10 |
+
|
| 11 |
+
ROOT = Path(__file__).resolve().parents[1]
|
| 12 |
+
sys.path.append(str(ROOT / "src"))
|
| 13 |
+
|
| 14 |
+
from sllm.checkpoint import load_checkpoint
|
| 15 |
+
from sllm.config import ModelConfig, load_json
|
| 16 |
+
from sllm.data import SequentialEvalDataset
|
| 17 |
+
from sllm.model import SLLMForCausalLM
|
| 18 |
+
from sllm.utils import autocast_context, get_device, resolve_runtime_precision, setup_logger
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def build_parser() -> argparse.ArgumentParser:
|
| 22 |
+
parser = argparse.ArgumentParser(description="Evaluate perplexity on validation shards.")
|
| 23 |
+
parser.add_argument("--checkpoint", required=True, help="Path to checkpoint file.")
|
| 24 |
+
parser.add_argument("--model-config", required=False, help="Optional model config JSON path.")
|
| 25 |
+
parser.add_argument("--data-dir", required=True, help="Validation root directory.")
|
| 26 |
+
parser.add_argument("--seq-len", type=int, default=2_048)
|
| 27 |
+
parser.add_argument("--batch-size", type=int, default=8)
|
| 28 |
+
parser.add_argument("--batches", type=int, default=50)
|
| 29 |
+
parser.add_argument("--precision", default="bf16")
|
| 30 |
+
return parser
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def main() -> None:
    """Compute mean cross-entropy and perplexity over validation shards.

    Loads a checkpoint (model config taken from the checkpoint payload
    unless --model-config overrides it), streams up to --batches batches
    from the sequential eval dataset, and reports val_loss / perplexity
    both to the log and to stdout.
    """
    args = build_parser().parse_args()
    logger, log_path = setup_logger("sllm.eval_perplexity", Path("outputs/eval"), "eval_perplexity")
    logger.info("Perplexity evaluation started")
    logger.info("Log file: %s", log_path)
    logger.info("Arguments | checkpoint=%s model_config=%s data_dir=%s seq_len=%s batch_size=%s batches=%s precision=%s", args.checkpoint, args.model_config, args.data_dir, args.seq_len, args.batch_size, args.batches, args.precision)
    device = get_device()
    # Resolve the requested precision against device capabilities; a warning
    # is returned (instead of raised) when the request cannot be honoured.
    runtime_precision, precision_warning = resolve_runtime_precision(device, args.precision)
    if precision_warning is not None:
        logger.warning(precision_warning)
    payload = load_checkpoint(args.checkpoint, map_location=device)
    # Prefer an explicit config file; otherwise trust the config stored in the checkpoint.
    if args.model_config:
        model_config = ModelConfig.from_dict(load_json(args.model_config))
    else:
        model_config = ModelConfig.from_dict(payload["model_config"])

    model = SLLMForCausalLM(model_config).to(device)
    model.load_state_dict(payload["model"])
    model.eval()

    # NOTE(review): max_batches receives a value scaled by batch_size
    # (i.e. a sequence count), despite the parameter name — confirm against
    # SequentialEvalDataset's definition.
    dataset = SequentialEvalDataset(
        data_dir=args.data_dir,
        split="val",
        seq_len=args.seq_len,
        max_batches=args.batches * args.batch_size,
    )
    loader = DataLoader(dataset, batch_size=args.batch_size, num_workers=0)

    losses = []
    with torch.no_grad():
        for batch_index, batch in enumerate(loader):
            # Hard cap at --batches regardless of dataset size.
            if batch_index >= args.batches:
                break
            batch = {key: value.to(device) for key, value in batch.items()}
            with autocast_context(device, runtime_precision):
                loss = model(**batch)["loss"]
            losses.append(loss.detach().float().item())

    # max(1, ...) guards against division by zero when the loader yields nothing.
    mean_loss = float(sum(losses) / max(1, len(losses)))
    # Clamp the exponent so an untrained model cannot overflow math.exp.
    perplexity = math.exp(min(mean_loss, 20))
    logger.info("Perplexity evaluation finished | val_loss=%.4f perplexity=%.2f", mean_loss, perplexity)
    print(f"val_loss={mean_loss:.4f}")
    print(f"perplexity={perplexity:.2f}")
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
# Script entry point.
if __name__ == "__main__":
    main()
|
scripts/generate.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import sys
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
from tokenizers import Tokenizer
|
| 9 |
+
|
| 10 |
+
ROOT = Path(__file__).resolve().parents[1]
|
| 11 |
+
sys.path.append(str(ROOT / "src"))
|
| 12 |
+
|
| 13 |
+
from sllm.checkpoint import load_checkpoint
|
| 14 |
+
from sllm.config import ModelConfig, load_json
|
| 15 |
+
from sllm.model import SLLMForCausalLM
|
| 16 |
+
from sllm.utils import get_device, setup_logger
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI for single-prompt text generation.

    Required: --checkpoint, --tokenizer-dir, --prompt. Optional:
    --max-new-tokens (128), --temperature (0.8), --top-k (50), --model-config.
    """
    cli = argparse.ArgumentParser(description="Generate text from a trained checkpoint.")
    cli.add_argument("--checkpoint", required=True, help="Path to model checkpoint.")
    cli.add_argument("--tokenizer-dir", required=True, help="Directory with tokenizer.json.")
    cli.add_argument("--prompt", required=True, help="Prompt text.")
    cli.add_argument("--max-new-tokens", type=int, default=128)
    cli.add_argument("--temperature", type=float, default=0.8)
    cli.add_argument("--top-k", type=int, default=50)
    cli.add_argument("--model-config", required=False, help="Optional path to model config JSON.")
    return cli
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def main() -> None:
    """Sample text from a trained checkpoint for a single prompt.

    Loads the tokenizer and its metadata, restores the model from the
    checkpoint (config overridable via --model-config), prepends the BOS id
    to the encoded prompt, runs model.generate, and prints the decoded
    output (special tokens included) to stdout.
    """
    args = build_parser().parse_args()
    logger, log_path = setup_logger("sllm.generate", Path("outputs/generate"), "generate")
    logger.info("Generation started")
    logger.info("Log file: %s", log_path)
    logger.info(
        "Arguments | checkpoint=%s tokenizer_dir=%s max_new_tokens=%s temperature=%s top_k=%s model_config=%s",
        args.checkpoint,
        args.tokenizer_dir,
        args.max_new_tokens,
        args.temperature,
        args.top_k,
        args.model_config,
    )
    device = get_device()
    tokenizer = Tokenizer.from_file(str(Path(args.tokenizer_dir) / "tokenizer.json"))
    # Special-token ids live in the side-car metadata, not in tokenizer.json.
    tokenizer_meta = load_json(Path(args.tokenizer_dir) / "tokenizer_meta.json")
    specials = tokenizer_meta["special_tokens"]

    payload = load_checkpoint(args.checkpoint, map_location=device)
    # Prefer an explicit config file; otherwise trust the config stored in the checkpoint.
    if args.model_config:
        model_config = ModelConfig.from_dict(load_json(args.model_config))
    else:
        model_config = ModelConfig.from_dict(payload["model_config"])

    model = SLLMForCausalLM(model_config).to(device)
    model.load_state_dict(payload["model"])
    model.eval()

    # BOS is added manually because encode() is called with add_special_tokens=False.
    prompt_ids = [int(specials["bos_token_id"])] + tokenizer.encode(
        args.prompt,
        add_special_tokens=False,
    ).ids
    input_ids = torch.tensor([prompt_ids], dtype=torch.long, device=device)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            max_new_tokens=args.max_new_tokens,
            temperature=args.temperature,
            top_k=args.top_k,
            eos_token_id=int(specials["eos_token_id"]),
        )

    # skip_special_tokens=False keeps BOS/EOS visible in the printed sample.
    decoded = tokenizer.decode(output_ids[0].tolist(), skip_special_tokens=False)
    logger.info("Generation finished | prompt_tokens=%s output_tokens=%s", len(prompt_ids), output_ids.shape[1])
    print(decoded)
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
# Script entry point.
if __name__ == "__main__":
    main()
|
scripts/prepare_pretrain_data.py
ADDED
|
@@ -0,0 +1,318 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import math
|
| 5 |
+
import random
|
| 6 |
+
import sys
|
| 7 |
+
from collections import deque
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
from datasets import load_dataset
|
| 11 |
+
from tokenizers import Tokenizer
|
| 12 |
+
|
| 13 |
+
ROOT = Path(__file__).resolve().parents[1]
|
| 14 |
+
sys.path.append(str(ROOT / "src"))
|
| 15 |
+
|
| 16 |
+
from sllm.config import DataMixConfig, load_json, save_json
|
| 17 |
+
from sllm.data import TokenShardWriter
|
| 18 |
+
from sllm.utils import setup_logger
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI for pretokenizing and sharding the pretraining mixture."""
    cli = argparse.ArgumentParser(description="Tokenize and shard pretraining corpora.")
    cli.add_argument("--data-config", required=True, help="Path to data mixture JSON config.")
    cli.add_argument("--tokenizer-dir", required=True, help="Directory with tokenizer.json.")
    cli.add_argument("--output-dir", required=True, help="Root directory for train/val shards.")
    cli.add_argument("--seed", type=int, default=42, help="Random seed for dataset shuffling.")
    return cli
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def load_tokenizer(tokenizer_dir: str | Path) -> tuple[Tokenizer, dict]:
    """Load ``tokenizer.json`` plus its side-car metadata from *tokenizer_dir*.

    Returns the tokenizers-library Tokenizer and the parsed
    ``tokenizer_meta.json`` dict (vocab size, special-token ids, ...).
    """
    tokenizer_dir = Path(tokenizer_dir)
    tokenizer = Tokenizer.from_file(str(tokenizer_dir / "tokenizer.json"))
    metadata = load_json(tokenizer_dir / "tokenizer_meta.json")
    return tokenizer, metadata
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def iter_source_rows(source, seed: int):
    """Return an iterator over raw rows of one pretraining source.

    Loads the Hugging Face dataset described by *source* (path / config /
    data_dir / split / revision). Streaming sources are shuffled with a
    bounded buffer of ``source.shuffle_buffer`` rows.

    NOTE(review): non-streaming datasets are returned in on-disk order and
    the *seed* is unused for them — confirm this asymmetry is intentional.
    """
    dataset = load_dataset(
        path=source.path,
        name=source.config_name,
        data_dir=source.data_dir,
        split=source.split,
        revision=source.revision,
        streaming=source.streaming,
    )
    if source.streaming:
        dataset = dataset.shuffle(seed=seed, buffer_size=source.shuffle_buffer)
    return iter(dataset)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
TOKENIZE_BATCH_SIZE = 128
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def allocate_token_targets(data_config: DataMixConfig, total_tokens: int) -> dict[str, int]:
    """Split *total_tokens* across sources proportionally to their weights.

    Largest-remainder apportionment: every source first receives the floor
    of its exact share, then the leftover tokens are handed out one at a
    time to the sources with the biggest fractional parts (source name
    breaks exact ties). The returned integers sum to *total_tokens*.
    """
    share = data_config.normalized_weights()
    exact = {src.name: total_tokens * share[src.name] for src in data_config.sources}
    allotted = {name: int(math.floor(value)) for name, value in exact.items()}
    leftover = total_tokens - sum(allotted.values())
    # Names ordered by descending fractional part (name as deterministic tiebreak).
    by_fraction = sorted(
        exact,
        key=lambda name: (exact[name] - math.floor(exact[name]), name),
        reverse=True,
    )
    for slot in range(leftover):
        allotted[by_fraction[slot % len(by_fraction)]] += 1
    return allotted
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def make_source_state(source, seed: int) -> dict:
    """Create the mutable bookkeeping record used while draining one source.

    Holds the live row iterator, a queue of tokenized documents, per-split
    token counters, and the exhaustion flag.
    """
    state = dict(
        source=source,
        iterator=iter_source_rows(source, seed),
        documents_used=0,
        train_tokens_written=0,
        val_tokens_written=0,
        exhausted=False,
        token_queue=deque(),
    )
    return state
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def refill_token_queue(state: dict, tokenizer: Tokenizer) -> None:
    """Pull up to TOKENIZE_BATCH_SIZE texts from the source and tokenize them.

    Non-empty token-id lists are appended to ``state["token_queue"]``; the
    state is marked exhausted once the underlying iterator runs dry. Rows
    whose text field is missing, non-string, or blank are skipped.
    """
    if state["exhausted"]:
        return

    pending: list[str] = []
    source_iter = state["iterator"]
    text_field = state["source"].text_field or ""
    while len(pending) < TOKENIZE_BATCH_SIZE:
        try:
            row = next(source_iter)
        except StopIteration:
            state["exhausted"] = True
            break

        value = row.get(text_field, None)
        if not isinstance(value, str):
            continue
        value = value.strip()
        if value:
            pending.append(value)

    if not pending:
        return

    # One batched call keeps tokenization in the library's fast path.
    for encoding in tokenizer.encode_batch(pending):
        if encoding.ids:
            state["token_queue"].append(encoding.ids)
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def next_valid_token_ids(state: dict, tokenizer: Tokenizer) -> list[int] | None:
    """Pop the next tokenized document, refilling the queue on demand.

    Returns None once the source is fully exhausted and its queue is empty.
    """
    while not state["token_queue"]:
        if state["exhausted"]:
            return None
        refill_token_queue(state, tokenizer)
    state["documents_used"] += 1
    return state["token_queue"].popleft()
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def choose_source_name(states: dict[str, dict], targets: dict[str, int], split: str, rng: random.Random) -> str | None:
    """Pick the eligible source that is furthest behind its token target.

    Eligible sources are not exhausted, have a positive target, and have not
    yet reached it. Ties on progress are broken by a random draw so no source
    is systematically favoured. Returns None when nothing is eligible.
    """
    written_key = f"{split}_tokens_written"
    best: tuple[float, float, str] | None = None
    for name, state in states.items():
        if state["exhausted"]:
            continue
        target = targets[name]
        if target <= 0:
            continue
        written = state[written_key]
        if written >= target:
            continue
        candidate = (written / target, rng.random(), name)
        if best is None or candidate < best:
            best = candidate
    return None if best is None else best[2]
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def interleave_split(
    split: str,
    writer: TokenShardWriter,
    states: dict[str, dict],
    targets: dict[str, int],
    tokenizer: Tokenizer,
    logger,
    rng: random.Random,
) -> int:
    """Fill one split ("train" or "val") by interleaving documents across sources.

    Repeatedly selects the source furthest behind its per-source token
    target (choose_source_name), pops its next tokenized document, and
    writes as many of its tokens as both the source budget and the split
    budget allow. Raises RuntimeError when all sources are exhausted before
    the split target is met. Returns the number of tokens written.
    """
    total_target = sum(targets.values())
    total_written = 0
    emitted_documents = 0

    logger.info(
        "Interleave start | split=%s total_target_tokens=%s strategy=weighted_progress_balancing",
        split,
        f"{total_target:,}",
    )

    while total_written < total_target:
        source_name = choose_source_name(states, targets, split, rng)
        if source_name is None:
            # No source can contribute any more tokens — the mixture is underfilled.
            raise RuntimeError(
                f"Недостаточно данных для заполнения split={split}. "
                "Все доступные источники исчерпаны до достижения целевого объема."
            )

        state = states[source_name]
        token_ids = next_valid_token_ids(state, tokenizer)
        if token_ids is None:
            # The chosen source ran dry mid-selection; the next loop iteration
            # re-selects among the remaining sources.
            logger.warning("Source exhausted early | split=%s source=%s", split, source_name)
            continue

        # Truncate the document so neither the per-source budget nor the
        # overall split budget is exceeded.
        source_remaining = targets[source_name] - state[f"{split}_tokens_written"]
        split_remaining = total_target - total_written
        chunk = token_ids[: min(len(token_ids), source_remaining, split_remaining)]
        if not chunk:
            continue

        writer.add_tokens(chunk)
        state[f"{split}_tokens_written"] += len(chunk)
        total_written += len(chunk)
        emitted_documents += 1

        # Periodic progress heartbeat (every 10k documents).
        if emitted_documents % 10_000 == 0:
            logger.info(
                "Interleave progress | split=%s documents=%s total_tokens=%s/%s current_source=%s",
                split,
                f"{emitted_documents:,}",
                f"{total_written:,}",
                f"{total_target:,}",
                source_name,
            )

    logger.info(
        "Interleave done | split=%s documents=%s total_tokens=%s",
        split,
        f"{emitted_documents:,}",
        f"{total_written:,}",
    )
    return total_written
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def main() -> None:
    """Tokenize, interleave, and shard the pretraining mixture.

    Reads the data-mix config, allocates per-source token budgets for the
    train and val splits, drains the sources through the weighted-progress
    interleaver into shard writers, and records a dataset_summary.json with
    per-source accounting.
    """
    args = build_parser().parse_args()
    data_config = DataMixConfig.from_dict(load_json(args.data_config))
    tokenizer, tokenizer_meta = load_tokenizer(args.tokenizer_dir)
    output_dir = Path(args.output_dir)
    train_dir = output_dir / "train"
    val_dir = output_dir / "val"
    train_dir.mkdir(parents=True, exist_ok=True)
    val_dir.mkdir(parents=True, exist_ok=True)
    logger, log_path = setup_logger("sllm.prepare_pretrain_data", output_dir, "prepare_pretrain_data")
    logger.info("Pretokenization started")
    logger.info("Log file: %s", log_path)
    logger.info("Arguments | data_config=%s tokenizer_dir=%s output_dir=%s seed=%s", args.data_config, args.tokenizer_dir, args.output_dir, args.seed)
    logger.info("Tokenizer meta | vocab_size=%s special_tokens=%s", tokenizer_meta.get("vocab_size"), tokenizer_meta.get("special_tokens"))
    logger.info("Mixing strategy | global interleaving with weighted progress balancing")
    logger.info("Tokenization strategy | encode_batch with batch_size=%s", TOKENIZE_BATCH_SIZE)

    weight_map = data_config.normalized_weights()
    train_targets = allocate_token_targets(data_config, data_config.train_tokens)
    val_targets = allocate_token_targets(data_config, data_config.val_tokens)
    dataset_summary: dict[str, dict] = {}
    states: dict[str, dict] = {}

    # Each source gets its own derived seed so streaming shuffles are decorrelated.
    for index, source in enumerate(data_config.sources):
        states[source.name] = make_source_state(source, args.seed + index)
        logger.info(
            "Source registered | name=%s path=%s data_dir=%s split=%s text_field=%s weight=%.4f train_target=%s val_target=%s streaming=%s",
            source.name,
            source.path,
            source.data_dir,
            source.split,
            source.text_field,
            weight_map[source.name],
            f"{train_targets[source.name]:,}",
            f"{val_targets[source.name]:,}",
            source.streaming,
        )

    rng_val = random.Random(args.seed + 10_000)
    rng_train = random.Random(args.seed + 20_000)
    # Val shards are capped by the (usually small) val budget but never
    # shrunk below 1M tokens per shard.
    val_writer = TokenShardWriter(
        output_dir=val_dir,
        prefix="val",
        shard_size_tokens=max(1_000_000, min(data_config.shard_size_tokens, data_config.val_tokens)),
    )
    train_writer = TokenShardWriter(
        output_dir=train_dir,
        prefix="train",
        shard_size_tokens=data_config.shard_size_tokens,
    )

    # Both splits drain the same shared iterators, so each document lands in
    # exactly one split; val is filled first.
    total_val = interleave_split("val", val_writer, states, val_targets, tokenizer, logger, rng_val)
    total_train = interleave_split("train", train_writer, states, train_targets, tokenizer, logger, rng_train)

    train_shards = train_writer.finalize()
    val_shards = val_writer.finalize()

    # Per-source accounting for the summary file and the log.
    for source in data_config.sources:
        state = states[source.name]
        dataset_summary[source.name] = {
            "path": source.path,
            "data_dir": source.data_dir,
            "split": source.split,
            "train_target_tokens": train_targets[source.name],
            "val_target_tokens": val_targets[source.name],
            "train_tokens_written": state["train_tokens_written"],
            "val_tokens_written": state["val_tokens_written"],
            "documents_used": state["documents_used"],
        }
        logger.info(
            "Source done | name=%s documents=%s train_tokens=%s/%s val_tokens=%s/%s",
            source.name,
            f"{state['documents_used']:,}",
            f"{state['train_tokens_written']:,}",
            f"{train_targets[source.name]:,}",
            f"{state['val_tokens_written']:,}",
            f"{val_targets[source.name]:,}",
        )

    save_json(
        output_dir / "dataset_summary.json",
        {
            "tokenizer": tokenizer_meta,
            "data_config": data_config.to_dict(),
            "mixing_strategy": "global_interleaving_weighted_progress_balancing",
            "train_target_tokens": data_config.train_tokens,
            "val_target_tokens": data_config.val_tokens,
            "train_tokens_written": total_train,
            "val_tokens_written": total_val,
            "train_shards": len(train_shards),
            "val_shards": len(val_shards),
            "sources": dataset_summary,
        },
    )
    logger.info(
        "Pretokenization finished | output_dir=%s total_train_tokens=%s total_val_tokens=%s train_shards=%s val_shards=%s",
        output_dir,
        f"{total_train:,}",
        f"{total_val:,}",
        len(train_shards),
        len(val_shards),
    )
    logger.info("Dataset summary saved | path=%s", output_dir / "dataset_summary.json")
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
# Script entry point.
if __name__ == "__main__":
    main()
|
scripts/prepare_sft_data.py
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import sys
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
from datasets import load_dataset
|
| 8 |
+
from tokenizers import Tokenizer
|
| 9 |
+
|
| 10 |
+
ROOT = Path(__file__).resolve().parents[1]
|
| 11 |
+
sys.path.append(str(ROOT / "src"))
|
| 12 |
+
|
| 13 |
+
from sllm.config import load_json, save_json
|
| 14 |
+
from sllm.data import SFTShardWriter
|
| 15 |
+
from sllm.utils import setup_logger
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI for SFT data preparation.

    Required: --config, --tokenizer-dir, --output-dir. Optional:
    --seq-len (2048), --seed (42).
    """
    cli = argparse.ArgumentParser(description="Prepare fixed-length SFT tensors.")
    cli.add_argument("--config", required=True, help="Path to SFT data JSON config.")
    cli.add_argument("--tokenizer-dir", required=True, help="Directory with tokenizer.json and metadata.")
    cli.add_argument("--output-dir", required=True, help="Directory to store processed SFT tensors.")
    cli.add_argument("--seq-len", type=int, default=2_048, help="Packed example length.")
    cli.add_argument("--seed", type=int, default=42, help="Random seed for dataset shuffling.")
    return cli
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def load_tokenizer(tokenizer_dir: str | Path) -> tuple[Tokenizer, dict]:
    """Load ``tokenizer.json`` plus its side-car metadata from *tokenizer_dir*.

    NOTE(review): duplicated in scripts/prepare_pretrain_data.py — consider
    moving this helper into the shared sllm package.
    """
    tokenizer_dir = Path(tokenizer_dir)
    tokenizer = Tokenizer.from_file(str(tokenizer_dir / "tokenizer.json"))
    metadata = load_json(tokenizer_dir / "tokenizer_meta.json")
    return tokenizer, metadata
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def row_to_messages(row: dict, config: dict) -> list[dict[str, str]]:
    """Convert one raw dataset row into a normalized chat-message list.

    Supported formats (``config["format"]``): "messages" (a list of
    role/content dicts, with structured content blocks flattened),
    "prompt_response" (optionally with a configured system prompt), and
    "alpaca" (instruction + optional input + output). Raises ValueError when
    required fields are absent or the format name is unknown.
    """
    fmt = config.get("format", "messages")

    if fmt == "messages":
        raw = row.get(config.get("messages_field", "messages"))
        if not isinstance(raw, list):
            raise ValueError("Не найден список сообщений в SFT-датасете.")
        cleaned = []
        for entry in raw:
            if not isinstance(entry, dict):
                continue
            role = entry.get("role")
            body = entry.get("content")
            if isinstance(body, list):
                # Flatten structured content blocks into one newline-joined string.
                fragments = [part.get("text", "") for part in body if isinstance(part, dict)]
                body = "\n".join(fragment for fragment in fragments if fragment)
            if isinstance(role, str) and isinstance(body, str) and body.strip():
                cleaned.append({"role": role, "content": body.strip()})
        return cleaned

    if fmt == "prompt_response":
        prompt = row.get(config.get("prompt_field", "prompt"))
        response = row.get(config.get("response_field", "response"))
        if not isinstance(prompt, str) or not isinstance(response, str):
            raise ValueError("Не найдены поля prompt/response в SFT-датасете.")
        conversation = []
        system_prompt = config.get("system_prompt")
        if isinstance(system_prompt, str) and system_prompt.strip():
            conversation.append({"role": "system", "content": system_prompt.strip()})
        conversation.append({"role": "user", "content": prompt.strip()})
        conversation.append({"role": "assistant", "content": response.strip()})
        return conversation

    if fmt == "alpaca":
        instruction = row.get(config.get("instruction_field", "instruction"))
        extra = row.get(config.get("input_field", "input"), "")
        answer = row.get(config.get("output_field", "output"))
        if not isinstance(instruction, str) or not isinstance(answer, str):
            raise ValueError("Не найдены поля instruction/output в Alpaca-подобном датасете.")
        user_text = instruction.strip()
        if isinstance(extra, str) and extra.strip():
            user_text = f"{user_text}\n\n{extra.strip()}"
        return [
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": answer.strip()},
        ]

    raise ValueError(f"Unsupported SFT format: {fmt}")
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def tokenize_messages(
    tokenizer: Tokenizer,
    messages: list[dict[str, str]],
    bos_id: int,
    eos_id: int,
) -> tuple[list[int], list[int]]:
    """Render a chat as ``<|role|>\\ncontent\\n`` turns and tokenize it.

    Returns (input_ids, labels). Labels mirror input_ids for assistant turns
    and the trailing EOS, and are -100 (ignored by the loss) everywhere else,
    including the leading BOS.
    """
    input_ids = [bos_id]
    labels = [-100]

    for message in messages:
        role = message["role"].strip().lower()
        content = message["content"].strip()
        if not content:
            continue
        piece = tokenizer.encode(f"<|{role}|>\n{content}\n", add_special_tokens=False).ids
        if not piece:
            continue
        input_ids += piece
        # Only assistant tokens are supervised; everything else is masked out.
        labels += piece if role == "assistant" else [-100] * len(piece)

    input_ids.append(eos_id)
    labels.append(eos_id)
    return input_ids, labels
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def pad_or_truncate(
    input_ids: list[int],
    labels: list[int],
    seq_len: int,
    pad_id: int,
) -> tuple[list[int], list[int]]:
    """Force both sequences to exactly *seq_len* entries.

    Longer sequences are clipped; shorter ones are right-padded with
    *pad_id* (inputs) and -100 (labels, so padding never contributes loss).
    """
    clipped_ids = input_ids[:seq_len]
    clipped_labels = labels[:seq_len]
    shortfall = seq_len - len(clipped_ids)
    if shortfall > 0:
        clipped_ids = clipped_ids + [pad_id] * shortfall
        clipped_labels = clipped_labels + [-100] * shortfall
    return clipped_ids, clipped_labels
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def main() -> None:
    """Convert an SFT dataset into fixed-length tensor shards.

    Loads the source dataset described by the JSON config, converts each
    row into chat messages, tokenizes with assistant-only supervision,
    pads/truncates to --seq-len, and routes the first val_examples rows to
    the val writer and the rest to train. Writes dataset_summary.json with
    the shard metadata.
    """
    args = build_parser().parse_args()
    config = load_json(args.config)
    tokenizer, tokenizer_meta = load_tokenizer(args.tokenizer_dir)
    specials = tokenizer_meta["special_tokens"]
    bos_id = int(specials["bos_token_id"])
    eos_id = int(specials["eos_token_id"])
    pad_id = int(specials["pad_token_id"])

    dataset = load_dataset(
        path=config["path"],
        name=config.get("config_name"),
        split=config.get("split", "train"),
        revision=config.get("revision"),
        streaming=bool(config.get("streaming", False)),
    )
    # Shuffle before splitting so val is a random sample, not a prefix of
    # the on-disk order (enabled by default).
    if config.get("shuffle", True):
        dataset = dataset.shuffle(seed=args.seed)

    val_examples = int(config.get("val_examples", 1_000))
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    logger, log_path = setup_logger("sllm.prepare_sft_data", output_dir, "prepare_sft_data")
    logger.info("SFT data preparation started")
    logger.info("Log file: %s", log_path)
    logger.info(
        "Arguments | config=%s tokenizer_dir=%s output_dir=%s seq_len=%s seed=%s",
        args.config,
        args.tokenizer_dir,
        args.output_dir,
        args.seq_len,
        args.seed,
    )
    logger.info(
        "SFT source config | path=%s config_name=%s split=%s format=%s streaming=%s val_examples=%s max_train_examples=%s",
        config.get("path"),
        config.get("config_name"),
        config.get("split", "train"),
        config.get("format", "messages"),
        bool(config.get("streaming", False)),
        val_examples,
        config.get("max_train_examples"),
    )
    train_writer = SFTShardWriter(output_dir, prefix="train", seq_len=args.seq_len)
    val_writer = SFTShardWriter(output_dir, prefix="val", seq_len=args.seq_len)

    train_count = 0
    val_count = 0
    max_train_examples = config.get("max_train_examples")

    for row in dataset:
        # NOTE(review): row_to_messages raises ValueError on malformed rows
        # (e.g. a missing messages list), which aborts the whole run — confirm
        # that failing fast is preferred over skipping bad rows here.
        messages = row_to_messages(row, config)
        if not messages:
            continue
        input_ids, labels = tokenize_messages(tokenizer, messages, bos_id=bos_id, eos_id=eos_id)
        input_ids, labels = pad_or_truncate(input_ids, labels, args.seq_len, pad_id=pad_id)

        # First val_examples usable rows go to val; everything after to train.
        if val_count < val_examples:
            val_writer.add_example(input_ids, labels)
            val_count += 1
        else:
            train_writer.add_example(input_ids, labels)
            train_count += 1

        total_examples = train_count + val_count
        if total_examples % 5_000 == 0:
            logger.info(
                "SFT progress | processed=%s train_examples=%s val_examples=%s",
                f"{total_examples:,}",
                f"{train_count:,}",
                f"{val_count:,}",
            )

        if max_train_examples is not None and train_count >= int(max_train_examples):
            break

    train_metadata = train_writer.finalize()
    val_metadata = val_writer.finalize()
    save_json(
        output_dir / "dataset_summary.json",
        {
            "config": config,
            "tokenizer_meta": tokenizer_meta,
            "train": train_metadata,
            "val": val_metadata,
        },
    )
    logger.info("SFT dataset saved | output_dir=%s", output_dir)
    logger.info("SFT summary | train_examples=%s val_examples=%s", f"{train_count:,}", f"{val_count:,}")
    logger.info("SFT metadata saved | path=%s", output_dir / "dataset_summary.json")
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
# Script entry point.
if __name__ == "__main__":
    main()
|
scripts/train_pretrain.py
ADDED
|
@@ -0,0 +1,405 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import math
|
| 5 |
+
import sys
|
| 6 |
+
import time
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
from torch.utils.data import DataLoader
|
| 11 |
+
|
| 12 |
+
ROOT = Path(__file__).resolve().parents[1]
|
| 13 |
+
sys.path.append(str(ROOT / "src"))
|
| 14 |
+
|
| 15 |
+
from sllm.checkpoint import load_checkpoint, save_checkpoint
|
| 16 |
+
from sllm.config import ModelConfig, TrainConfig, load_json, save_json
|
| 17 |
+
from sllm.data import RandomTokenDataset, SequentialEvalDataset
|
| 18 |
+
from sllm.model import SLLMForCausalLM
|
| 19 |
+
from sllm.utils import (
|
| 20 |
+
append_jsonl,
|
| 21 |
+
autocast_context,
|
| 22 |
+
cosine_lr,
|
| 23 |
+
cuda_memory_snapshot,
|
| 24 |
+
ensure_dir,
|
| 25 |
+
format_number,
|
| 26 |
+
get_device,
|
| 27 |
+
iso_timestamp,
|
| 28 |
+
maybe_enable_tf32,
|
| 29 |
+
model_parameter_count,
|
| 30 |
+
resolve_runtime_precision,
|
| 31 |
+
set_optimizer_lr,
|
| 32 |
+
set_seed,
|
| 33 |
+
setup_logger,
|
| 34 |
+
timestamp,
|
| 35 |
+
tokens_per_step,
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the pretraining entry point."""
    parser = argparse.ArgumentParser(description="Pretrain the small causal LM.")
    # Both config paths are mandatory; they fully describe model + run.
    for flag, help_text in (
        ("--model-config", "Path to model JSON config."),
        ("--train-config", "Path to pretraining JSON config."),
    ):
        parser.add_argument(flag, required=True, help=help_text)
    parser.add_argument(
        "--max-steps",
        type=int,
        default=None,
        help="Optional override for debugging or dry runs.",
    )
    return parser
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def build_optimizer(model: torch.nn.Module, config: TrainConfig, device: torch.device):
    """Build AdamW with weight decay applied only to matrix-shaped parameters.

    Biases and 1-D tensors (e.g. norm scales) are placed in a zero-decay
    group, the usual convention for transformer training.
    """
    buckets: dict[str, list] = {"decay": [], "no_decay": []}
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            continue
        key = "no_decay" if parameter.ndim <= 1 or name.endswith("bias") else "decay"
        buckets[key].append(parameter)

    return torch.optim.AdamW(
        [
            {"params": buckets["decay"], "weight_decay": config.weight_decay},
            {"params": buckets["no_decay"], "weight_decay": 0.0},
        ],
        lr=config.learning_rate,
        betas=(config.beta1, config.beta2),
        # The fused AdamW kernel only exists on CUDA devices.
        fused=device.type == "cuda",
    )
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
@torch.no_grad()
def evaluate(
    model: SLLMForCausalLM,
    config: TrainConfig,
    device: torch.device,
) -> tuple[float, float]:
    """Run a short validation pass and return ``(mean_loss, perplexity)``.

    A fresh sequential eval dataset is built each call; at most
    ``config.eval_batches`` batches are consumed.  Perplexity is computed
    from the loss clamped at 20 to avoid float overflow.
    """
    eval_dataset = SequentialEvalDataset(
        data_dir=config.val_dir,
        split="val",
        seq_len=config.seq_len,
        # NOTE(review): max_batches is scaled by micro_batch_size here —
        # presumably it counts sequences, not batches; confirm upstream.
        max_batches=config.eval_batches * config.micro_batch_size,
    )
    eval_loader = DataLoader(eval_dataset, batch_size=config.micro_batch_size, num_workers=0)

    model.eval()
    batch_losses: list[float] = []
    for index, raw_batch in enumerate(eval_loader):
        if index >= config.eval_batches:
            break
        device_batch = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        with autocast_context(device, config.precision):
            batch_loss = model(**device_batch)["loss"]
        batch_losses.append(batch_loss.detach().float().item())
    model.train()

    mean_loss = float(sum(batch_losses) / max(1, len(batch_losses)))
    return mean_loss, math.exp(min(mean_loss, 20))
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def maybe_load_weights(
    model: SLLMForCausalLM,
    optimizer: torch.optim.Optimizer,
    config: TrainConfig,
    device: torch.device,
    logger,
) -> int:
    """Load initial or resume weights if configured; return the starting step.

    ``resume_from`` restores optimizer state and the saved step; ``init_from``
    loads model weights only and training restarts from step 0.  When neither
    is set, nothing is loaded.
    """
    checkpoint_path = config.resume_from or config.init_from
    if checkpoint_path is None:
        return 0

    payload = load_checkpoint(checkpoint_path, map_location=device)
    model.load_state_dict(payload["model"])

    # Only a true resume (with optimizer state present) carries the step over.
    if not (config.resume_from and payload.get("optimizer") is not None):
        logger.info("Loaded model weights | checkpoint=%s", checkpoint_path)
        return 0

    optimizer.load_state_dict(payload["optimizer"])
    resumed_step = int(payload.get("step", 0))
    logger.info("Resumed training | step=%s checkpoint=%s", resumed_step, checkpoint_path)
    return resumed_step
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def save_run_config(output_dir: Path, model_config: ModelConfig, train_config: TrainConfig) -> None:
    """Persist the effective model + training configuration next to the run outputs."""
    payload = {
        "model_config": model_config.to_dict(),
        "train_config": train_config.to_dict(),
    }
    save_json(output_dir / "run_config.json", payload)
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def main() -> None:
    """Entry point: configure, build, and run the pretraining loop.

    Reads the model/training JSON configs, sets up logging and JSONL metrics,
    streams token batches from the pretokenized shards, and periodically
    logs, evaluates, and checkpoints the model.
    """
    args = build_parser().parse_args()
    model_config = ModelConfig.from_dict(load_json(args.model_config))
    train_config = TrainConfig.from_dict(load_json(args.train_config))
    if args.max_steps is not None:
        train_config.max_steps = args.max_steps

    set_seed(train_config.seed)
    device = get_device()
    maybe_enable_tf32(device)
    # Downgrade the requested precision if the device cannot support it.
    runtime_precision, precision_warning = resolve_runtime_precision(device, train_config.precision)
    train_config.precision = runtime_precision

    output_dir = ensure_dir(train_config.output_dir)
    checkpoint_dir = ensure_dir(train_config.checkpoint_dir)
    logger, log_path = setup_logger("sllm.train_pretrain", output_dir, "train_pretrain")
    # Metrics JSONL lives next to the text log and shares its stem.
    metrics_path = Path(output_dir) / "logs" / f"{log_path.stem}.jsonl"
    logger.info("Pretraining started")
    logger.info("Log file: %s", log_path)
    logger.info("Metrics JSONL: %s", metrics_path)
    logger.info("Arguments | model_config=%s train_config=%s max_steps_override=%s", args.model_config, args.train_config, args.max_steps)
    if precision_warning is not None:
        logger.warning(precision_warning)
    logger.info("Model config | %s", model_config.to_dict())
    logger.info("Train config | %s", train_config.to_dict())
    append_jsonl(
        metrics_path,
        {
            "event": "run_started",
            "timestamp": iso_timestamp(),
            "log_path": str(log_path),
            "metrics_path": str(metrics_path),
            "model_config": model_config.to_dict(),
            "train_config": train_config.to_dict(),
            "args": {
                "model_config": args.model_config,
                "train_config": args.train_config,
                "max_steps_override": args.max_steps,
            },
        },
    )
    save_run_config(output_dir, model_config, train_config)

    dataset = RandomTokenDataset(
        data_dir=train_config.train_dir,
        split="train",
        seq_len=train_config.seq_len,
        seed=train_config.seed,
    )
    loader = DataLoader(
        dataset,
        batch_size=train_config.micro_batch_size,
        num_workers=train_config.num_workers,
        pin_memory=device.type == "cuda",
    )
    data_iter = iter(loader)

    model = SLLMForCausalLM(model_config).to(device)
    if train_config.compile_model and hasattr(torch, "compile"):
        model = torch.compile(model)  # type: ignore[assignment]

    optimizer = build_optimizer(model, train_config, device)
    # Loss scaling is only needed for fp16 on CUDA; bf16/fp32 run unscaled.
    scaler = torch.amp.GradScaler(
        "cuda",
        enabled=device.type == "cuda" and train_config.precision.lower() == "fp16",
    )
    start_step = maybe_load_weights(model, optimizer, train_config, device, logger)
    if start_step > 0:
        append_jsonl(
            metrics_path,
            {
                "event": "resumed",
                "timestamp": iso_timestamp(),
                "step": start_step,
                "checkpoint": train_config.resume_from,
            },
        )
    model.train()

    tokens_step = tokens_per_step(
        train_config.micro_batch_size,
        train_config.grad_accum_steps,
        train_config.seq_len,
    )

    logger.info("Device summary | device=%s precision=%s compile_model=%s", device, train_config.precision, train_config.compile_model)
    logger.info("Model summary | parameters=%s", format_number(model_parameter_count(model)))
    logger.info(
        "Batch summary | seq_len=%s micro_batch_size=%s grad_accum_steps=%s tokens_per_step=%s",
        train_config.seq_len,
        train_config.micro_batch_size,
        train_config.grad_accum_steps,
        f"{tokens_step:,}",
    )
    logger.info("Dataset summary | train_dir=%s val_dir=%s num_train_shards=%s", train_config.train_dir, train_config.val_dir, len(dataset.shards))
    append_jsonl(
        metrics_path,
        {
            "event": "runtime_summary",
            "timestamp": iso_timestamp(),
            "device": str(device),
            "precision": train_config.precision,
            "compile_model": train_config.compile_model,
            "parameters": model_parameter_count(model),
            "seq_len": train_config.seq_len,
            "micro_batch_size": train_config.micro_batch_size,
            "grad_accum_steps": train_config.grad_accum_steps,
            "tokens_per_step": tokens_step,
            "num_train_shards": len(dataset.shards),
            "train_dir": train_config.train_dir,
            "val_dir": train_config.val_dir,
        },
    )

    running_loss = 0.0
    log_start_time = time.perf_counter()
    last_grad_norm = float("nan")

    for step in range(start_step, train_config.max_steps):
        lr = cosine_lr(
            step=step,
            warmup_steps=train_config.warmup_steps,
            max_steps=train_config.max_steps,
            max_lr=train_config.learning_rate,
            min_lr=train_config.min_lr,
        )
        set_optimizer_lr(optimizer, lr)
        optimizer.zero_grad(set_to_none=True)

        step_loss = 0.0
        for _ in range(train_config.grad_accum_steps):
            try:
                batch = next(data_iter)
            except StopIteration:
                # FIX: recycle the loader when a finite dataset is exhausted so
                # a long run never crashes mid-training (matches train_sft.py).
                data_iter = iter(loader)
                batch = next(data_iter)
            batch = {key: value.to(device, non_blocking=device.type == "cuda") for key, value in batch.items()}

            with autocast_context(device, train_config.precision):
                # Divide per-micro-batch so gradients sum to the full-batch mean.
                loss = model(**batch)["loss"] / train_config.grad_accum_steps

            step_loss += loss.detach().float().item()
            if scaler.is_enabled():
                scaler.scale(loss).backward()
            else:
                loss.backward()

        if train_config.grad_clip is not None and train_config.grad_clip > 0:
            if scaler.is_enabled():
                # Gradients must be unscaled before clipping by true norm.
                scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), train_config.grad_clip)
            last_grad_norm = float(grad_norm)

        if scaler.is_enabled():
            scaler.step(optimizer)
            scaler.update()
        else:
            optimizer.step()

        running_loss += step_loss

        if (step + 1) % train_config.log_interval == 0:
            elapsed = time.perf_counter() - log_start_time
            avg_loss = running_loss / train_config.log_interval
            tok_per_sec = (tokens_step * train_config.log_interval) / max(elapsed, 1e-6)
            memory = cuda_memory_snapshot(device)
            memory_suffix = ""
            if memory:
                memory_suffix = (
                    f" mem_alloc_gb={memory['allocated_gb']:.2f}"
                    f" mem_reserved_gb={memory['reserved_gb']:.2f}"
                    f" max_mem_alloc_gb={memory['max_allocated_gb']:.2f}"
                    f" max_mem_reserved_gb={memory['max_reserved_gb']:.2f}"
                )
            logger.info(
                "Train step | step=%s loss=%.4f lr=%.6f tok_per_sec=%s grad_norm=%.4f tokens_seen=%s%s",
                step + 1,
                avg_loss,
                lr,
                f"{tok_per_sec:,.0f}",
                last_grad_norm,
                format_number((step + 1) * tokens_step),
                memory_suffix,
            )
            append_jsonl(
                metrics_path,
                {
                    "event": "train",
                    "timestamp": iso_timestamp(),
                    "step": step + 1,
                    "loss": avg_loss,
                    "lr": lr,
                    "tok_per_sec": tok_per_sec,
                    "grad_norm": last_grad_norm,
                    "tokens_seen": (step + 1) * tokens_step,
                    "elapsed_sec": elapsed,
                    "seq_len": train_config.seq_len,
                    "micro_batch_size": train_config.micro_batch_size,
                    "grad_accum_steps": train_config.grad_accum_steps,
                    **memory,
                },
            )
            running_loss = 0.0
            log_start_time = time.perf_counter()

        if (step + 1) % train_config.eval_interval == 0:
            val_loss, perplexity = evaluate(model, train_config, device)
            logger.info("Eval step | step=%s val_loss=%.4f perplexity=%.2f", step + 1, val_loss, perplexity)
            append_jsonl(
                metrics_path,
                {
                    "event": "eval",
                    "timestamp": iso_timestamp(),
                    "step": step + 1,
                    "val_loss": val_loss,
                    "perplexity": perplexity,
                    "eval_batches": train_config.eval_batches,
                },
            )

        if (step + 1) % train_config.save_interval == 0 or (step + 1) == train_config.max_steps:
            step_checkpoint_path = checkpoint_dir / f"step_{step + 1:07d}.pt"
            last_checkpoint_path = checkpoint_dir / "last.pt"
            # Identical payload to both the numbered and the rolling checkpoint.
            for target_path in (step_checkpoint_path, last_checkpoint_path):
                save_checkpoint(
                    target_path,
                    model=model,
                    optimizer=optimizer,
                    step=step + 1,
                    model_config=model_config.to_dict(),
                    train_config=train_config.to_dict(),
                    extra_state={"tokens_seen": (step + 1) * tokens_step},
                )
            logger.info(
                "Checkpoint saved | step=%s step_checkpoint=%s last_checkpoint=%s",
                step + 1,
                step_checkpoint_path,
                last_checkpoint_path,
            )
            append_jsonl(
                metrics_path,
                {
                    "event": "checkpoint",
                    "timestamp": iso_timestamp(),
                    "step": step + 1,
                    "step_checkpoint": str(step_checkpoint_path),
                    "last_checkpoint": str(last_checkpoint_path),
                    "tokens_seen": (step + 1) * tokens_step,
                },
            )

    append_jsonl(
        metrics_path,
        {
            "event": "run_finished",
            "timestamp": iso_timestamp(),
            "final_step": train_config.max_steps,
            "tokens_seen": train_config.max_steps * tokens_step,
        },
    )
|
| 402 |
+
|
| 403 |
+
|
| 404 |
+
# Script entry point: run pretraining when executed directly.
if __name__ == "__main__":
    main()
|
scripts/train_sft.py
ADDED
|
@@ -0,0 +1,394 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import math
|
| 5 |
+
import sys
|
| 6 |
+
import time
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
from torch.utils.data import DataLoader
|
| 11 |
+
|
| 12 |
+
ROOT = Path(__file__).resolve().parents[1]
|
| 13 |
+
sys.path.append(str(ROOT / "src"))
|
| 14 |
+
|
| 15 |
+
from sllm.checkpoint import load_checkpoint, save_checkpoint
|
| 16 |
+
from sllm.config import ModelConfig, SFTConfig, load_json, save_json
|
| 17 |
+
from sllm.data import FixedSFTDataset
|
| 18 |
+
from sllm.model import SLLMForCausalLM
|
| 19 |
+
from sllm.utils import (
|
| 20 |
+
append_jsonl,
|
| 21 |
+
autocast_context,
|
| 22 |
+
cosine_lr,
|
| 23 |
+
cuda_memory_snapshot,
|
| 24 |
+
ensure_dir,
|
| 25 |
+
format_number,
|
| 26 |
+
get_device,
|
| 27 |
+
iso_timestamp,
|
| 28 |
+
maybe_enable_tf32,
|
| 29 |
+
model_parameter_count,
|
| 30 |
+
resolve_runtime_precision,
|
| 31 |
+
set_optimizer_lr,
|
| 32 |
+
set_seed,
|
| 33 |
+
setup_logger,
|
| 34 |
+
timestamp,
|
| 35 |
+
tokens_per_step,
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the SFT entry point."""
    parser = argparse.ArgumentParser(description="Run supervised fine-tuning for the sLLM.")
    parser.add_argument("--model-config", help="Path to model JSON config.", required=True)
    parser.add_argument("--train-config", help="Path to SFT JSON config.", required=True)
    parser.add_argument("--max-steps", help="Optional debug override.", type=int, default=None)
    return parser
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def build_optimizer(model: torch.nn.Module, config: SFTConfig, device: torch.device):
    """Build AdamW with decay on weight matrices only; biases/1-D params are exempt."""
    trainable = [(name, p) for name, p in model.named_parameters() if p.requires_grad]
    no_decay = [p for name, p in trainable if p.ndim <= 1 or name.endswith("bias")]
    decay = [p for name, p in trainable if not (p.ndim <= 1 or name.endswith("bias"))]
    param_groups = [
        {"params": decay, "weight_decay": config.weight_decay},
        {"params": no_decay, "weight_decay": 0.0},
    ]
    return torch.optim.AdamW(
        param_groups,
        lr=config.learning_rate,
        betas=(config.beta1, config.beta2),
        # Fused kernel is CUDA-only.
        fused=device.type == "cuda",
    )
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
@torch.no_grad()
def evaluate(model: SLLMForCausalLM, loader: DataLoader, device: torch.device, precision: str, max_batches: int):
    """Average the loss over up to ``max_batches`` batches; return ``(loss, perplexity)``.

    Perplexity is derived from the loss clamped at 20 to avoid float overflow.
    The model is returned to train mode before this function exits.
    """
    model.eval()
    total_loss = 0.0
    seen = 0
    for index, raw_batch in enumerate(loader):
        if index >= max_batches:
            break
        device_batch = {key: tensor.to(device) for key, tensor in raw_batch.items()}
        with autocast_context(device, precision):
            batch_loss = model(**device_batch)["loss"]
        total_loss += batch_loss.detach().float().item()
        seen += 1
    model.train()
    mean_loss = float(total_loss / max(1, seen))
    return mean_loss, math.exp(min(mean_loss, 20))
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def save_run_config(output_dir: Path, model_config: ModelConfig, train_config: SFTConfig) -> None:
    """Write the resolved model + SFT configuration alongside the run outputs."""
    run_config = {
        "model_config": model_config.to_dict(),
        "train_config": train_config.to_dict(),
    }
    save_json(output_dir / "run_config.json", run_config)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def main() -> None:
|
| 95 |
+
args = build_parser().parse_args()
|
| 96 |
+
model_config = ModelConfig.from_dict(load_json(args.model_config))
|
| 97 |
+
train_config = SFTConfig.from_dict(load_json(args.train_config))
|
| 98 |
+
if args.max_steps is not None:
|
| 99 |
+
train_config.max_steps = args.max_steps
|
| 100 |
+
|
| 101 |
+
set_seed(train_config.seed)
|
| 102 |
+
device = get_device()
|
| 103 |
+
maybe_enable_tf32(device)
|
| 104 |
+
runtime_precision, precision_warning = resolve_runtime_precision(device, train_config.precision)
|
| 105 |
+
train_config.precision = runtime_precision
|
| 106 |
+
|
| 107 |
+
output_dir = ensure_dir(train_config.output_dir)
|
| 108 |
+
checkpoint_dir = ensure_dir(train_config.checkpoint_dir)
|
| 109 |
+
logger, log_path = setup_logger("sllm.train_sft", output_dir, "train_sft")
|
| 110 |
+
metrics_path = Path(output_dir) / "logs" / f"{log_path.stem}.jsonl"
|
| 111 |
+
logger.info("SFT training started")
|
| 112 |
+
logger.info("Log file: %s", log_path)
|
| 113 |
+
logger.info("Metrics JSONL: %s", metrics_path)
|
| 114 |
+
logger.info("Arguments | model_config=%s train_config=%s max_steps_override=%s", args.model_config, args.train_config, args.max_steps)
|
| 115 |
+
if precision_warning is not None:
|
| 116 |
+
logger.warning(precision_warning)
|
| 117 |
+
logger.info("Model config | %s", model_config.to_dict())
|
| 118 |
+
logger.info("SFT config | %s", train_config.to_dict())
|
| 119 |
+
append_jsonl(
|
| 120 |
+
metrics_path,
|
| 121 |
+
{
|
| 122 |
+
"event": "run_started",
|
| 123 |
+
"timestamp": iso_timestamp(),
|
| 124 |
+
"log_path": str(log_path),
|
| 125 |
+
"metrics_path": str(metrics_path),
|
| 126 |
+
"model_config": model_config.to_dict(),
|
| 127 |
+
"train_config": train_config.to_dict(),
|
| 128 |
+
"args": {
|
| 129 |
+
"model_config": args.model_config,
|
| 130 |
+
"train_config": args.train_config,
|
| 131 |
+
"max_steps_override": args.max_steps,
|
| 132 |
+
},
|
| 133 |
+
},
|
| 134 |
+
)
|
| 135 |
+
save_run_config(output_dir, model_config, train_config)
|
| 136 |
+
|
| 137 |
+
train_dataset = FixedSFTDataset(train_config.dataset_path, split="train")
|
| 138 |
+
val_dataset = FixedSFTDataset(train_config.dataset_path, split="val")
|
| 139 |
+
train_loader = DataLoader(
|
| 140 |
+
train_dataset,
|
| 141 |
+
batch_size=train_config.micro_batch_size,
|
| 142 |
+
shuffle=True,
|
| 143 |
+
num_workers=train_config.num_workers,
|
| 144 |
+
pin_memory=device.type == "cuda",
|
| 145 |
+
)
|
| 146 |
+
val_loader = DataLoader(
|
| 147 |
+
val_dataset,
|
| 148 |
+
batch_size=train_config.micro_batch_size,
|
| 149 |
+
shuffle=False,
|
| 150 |
+
num_workers=0,
|
| 151 |
+
pin_memory=device.type == "cuda",
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
+
model = SLLMForCausalLM(model_config).to(device)
|
| 155 |
+
if train_config.compile_model and hasattr(torch, "compile"):
|
| 156 |
+
model = torch.compile(model) # type: ignore[assignment]
|
| 157 |
+
|
| 158 |
+
optimizer = build_optimizer(model, train_config, device)
|
| 159 |
+
scaler = torch.amp.GradScaler(
|
| 160 |
+
"cuda",
|
| 161 |
+
enabled=device.type == "cuda" and train_config.precision.lower() == "fp16",
|
| 162 |
+
)
|
| 163 |
+
|
| 164 |
+
start_step = 0
|
| 165 |
+
checkpoint_path = train_config.resume_from or train_config.init_from
|
| 166 |
+
if checkpoint_path:
|
| 167 |
+
payload = load_checkpoint(checkpoint_path, map_location=device)
|
| 168 |
+
model.load_state_dict(payload["model"])
|
| 169 |
+
if train_config.resume_from and payload.get("optimizer") is not None:
|
| 170 |
+
optimizer.load_state_dict(payload["optimizer"])
|
| 171 |
+
start_step = int(payload.get("step", 0))
|
| 172 |
+
logger.info("Resumed SFT | step=%s checkpoint=%s", start_step, checkpoint_path)
|
| 173 |
+
append_jsonl(
|
| 174 |
+
metrics_path,
|
| 175 |
+
{
|
| 176 |
+
"event": "resumed",
|
| 177 |
+
"timestamp": iso_timestamp(),
|
| 178 |
+
"step": start_step,
|
| 179 |
+
"checkpoint": checkpoint_path,
|
| 180 |
+
},
|
| 181 |
+
)
|
| 182 |
+
else:
|
| 183 |
+
logger.info("Loaded initialization weights | checkpoint=%s", checkpoint_path)
|
| 184 |
+
append_jsonl(
|
| 185 |
+
metrics_path,
|
| 186 |
+
{
|
| 187 |
+
"event": "initialized_from_checkpoint",
|
| 188 |
+
"timestamp": iso_timestamp(),
|
| 189 |
+
"checkpoint": checkpoint_path,
|
| 190 |
+
},
|
| 191 |
+
)
|
| 192 |
+
|
| 193 |
+
model.train()
|
| 194 |
+
tokens_step = tokens_per_step(
|
| 195 |
+
train_config.micro_batch_size,
|
| 196 |
+
train_config.grad_accum_steps,
|
| 197 |
+
train_config.seq_len,
|
| 198 |
+
)
|
| 199 |
+
logger.info("Device summary | device=%s precision=%s compile_model=%s", device, train_config.precision, train_config.compile_model)
|
| 200 |
+
logger.info("Model summary | parameters=%s", format_number(model_parameter_count(model)))
|
| 201 |
+
logger.info(
|
| 202 |
+
"Batch summary | seq_len=%s micro_batch_size=%s grad_accum_steps=%s tokens_per_step=%s",
|
| 203 |
+
train_config.seq_len,
|
| 204 |
+
train_config.micro_batch_size,
|
| 205 |
+
train_config.grad_accum_steps,
|
| 206 |
+
f"{tokens_step:,}",
|
| 207 |
+
)
|
| 208 |
+
logger.info(
|
| 209 |
+
"Dataset summary | dataset_path=%s train_examples=%s val_examples=%s",
|
| 210 |
+
train_config.dataset_path,
|
| 211 |
+
len(train_dataset),
|
| 212 |
+
len(val_dataset),
|
| 213 |
+
)
|
| 214 |
+
append_jsonl(
|
| 215 |
+
metrics_path,
|
| 216 |
+
{
|
| 217 |
+
"event": "runtime_summary",
|
| 218 |
+
"timestamp": iso_timestamp(),
|
| 219 |
+
"device": str(device),
|
| 220 |
+
"precision": train_config.precision,
|
| 221 |
+
"compile_model": train_config.compile_model,
|
| 222 |
+
"parameters": model_parameter_count(model),
|
| 223 |
+
"seq_len": train_config.seq_len,
|
| 224 |
+
"micro_batch_size": train_config.micro_batch_size,
|
| 225 |
+
"grad_accum_steps": train_config.grad_accum_steps,
|
| 226 |
+
"tokens_per_step": tokens_step,
|
| 227 |
+
"dataset_path": train_config.dataset_path,
|
| 228 |
+
"train_examples": len(train_dataset),
|
| 229 |
+
"val_examples": len(val_dataset),
|
| 230 |
+
},
|
| 231 |
+
)
|
| 232 |
+
running_loss = 0.0
|
| 233 |
+
log_start_time = time.perf_counter()
|
| 234 |
+
train_iterator = iter(train_loader)
|
| 235 |
+
last_grad_norm = float("nan")
|
| 236 |
+
|
| 237 |
+
for step in range(start_step, train_config.max_steps):
|
| 238 |
+
lr = cosine_lr(
|
| 239 |
+
step=step,
|
| 240 |
+
warmup_steps=train_config.warmup_steps,
|
| 241 |
+
max_steps=train_config.max_steps,
|
| 242 |
+
max_lr=train_config.learning_rate,
|
| 243 |
+
min_lr=train_config.min_lr,
|
| 244 |
+
)
|
| 245 |
+
set_optimizer_lr(optimizer, lr)
|
| 246 |
+
optimizer.zero_grad(set_to_none=True)
|
| 247 |
+
|
| 248 |
+
step_loss = 0.0
|
| 249 |
+
for _ in range(train_config.grad_accum_steps):
|
| 250 |
+
try:
|
| 251 |
+
batch = next(train_iterator)
|
| 252 |
+
except StopIteration:
|
| 253 |
+
train_iterator = iter(train_loader)
|
| 254 |
+
batch = next(train_iterator)
|
| 255 |
+
|
| 256 |
+
batch = {key: value.to(device, non_blocking=device.type == "cuda") for key, value in batch.items()}
|
| 257 |
+
with autocast_context(device, train_config.precision):
|
| 258 |
+
loss = model(**batch)["loss"] / train_config.grad_accum_steps
|
| 259 |
+
step_loss += loss.detach().float().item()
|
| 260 |
+
if scaler.is_enabled():
|
| 261 |
+
scaler.scale(loss).backward()
|
| 262 |
+
else:
|
| 263 |
+
loss.backward()
|
| 264 |
+
|
| 265 |
+
if train_config.grad_clip and train_config.grad_clip > 0:
|
| 266 |
+
if scaler.is_enabled():
|
| 267 |
+
scaler.unscale_(optimizer)
|
| 268 |
+
grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), train_config.grad_clip)
|
| 269 |
+
last_grad_norm = float(grad_norm)
|
| 270 |
+
|
| 271 |
+
if scaler.is_enabled():
|
| 272 |
+
scaler.step(optimizer)
|
| 273 |
+
scaler.update()
|
| 274 |
+
else:
|
| 275 |
+
optimizer.step()
|
| 276 |
+
|
| 277 |
+
running_loss += step_loss
|
| 278 |
+
|
| 279 |
+
if (step + 1) % train_config.log_interval == 0:
|
| 280 |
+
elapsed = time.perf_counter() - log_start_time
|
| 281 |
+
avg_loss = running_loss / train_config.log_interval
|
| 282 |
+
tok_per_sec = (tokens_step * train_config.log_interval) / max(elapsed, 1e-6)
|
| 283 |
+
memory = cuda_memory_snapshot(device)
|
| 284 |
+
memory_suffix = ""
|
| 285 |
+
if memory:
|
| 286 |
+
memory_suffix = (
|
| 287 |
+
f" mem_alloc_gb={memory['allocated_gb']:.2f}"
|
| 288 |
+
f" mem_reserved_gb={memory['reserved_gb']:.2f}"
|
| 289 |
+
f" max_mem_alloc_gb={memory['max_allocated_gb']:.2f}"
|
| 290 |
+
f" max_mem_reserved_gb={memory['max_reserved_gb']:.2f}"
|
| 291 |
+
)
|
| 292 |
+
logger.info(
|
| 293 |
+
"Train step | step=%s loss=%.4f lr=%.6f tok_per_sec=%s grad_norm=%.4f%s",
|
| 294 |
+
step + 1,
|
| 295 |
+
avg_loss,
|
| 296 |
+
lr,
|
| 297 |
+
f"{tok_per_sec:,.0f}",
|
| 298 |
+
last_grad_norm,
|
| 299 |
+
memory_suffix,
|
| 300 |
+
)
|
| 301 |
+
append_jsonl(
|
| 302 |
+
metrics_path,
|
| 303 |
+
{
|
| 304 |
+
"event": "train",
|
| 305 |
+
"timestamp": iso_timestamp(),
|
| 306 |
+
"step": step + 1,
|
| 307 |
+
"loss": avg_loss,
|
| 308 |
+
"lr": lr,
|
| 309 |
+
"tok_per_sec": tok_per_sec,
|
| 310 |
+
"grad_norm": last_grad_norm,
|
| 311 |
+
"tokens_seen": (step + 1) * tokens_step,
|
| 312 |
+
"elapsed_sec": elapsed,
|
| 313 |
+
"seq_len": train_config.seq_len,
|
| 314 |
+
"micro_batch_size": train_config.micro_batch_size,
|
| 315 |
+
"grad_accum_steps": train_config.grad_accum_steps,
|
| 316 |
+
**memory,
|
| 317 |
+
},
|
| 318 |
+
)
|
| 319 |
+
running_loss = 0.0
|
| 320 |
+
log_start_time = time.perf_counter()
|
| 321 |
+
|
| 322 |
+
if (step + 1) % train_config.eval_interval == 0:
|
| 323 |
+
val_loss, val_ppl = evaluate(
|
| 324 |
+
model=model,
|
| 325 |
+
loader=val_loader,
|
| 326 |
+
device=device,
|
| 327 |
+
precision=train_config.precision,
|
| 328 |
+
max_batches=train_config.eval_batches,
|
| 329 |
+
)
|
| 330 |
+
logger.info("Eval step | step=%s val_loss=%.4f perplexity=%.2f", step + 1, val_loss, val_ppl)
|
| 331 |
+
append_jsonl(
|
| 332 |
+
metrics_path,
|
| 333 |
+
{
|
| 334 |
+
"event": "eval",
|
| 335 |
+
"timestamp": iso_timestamp(),
|
| 336 |
+
"step": step + 1,
|
| 337 |
+
"val_loss": val_loss,
|
| 338 |
+
"perplexity": val_ppl,
|
| 339 |
+
"eval_batches": train_config.eval_batches,
|
| 340 |
+
},
|
| 341 |
+
)
|
| 342 |
+
|
| 343 |
+
if (step + 1) % train_config.save_interval == 0 or (step + 1) == train_config.max_steps:
|
| 344 |
+
step_checkpoint_path = checkpoint_dir / f"step_{step + 1:07d}.pt"
|
| 345 |
+
last_checkpoint_path = checkpoint_dir / "last.pt"
|
| 346 |
+
save_checkpoint(
|
| 347 |
+
step_checkpoint_path,
|
| 348 |
+
model=model,
|
| 349 |
+
optimizer=optimizer,
|
| 350 |
+
step=step + 1,
|
| 351 |
+
model_config=model_config.to_dict(),
|
| 352 |
+
train_config=train_config.to_dict(),
|
| 353 |
+
extra_state={"tokens_seen": (step + 1) * tokens_step},
|
| 354 |
+
)
|
| 355 |
+
save_checkpoint(
|
| 356 |
+
last_checkpoint_path,
|
| 357 |
+
model=model,
|
| 358 |
+
optimizer=optimizer,
|
| 359 |
+
step=step + 1,
|
| 360 |
+
model_config=model_config.to_dict(),
|
| 361 |
+
train_config=train_config.to_dict(),
|
| 362 |
+
extra_state={"tokens_seen": (step + 1) * tokens_step},
|
| 363 |
+
)
|
| 364 |
+
logger.info(
|
| 365 |
+
"Checkpoint saved | step=%s step_checkpoint=%s last_checkpoint=%s",
|
| 366 |
+
step + 1,
|
| 367 |
+
step_checkpoint_path,
|
| 368 |
+
last_checkpoint_path,
|
| 369 |
+
)
|
| 370 |
+
append_jsonl(
|
| 371 |
+
metrics_path,
|
| 372 |
+
{
|
| 373 |
+
"event": "checkpoint",
|
| 374 |
+
"timestamp": iso_timestamp(),
|
| 375 |
+
"step": step + 1,
|
| 376 |
+
"step_checkpoint": str(step_checkpoint_path),
|
| 377 |
+
"last_checkpoint": str(last_checkpoint_path),
|
| 378 |
+
"tokens_seen": (step + 1) * tokens_step,
|
| 379 |
+
},
|
| 380 |
+
)
|
| 381 |
+
|
| 382 |
+
append_jsonl(
|
| 383 |
+
metrics_path,
|
| 384 |
+
{
|
| 385 |
+
"event": "run_finished",
|
| 386 |
+
"timestamp": iso_timestamp(),
|
| 387 |
+
"final_step": train_config.max_steps,
|
| 388 |
+
"tokens_seen": train_config.max_steps * tokens_step,
|
| 389 |
+
},
|
| 390 |
+
)
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
if __name__ == "__main__":
|
| 394 |
+
main()
|
scripts/train_tokenizer.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import sys
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Iterator
|
| 8 |
+
|
| 9 |
+
from datasets import load_dataset
|
| 10 |
+
from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors, trainers
|
| 11 |
+
|
| 12 |
+
ROOT = Path(__file__).resolve().parents[1]
|
| 13 |
+
sys.path.append(str(ROOT / "src"))
|
| 14 |
+
|
| 15 |
+
from sllm.config import DataMixConfig, load_json, save_json
|
| 16 |
+
from sllm.utils import setup_logger
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def build_parser() -> argparse.ArgumentParser:
    """Construct the CLI parser for tokenizer training.

    Exposes the data-mixture config path, the output directory, the target
    vocabulary size, and the seed used for dataset shuffling.
    """
    parser = argparse.ArgumentParser(description="Train a BPE tokenizer for the sLLM pipeline.")
    # Declarative table of (flag, options) keeps the option set easy to scan.
    argument_specs = (
        ("--data-config", {"required": True, "help": "Path to data mixture JSON config."}),
        ("--output-dir", {"required": True, "help": "Directory where tokenizer files will be stored."}),
        ("--vocab-size", {"type": int, "default": 49_152, "help": "Target tokenizer vocabulary size."}),
        ("--seed", {"type": int, "default": 42, "help": "Random seed for dataset shuffling."}),
    )
    for flag, options in argument_specs:
        parser.add_argument(flag, **options)
    return parser
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def iter_source_texts(source, seed: int, limit: int) -> Iterator[str]:
    """Yield up to *limit* non-empty, stripped text documents from one source.

    Rows whose text field is missing, non-string, or whitespace-only are
    skipped and do not count toward the limit. Streaming datasets are
    shuffled with the source's buffer size before iteration.
    """
    dataset = load_dataset(
        path=source.path,
        name=source.config_name,
        data_dir=source.data_dir,
        split=source.split,
        revision=source.revision,
        streaming=source.streaming,
    )
    if source.streaming:
        # Only streaming datasets are shuffled here; map-style splits are
        # consumed in their on-disk order.
        dataset = dataset.shuffle(seed=seed, buffer_size=source.shuffle_buffer)

    field_name = source.text_field or ""
    emitted = 0
    for record in dataset:
        raw = record.get(field_name, None)
        if not isinstance(raw, str):
            continue
        stripped = raw.strip()
        if not stripped:
            continue
        yield stripped
        emitted += 1
        if emitted >= limit:
            return
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def mixed_iterator(config: DataMixConfig, seed: int, logger) -> Iterator[str]:
    """Yield tokenizer-training documents from every configured source in order.

    Each source contributes roughly ``weight * tokenizer_sample_documents``
    documents (at least one), unless it declares an explicit
    ``sample_documents`` override. Sources are offset-seeded so their
    shuffles differ.
    """
    weights = config.normalized_weights()
    budget = config.tokenizer_sample_documents
    # Per-source default document quota derived from the normalized mix weights.
    default_quota = {
        src.name: max(1, int(budget * weights[src.name]))
        for src in config.sources
    }

    for offset, src in enumerate(config.sources):
        doc_limit = src.sample_documents or default_quota[src.name]
        logger.info(
            "Tokenizer source start | name=%s path=%s data_dir=%s split=%s text_field=%s limit_docs=%s streaming=%s",
            src.name,
            src.path,
            src.data_dir,
            src.split,
            src.text_field,
            f"{doc_limit:,}",
            src.streaming,
        )
        yield from iter_source_texts(src, seed + offset, doc_limit)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def main() -> None:
    """Train a byte-level BPE tokenizer from the configured data mix and save it.

    Pipeline: parse CLI args -> load the data-mix config -> stream sample
    documents through a BPE trainer -> verify special tokens -> attach a
    bos/eos post-processor -> write tokenizer.json plus two metadata files
    into the output directory.
    """
    args = build_parser().parse_args()
    data_config = DataMixConfig.from_dict(load_json(args.data_config))
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    logger, log_path = setup_logger("sllm.train_tokenizer", output_dir, "train_tokenizer")
    logger.info("Tokenizer training started")
    logger.info("Log file: %s", log_path)
    logger.info("Arguments | data_config=%s output_dir=%s vocab_size=%s seed=%s", args.data_config, args.output_dir, args.vocab_size, args.seed)
    logger.info("Tokenizer config | sample_documents=%s min_frequency=%s special_tokens=%s num_sources=%s", f"{data_config.tokenizer_sample_documents:,}", data_config.tokenizer_min_frequency, data_config.tokenizer_special_tokens, len(data_config.sources))

    # Byte-level BPE: ByteLevel pre-tokenizer/decoder pair makes the
    # tokenizer lossless over arbitrary UTF-8 input.
    tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()
    trainer = trainers.BpeTrainer(
        vocab_size=args.vocab_size,
        min_frequency=data_config.tokenizer_min_frequency,
        special_tokens=data_config.tokenizer_special_tokens,
        show_progress=True,
    )
    # Streams documents from all sources; nothing is materialized in memory.
    tokenizer.train_from_iterator(mixed_iterator(data_config, args.seed, logger), trainer=trainer)

    # Special-token ids only exist after training; fail fast if the config's
    # special-token list did not include them.
    bos_id = tokenizer.token_to_id("<bos>")
    eos_id = tokenizer.token_to_id("<eos>")
    pad_id = tokenizer.token_to_id("<pad>")
    if bos_id is None or eos_id is None or pad_id is None:
        raise RuntimeError("Tokenizer special tokens were not created correctly.")

    # Wrap every encoded sequence in <bos> ... <eos> (pairs get a second
    # segment terminated by its own <eos>).
    tokenizer.post_processor = processors.TemplateProcessing(
        single="<bos> $A <eos>",
        pair="<bos> $A <eos> $B:1 <eos>:1",
        special_tokens=[
            ("<bos>", bos_id),
            ("<eos>", eos_id),
        ],
    )

    tokenizer_path = output_dir / "tokenizer.json"
    tokenizer.save(str(tokenizer_path))

    metadata = {
        "vocab_size": tokenizer.get_vocab_size(),
        "special_tokens": {
            "pad_token": "<pad>",
            "bos_token": "<bos>",
            "eos_token": "<eos>",
            "unk_token": "<unk>",
            "pad_token_id": pad_id,
            "bos_token_id": bos_id,
            "eos_token_id": eos_id,
            "unk_token_id": tokenizer.token_to_id("<unk>"),
        },
        "data_config": data_config.to_dict(),
    }
    # The same metadata is written twice: tokenizer_meta.json via the project
    # helper and tokenizer_summary.json via plain json.dump — presumably one
    # is for tooling and one for human inspection; confirm before removing.
    save_json(output_dir / "tokenizer_meta.json", metadata)

    with (output_dir / "tokenizer_summary.json").open("w", encoding="utf-8") as handle:
        json.dump(metadata, handle, ensure_ascii=False, indent=2)

    logger.info("Tokenizer saved | path=%s", tokenizer_path)
    logger.info(
        "Tokenizer summary | vocab_size=%s pad_id=%s bos_id=%s eos_id=%s unk_id=%s",
        tokenizer.get_vocab_size(),
        pad_id,
        bos_id,
        eos_id,
        tokenizer.token_to_id("<unk>"),
    )
    logger.info("Tokenizer metadata saved | path=%s", output_dir / "tokenizer_meta.json")
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
# Script entry point: run tokenizer training when executed directly.
if __name__ == "__main__":
    main()
|
src/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|