Lanni-ni commited on
Commit
efb516c
·
verified ·
1 Parent(s): c6912d2

add remote code + model files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. checkpoints/step-000000209715200.pt +2 -2
  2. checkpoints/step-000000419430400.pt +2 -2
  3. checkpoints/step-000000629145600.pt +2 -2
  4. checkpoints/step-000000838860800.pt +2 -2
  5. checkpoints/step-000001048576000.pt +2 -2
  6. checkpoints/step-000001258291200.pt +2 -2
  7. checkpoints/step-000001468006400.pt +2 -2
  8. checkpoints/step-000001677721600.pt +2 -2
  9. checkpoints/step-000001887436800.pt +2 -2
  10. decay_params.txt +13 -13
  11. logs/2025-10-28_01-48-25.log +262 -0
  12. metrics/jsonlines/checkpoint.jsonl +9 -9
  13. metrics/jsonlines/norm.jsonl +0 -0
  14. metrics/jsonlines/throughput.jsonl +0 -0
  15. metrics/jsonlines/train.jsonl +98 -98
  16. metrics/jsonlines/train_eval.jsonl +19 -19
  17. metrics/jsonlines/val.jsonl +49 -49
  18. metrics/npz/train_eval/step-000000104857600.npz +1 -1
  19. metrics/npz/train_eval/step-000000209715200.npz +1 -1
  20. metrics/npz/train_eval/step-000000314572800.npz +1 -1
  21. metrics/npz/train_eval/step-000000419430400.npz +1 -1
  22. metrics/npz/train_eval/step-000000524288000.npz +1 -1
  23. metrics/npz/train_eval/step-000000629145600.npz +1 -1
  24. metrics/npz/train_eval/step-000000734003200.npz +1 -1
  25. metrics/npz/train_eval/step-000000838860800.npz +1 -1
  26. metrics/npz/train_eval/step-000000943718400.npz +1 -1
  27. metrics/npz/train_eval/step-000001048576000.npz +1 -1
  28. metrics/npz/train_eval/step-000001153433600.npz +1 -1
  29. metrics/npz/train_eval/step-000001258291200.npz +1 -1
  30. metrics/npz/train_eval/step-000001363148800.npz +1 -1
  31. metrics/npz/train_eval/step-000001468006400.npz +1 -1
  32. metrics/npz/train_eval/step-000001572864000.npz +1 -1
  33. metrics/npz/train_eval/step-000001677721600.npz +1 -1
  34. metrics/npz/train_eval/step-000001782579200.npz +1 -1
  35. metrics/npz/train_eval/step-000001887436800.npz +1 -1
  36. metrics/npz/train_eval/step-000001992294400.npz +1 -1
  37. metrics/npz/val/step-000000041943040.npz +1 -1
  38. metrics/npz/val/step-000000083886080.npz +1 -1
  39. metrics/npz/val/step-000000125829120.npz +1 -1
  40. metrics/npz/val/step-000000167772160.npz +1 -1
  41. metrics/npz/val/step-000000209715200.npz +1 -1
  42. metrics/npz/val/step-000000251658240.npz +1 -1
  43. metrics/npz/val/step-000000293601280.npz +1 -1
  44. metrics/npz/val/step-000000335544320.npz +1 -1
  45. metrics/npz/val/step-000000377487360.npz +1 -1
  46. metrics/npz/val/step-000000419430400.npz +1 -1
  47. metrics/npz/val/step-000000461373440.npz +1 -1
  48. metrics/npz/val/step-000000503316480.npz +1 -1
  49. metrics/npz/val/step-000000545259520.npz +1 -1
  50. metrics/npz/val/step-000000587202560.npz +1 -1
checkpoints/step-000000209715200.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b3dc0d8bd5520cae4d864552463a2886fa07b73cdc47d2a37ce83bb212dc323
3
- size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8285b1275f636f612d151dec02c5d81f4b8c1db325da20ed27c3c0906cdcfd0b
3
+ size 329409666
checkpoints/step-000000419430400.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:197c3ce85adaa7c75aca1bc28dba6de4e903f1932840a3477ccbfe5081dd4411
3
- size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d1d5a3eb419481d7280395e792e83aec56b0f40085ed2e8c52bd911b3fb3be0
3
+ size 329409666
checkpoints/step-000000629145600.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ffa926c857f4187d24fb8c1476f007f3115e50ef587c9e0addd80d3a5cde704
3
- size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:135e47df1a60a84d73797ebba91af1b93cc5a876edb699f61fdaebf4c735a1da
3
+ size 329409666
checkpoints/step-000000838860800.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6bbf8089487ef6155cd623dc477cbcb09d8e14350f5dd57ac8188890a9c1cc47
3
- size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:849a49cdb5bf1ef2b9da3c7ed39cb6ed724793a95f400f524d37fd50a631e3a6
3
+ size 329409666
checkpoints/step-000001048576000.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:08caa24342ad8da2da4ecb5be835df6926f912ed46c8a3348ac0c43af76063a8
3
- size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72ada714c05b103600ae9b70aba26b160d5eb815b6399ce0c5bce3f3cdc8cd63
3
+ size 329409666
checkpoints/step-000001258291200.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dd1fd780b449e6a9a027a41fe8410eaf18503d300ed7b88ad0746df2a0301c82
3
- size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df68891154b50e797f1ac781f9ccba307e3516009e0b4fe66b67ffb0e7811cee
3
+ size 329409666
checkpoints/step-000001468006400.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:33e749571b391d13f25672d73a33515e15822b76b028f65d3e0edd8942e603bb
3
- size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8166d4070b53b6d58ce1ff21a80c08f4730d11eaeb1274a1edc9502ffe1e585
3
+ size 329409666
checkpoints/step-000001677721600.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:476798da159b1371c534ffcd6919778429ba1178a2bde18a0609f87a832989f3
3
- size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0812b579499302700f9a59b19cca469b7b235b15091dd742378e33e33d6e3734
3
+ size 329409666
checkpoints/step-000001887436800.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5bbd8a2e9bf71df863966b176483676d325b894dc43f9defbebba152af38ea84
3
- size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78266535cd097dcd343cfc17a80c83741e912e4e4afcb575e500e817fdcfe2f6
3
+ size 329409666
decay_params.txt CHANGED
@@ -1,14 +1,14 @@
1
- _forward_module._fsdp_wrapped_module.model.embeddings.weight
2
- _forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight
3
- _forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight
4
- _forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight
5
- _forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight
6
- _forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight
7
- _forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight
8
- _forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight
9
- _forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight
10
- _forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight
11
- _forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight
12
- _forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight
13
- _forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight
14
  _forward_module._fsdp_wrapped_module.lm_head.weight
 
1
+ _forward_module._fsdp_wrapped_module.emb.weight
2
+ _forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight
3
+ _forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight
4
+ _forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight
5
+ _forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight
6
+ _forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight
7
+ _forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight
8
+ _forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight
9
+ _forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight
10
+ _forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight
11
+ _forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight
12
+ _forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight
13
+ _forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight
14
  _forward_module._fsdp_wrapped_module.lm_head.weight
logs/2025-10-28_01-48-25.log ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2025-10-28 01:48:25][train:372][INFO] All outputs will be saved to `/workspace/forgetting-transformer/alibi_2_4_256`
2
+ [2025-10-28 01:48:25][train:375][INFO] Configuration:
3
+ [2025-10-28 01:48:25][train:380][INFO] Configuration saved to /workspace/forgetting-transformer/alibi_2_4_256/config.yaml.
4
+ [2025-10-28 01:48:25][train:387][INFO] creating datamodule
5
+ [2025-10-28 01:48:25][train:419][INFO] creating model
6
+ [2025-10-28 01:48:25][train:440][INFO] creating optimizer
7
+ [2025-10-28 01:48:25][checkpoint:39][INFO] Not resuming. Deleting existing checkpoints...
8
+ [2025-10-28 01:48:25][logger:256][INFO] Setting up wandb logger...
9
+ [2025-10-28 01:48:25][logger:272][INFO] Not resuming. Creating a new wandb run.
10
+ [2025-10-28 01:48:26][logger:288][INFO] wandb initialized. Run id: gwmp4t3h
11
+ [2025-10-28 01:48:26][logger:186][INFO] Setting up jsonlines logger...
12
+ [2025-10-28 01:48:26][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/resume.jsonl since we are not resuming
13
+ [2025-10-28 01:48:26][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/train_data_info.jsonl since we are not resuming
14
+ [2025-10-28 01:48:26][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/val_data_info.jsonl since we are not resuming
15
+ [2025-10-28 01:48:26][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/model_info.jsonl since we are not resuming
16
+ [2025-10-28 01:48:26][logger:113][INFO] Setting up npz logger...
17
+ [2025-10-28 01:48:26][logger:171][INFO] [step: 0] [train_data_info/vocab_size: 50277] [train_data_info/global_tokens_per_batch: 2097152] [train_data_info/local_tokens_per_batch: 2097152] [train_data_info/batch_len: 2048] [train_data_info/seq_len: 2048] [train_data_info/total_tokens: 2055208960] [train_data_info/global_batch_size: 1024] [train_data_info/local_batch_size: 1024]
18
+ [2025-10-28 01:48:26][logger:171][INFO] [step: 0] [val_data_info/vocab_size: 50277] [val_data_info/global_tokens_per_batch: 2048] [val_data_info/local_tokens_per_batch: 2048] [val_data_info/batch_len: 2048] [val_data_info/seq_len: 2048] [val_data_info/total_tokens: 2147483648] [val_data_info/global_batch_size: 1] [val_data_info/local_batch_size: 1]
19
+ [2025-10-28 01:48:26][logger:171][INFO] [step: 0] [model_info/total_params: 27447040] [model_info/trainable_params: 27447040] [model_info/embedding_params: 12870912] [model_info/flops_per_token: 0] [model_info/non_embedding_params: 14576128]
20
+ [2025-10-28 01:49:37][utils:57][INFO] [P: 1.00%] [S: 20971520/2097152000] [T: 0:01:09] [ETA: 1:55:28] [loss: 9.774] [tokens/s: 309942.692] [batches/s: 0.148] [MFU: 0.000] [TFLOPS: 0.000]
21
+ [2025-10-28 01:50:44][utils:57][INFO] [P: 2.00%] [S: 41943040/2097152000] [T: 0:02:17] [ETA: 1:52:23] [loss: 8.196] [tokens/s: 309977.378] [batches/s: 0.148] [MFU: 0.000] [TFLOPS: 0.000]
22
+ [2025-10-28 01:50:44][train:194][INFO] Running validation...
23
+ [2025-10-28 01:52:25][logger:171][INFO] [step: 41943040] [val/train_token_count: 41943040] [val/train_batch_count: 20] [val/train_flop_count: 0] [val/train_total_time: 137.633] [val/train_update_time: 137.344] [val/loss: 8.098] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.750] [val/val_tokens_per_second: 406549.998] [val/loss_avg_len_2048: 8.098] [val/perplexity_len_2048: 3287.787] [val/loss_avg_len_1024: 8.101] [val/perplexity_len_1024: 3296.261] [val/loss_avg_len_512: 8.101] [val/perplexity_len_512: 3297.223]
24
+ [2025-10-28 01:53:33][utils:57][INFO] [P: 3.00%] [S: 62914560/2097152000] [T: 0:05:06] [ETA: 2:44:55] [loss: 7.712] [tokens/s: 204800.058] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000]
25
+ [2025-10-28 01:54:40][utils:57][INFO] [P: 4.00%] [S: 83886080/2097152000] [T: 0:06:13] [ETA: 2:29:28] [loss: 7.524] [tokens/s: 224321.330] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
26
+ [2025-10-28 01:54:40][train:194][INFO] Running validation...
27
+ [2025-10-28 01:56:21][logger:171][INFO] [step: 83886080] [val/train_token_count: 83886080] [val/train_batch_count: 40] [val/train_flop_count: 0] [val/train_total_time: 373.694] [val/train_update_time: 272.436] [val/loss: 7.510] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.374] [val/val_tokens_per_second: 408074.046] [val/loss_avg_len_2048: 7.510] [val/perplexity_len_2048: 1825.331] [val/loss_avg_len_1024: 7.513] [val/perplexity_len_1024: 1831.575] [val/loss_avg_len_512: 7.515] [val/perplexity_len_512: 1834.639]
28
+ [2025-10-28 01:57:28][utils:57][INFO] [P: 5.00%] [S: 104857600/2097152000] [T: 0:09:01] [ETA: 2:51:32] [loss: 7.361] [tokens/s: 192927.320] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
29
+ [2025-10-28 01:57:28][logger:171][INFO] [step: 104857600] [train_eval/train_token_count: 104857600] [train_eval/train_batch_count: 50] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 541.726] [train_eval/train_update_time: 339.966] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 8.431] [train_eval/perplexity_len_2048: 4588.366] [train_eval/loss_avg_len_1024: 8.436] [train_eval/perplexity_len_1024: 4609.090] [train_eval/loss_avg_len_512: 8.436] [train_eval/perplexity_len_512: 4609.426]
30
+ [2025-10-28 01:58:36][utils:57][INFO] [P: 6.00%] [S: 125829120/2097152000] [T: 0:10:09] [ETA: 2:39:06] [loss: 7.195] [tokens/s: 206123.202] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000]
31
+ [2025-10-28 01:58:36][train:194][INFO] Running validation...
32
+ [2025-10-28 02:00:16][logger:171][INFO] [step: 125829120] [val/train_token_count: 125829120] [val/train_batch_count: 60] [val/train_flop_count: 0] [val/train_total_time: 609.370] [val/train_update_time: 407.492] [val/loss: 7.194] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.850] [val/val_tokens_per_second: 410215.493] [val/loss_avg_len_2048: 7.194] [val/perplexity_len_2048: 1331.632] [val/loss_avg_len_1024: 7.199] [val/perplexity_len_1024: 1338.256] [val/loss_avg_len_512: 7.204] [val/perplexity_len_512: 1345.080]
33
+ [2025-10-28 02:01:23][utils:57][INFO] [P: 7.00%] [S: 146800640/2097152000] [T: 0:12:56] [ETA: 2:52:00] [loss: 7.097] [tokens/s: 188480.049] [batches/s: 0.090] [MFU: 0.000] [TFLOPS: 0.000]
34
+ [2025-10-28 02:02:31][utils:57][INFO] [P: 8.00%] [S: 167772160/2097152000] [T: 0:14:04] [ETA: 2:41:51] [loss: 6.985] [tokens/s: 198329.429] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
35
+ [2025-10-28 02:02:31][train:194][INFO] Running validation...
36
+ [2025-10-28 02:04:12][logger:171][INFO] [step: 167772160] [val/train_token_count: 167772160] [val/train_batch_count: 80] [val/train_flop_count: 0] [val/train_total_time: 844.441] [val/train_update_time: 542.489] [val/loss: 6.967] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.669] [val/val_tokens_per_second: 406876.866] [val/loss_avg_len_2048: 6.967] [val/perplexity_len_2048: 1060.792] [val/loss_avg_len_1024: 6.973] [val/perplexity_len_1024: 1067.275] [val/loss_avg_len_512: 6.980] [val/perplexity_len_512: 1074.812]
37
+ [2025-10-28 02:05:19][utils:57][INFO] [P: 9.00%] [S: 188743680/2097152000] [T: 0:16:52] [ETA: 2:50:39] [loss: 6.864] [tokens/s: 185967.467] [batches/s: 0.089] [MFU: 0.000] [TFLOPS: 0.000]
38
+ [2025-10-28 02:06:27][utils:57][INFO] [P: 10.00%] [S: 209715200/2097152000] [T: 0:18:00] [ETA: 2:42:03] [loss: 6.781] [tokens/s: 193805.258] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
39
+ [2025-10-28 02:06:27][logger:171][INFO] [step: 209715200] [train_eval/train_token_count: 209715200] [train_eval/train_batch_count: 100] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1080.360] [train_eval/train_update_time: 677.513] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 7.043] [train_eval/perplexity_len_2048: 1145.164] [train_eval/loss_avg_len_1024: 7.050] [train_eval/perplexity_len_1024: 1152.943] [train_eval/loss_avg_len_512: 7.055] [train_eval/perplexity_len_512: 1158.738]
40
+ [2025-10-28 02:06:27][train:194][INFO] Running validation...
41
+ [2025-10-28 02:08:07][logger:171][INFO] [step: 209715200] [val/train_token_count: 209715200] [val/train_batch_count: 100] [val/train_flop_count: 0] [val/train_total_time: 1080.360] [val/train_update_time: 677.513] [val/loss: 6.774] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.503] [val/val_tokens_per_second: 407549.903] [val/loss_avg_len_2048: 6.774] [val/perplexity_len_2048: 874.407] [val/loss_avg_len_1024: 6.781] [val/perplexity_len_1024: 880.950] [val/loss_avg_len_512: 6.790] [val/perplexity_len_512: 888.990]
42
+ [2025-10-28 02:08:07][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000209715200.pt...
43
+ [2025-10-28 02:08:08][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000209715200.pt.
44
+ [2025-10-28 02:08:08][logger:171][INFO] [step: 209715200] [checkpoint/checkpoint_time: 0.455]
45
+ [2025-10-28 02:09:16][utils:57][INFO] [P: 11.00%] [S: 230686720/2097152000] [T: 0:20:48] [ETA: 2:48:25] [loss: 6.698] [tokens/s: 177119.434] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
46
+ [2025-10-28 02:10:23][utils:57][INFO] [P: 12.00%] [S: 251658240/2097152000] [T: 0:21:56] [ETA: 2:40:54] [loss: 6.593] [tokens/s: 193779.584] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
47
+ [2025-10-28 02:10:23][train:194][INFO] Running validation...
48
+ [2025-10-28 02:12:04][logger:171][INFO] [step: 251658240] [val/train_token_count: 251658240] [val/train_batch_count: 120] [val/train_flop_count: 0] [val/train_total_time: 1316.574] [val/train_update_time: 812.517] [val/loss: 6.612] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.810] [val/val_tokens_per_second: 406308.306] [val/loss_avg_len_2048: 6.612] [val/perplexity_len_2048: 744.079] [val/loss_avg_len_1024: 6.621] [val/perplexity_len_1024: 750.549] [val/loss_avg_len_512: 6.632] [val/perplexity_len_512: 758.811]
49
+ [2025-10-28 02:13:12][utils:57][INFO] [P: 13.00%] [S: 272629760/2097152000] [T: 0:24:45] [ETA: 2:45:38] [loss: 6.562] [tokens/s: 177120.861] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
50
+ [2025-10-28 02:14:19][utils:57][INFO] [P: 14.00%] [S: 293601280/2097152000] [T: 0:25:52] [ETA: 2:38:57] [loss: 6.505] [tokens/s: 193716.354] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
51
+ [2025-10-28 02:14:19][train:194][INFO] Running validation...
52
+ [2025-10-28 02:16:00][logger:171][INFO] [step: 293601280] [val/train_token_count: 293601280] [val/train_batch_count: 140] [val/train_flop_count: 0] [val/train_total_time: 1552.606] [val/train_update_time: 947.496] [val/loss: 6.478] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.429] [val/val_tokens_per_second: 407851.660] [val/loss_avg_len_2048: 6.478] [val/perplexity_len_2048: 650.688] [val/loss_avg_len_1024: 6.488] [val/perplexity_len_1024: 657.094] [val/loss_avg_len_512: 6.500] [val/perplexity_len_512: 665.465]
53
+ [2025-10-28 02:17:07][utils:57][INFO] [P: 15.00%] [S: 314572800/2097152000] [T: 0:28:40] [ETA: 2:42:30] [loss: 6.406] [tokens/s: 177126.378] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
54
+ [2025-10-28 02:17:07][logger:171][INFO] [step: 314572800] [train_eval/train_token_count: 314572800] [train_eval/train_batch_count: 150] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1720.648] [train_eval/train_update_time: 1014.977] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.585] [train_eval/perplexity_len_2048: 723.963] [train_eval/loss_avg_len_1024: 6.593] [train_eval/perplexity_len_1024: 729.975] [train_eval/loss_avg_len_512: 6.604] [train_eval/perplexity_len_512: 737.766]
55
+ [2025-10-28 02:18:15][utils:57][INFO] [P: 16.00%] [S: 335544320/2097152000] [T: 0:29:48] [ETA: 2:36:28] [loss: 6.375] [tokens/s: 193628.897] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
56
+ [2025-10-28 02:18:15][train:194][INFO] Running validation...
57
+ [2025-10-28 02:19:55][logger:171][INFO] [step: 335544320] [val/train_token_count: 335544320] [val/train_batch_count: 160] [val/train_flop_count: 0] [val/train_total_time: 1788.245] [val/train_update_time: 1082.469] [val/loss: 6.371] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.337] [val/val_tokens_per_second: 408224.188] [val/loss_avg_len_2048: 6.371] [val/perplexity_len_2048: 584.802] [val/loss_avg_len_1024: 6.382] [val/perplexity_len_1024: 591.124] [val/loss_avg_len_512: 6.396] [val/perplexity_len_512: 599.393]
58
+ [2025-10-28 02:21:03][utils:57][INFO] [P: 17.00%] [S: 356515840/2097152000] [T: 0:32:36] [ETA: 2:39:10] [loss: 6.300] [tokens/s: 177058.054] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
59
+ [2025-10-28 02:22:10][utils:57][INFO] [P: 18.00%] [S: 377487360/2097152000] [T: 0:33:43] [ETA: 2:33:39] [loss: 6.269] [tokens/s: 193687.228] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
60
+ [2025-10-28 02:22:10][train:194][INFO] Running validation...
61
+ [2025-10-28 02:23:51][logger:171][INFO] [step: 377487360] [val/train_token_count: 377487360] [val/train_batch_count: 180] [val/train_flop_count: 0] [val/train_total_time: 2023.812] [val/train_update_time: 1217.452] [val/loss: 6.274] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.357] [val/val_tokens_per_second: 408143.018] [val/loss_avg_len_2048: 6.274] [val/perplexity_len_2048: 530.567] [val/loss_avg_len_1024: 6.286] [val/perplexity_len_1024: 536.794] [val/loss_avg_len_512: 6.301] [val/perplexity_len_512: 545.219]
62
+ [2025-10-28 02:24:58][utils:57][INFO] [P: 19.00%] [S: 398458880/2097152000] [T: 0:36:31] [ETA: 2:35:43] [loss: 6.286] [tokens/s: 177107.743] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
63
+ [2025-10-28 02:26:06][utils:57][INFO] [P: 20.00%] [S: 419430400/2097152000] [T: 0:37:39] [ETA: 2:30:37] [loss: 6.203] [tokens/s: 193804.458] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
64
+ [2025-10-28 02:26:06][logger:171][INFO] [step: 419430400] [train_eval/train_token_count: 419430400] [train_eval/train_batch_count: 200] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2259.375] [train_eval/train_update_time: 1352.433] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.304] [train_eval/perplexity_len_2048: 546.957] [train_eval/loss_avg_len_1024: 6.316] [train_eval/perplexity_len_1024: 553.212] [train_eval/loss_avg_len_512: 6.330] [train_eval/perplexity_len_512: 560.967]
65
+ [2025-10-28 02:26:06][train:194][INFO] Running validation...
66
+ [2025-10-28 02:27:46][logger:171][INFO] [step: 419430400] [val/train_token_count: 419430400] [val/train_batch_count: 200] [val/train_flop_count: 0] [val/train_total_time: 2259.375] [val/train_update_time: 1352.433] [val/loss: 6.200] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.386] [val/val_tokens_per_second: 408026.058] [val/loss_avg_len_2048: 6.200] [val/perplexity_len_2048: 492.537] [val/loss_avg_len_1024: 6.212] [val/perplexity_len_1024: 498.572] [val/loss_avg_len_512: 6.228] [val/perplexity_len_512: 506.921]
67
+ [2025-10-28 02:27:46][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000419430400.pt...
68
+ [2025-10-28 02:27:47][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000419430400.pt.
69
+ [2025-10-28 02:27:47][logger:171][INFO] [step: 419430400] [checkpoint/checkpoint_time: 0.442]
70
+ [2025-10-28 02:28:54][utils:57][INFO] [P: 21.00%] [S: 440401920/2097152000] [T: 0:40:27] [ETA: 2:32:13] [loss: 6.156] [tokens/s: 177134.593] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
71
+ [2025-10-28 02:30:02][utils:57][INFO] [P: 22.00%] [S: 461373440/2097152000] [T: 0:41:35] [ETA: 2:27:27] [loss: 6.146] [tokens/s: 193804.240] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
72
+ [2025-10-28 02:30:02][train:194][INFO] Running validation...
73
+ [2025-10-28 02:31:43][logger:171][INFO] [step: 461373440] [val/train_token_count: 461373440] [val/train_batch_count: 220] [val/train_flop_count: 0] [val/train_total_time: 2495.437] [val/train_update_time: 1487.442] [val/loss: 6.132] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.703] [val/val_tokens_per_second: 406740.433] [val/loss_avg_len_2048: 6.132] [val/perplexity_len_2048: 460.365] [val/loss_avg_len_1024: 6.145] [val/perplexity_len_1024: 466.377] [val/loss_avg_len_512: 6.163] [val/perplexity_len_512: 474.794]
74
+ [2025-10-28 02:32:50][utils:57][INFO] [P: 23.00%] [S: 482344960/2097152000] [T: 0:44:23] [ETA: 2:28:37] [loss: 6.099] [tokens/s: 177146.443] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
75
+ [2025-10-28 02:33:58][utils:57][INFO] [P: 24.00%] [S: 503316480/2097152000] [T: 0:45:31] [ETA: 2:24:09] [loss: 6.058] [tokens/s: 193740.553] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
76
+ [2025-10-28 02:33:58][train:194][INFO] Running validation...
77
+ [2025-10-28 02:35:39][logger:171][INFO] [step: 503316480] [val/train_token_count: 503316480] [val/train_batch_count: 240] [val/train_flop_count: 0] [val/train_total_time: 2731.434] [val/train_update_time: 1622.491] [val/loss: 6.077] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.997] [val/val_tokens_per_second: 405558.321] [val/loss_avg_len_2048: 6.077] [val/perplexity_len_2048: 435.701] [val/loss_avg_len_1024: 6.091] [val/perplexity_len_1024: 441.655] [val/loss_avg_len_512: 6.109] [val/perplexity_len_512: 450.082]
78
+ [2025-10-28 02:36:47][utils:57][INFO] [P: 25.00%] [S: 524288000/2097152000] [T: 0:48:20] [ETA: 2:25:00] [loss: 6.051] [tokens/s: 177048.600] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
79
+ [2025-10-28 02:36:47][logger:171][INFO] [step: 524288000] [train_eval/train_token_count: 524288000] [train_eval/train_batch_count: 250] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2900.077] [train_eval/train_update_time: 1690.001] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.119] [train_eval/perplexity_len_2048: 454.201] [train_eval/loss_avg_len_1024: 6.129] [train_eval/perplexity_len_1024: 458.908] [train_eval/loss_avg_len_512: 6.145] [train_eval/perplexity_len_512: 466.193]
80
+ [2025-10-28 02:37:54][utils:57][INFO] [P: 26.00%] [S: 545259520/2097152000] [T: 0:49:27] [ETA: 2:20:46] [loss: 6.019] [tokens/s: 193606.916] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
81
+ [2025-10-28 02:37:54][train:194][INFO] Running validation...
82
+ [2025-10-28 02:39:35][logger:171][INFO] [step: 545259520] [val/train_token_count: 545259520] [val/train_batch_count: 260] [val/train_flop_count: 0] [val/train_total_time: 2967.730] [val/train_update_time: 1757.532] [val/loss: 6.025] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.815] [val/val_tokens_per_second: 406288.925] [val/loss_avg_len_2048: 6.025] [val/perplexity_len_2048: 413.628] [val/loss_avg_len_1024: 6.039] [val/perplexity_len_1024: 419.547] [val/loss_avg_len_512: 6.059] [val/perplexity_len_512: 427.893]
83
+ [2025-10-28 02:40:43][utils:57][INFO] [P: 27.00%] [S: 566231040/2097152000] [T: 0:52:16] [ETA: 2:21:19] [loss: 6.010] [tokens/s: 176963.151] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
84
+ [2025-10-28 02:41:50][utils:57][INFO] [P: 28.00%] [S: 587202560/2097152000] [T: 0:53:23] [ETA: 2:17:18] [loss: 5.960] [tokens/s: 193511.888] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
85
+ [2025-10-28 02:41:50][train:194][INFO] Running validation...
86
+ [2025-10-28 02:43:31][logger:171][INFO] [step: 587202560] [val/train_token_count: 587202560] [val/train_batch_count: 280] [val/train_flop_count: 0] [val/train_total_time: 3203.841] [val/train_update_time: 1892.586] [val/loss: 5.974] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.200] [val/val_tokens_per_second: 408780.967] [val/loss_avg_len_2048: 5.974] [val/perplexity_len_2048: 392.953] [val/loss_avg_len_1024: 5.988] [val/perplexity_len_1024: 398.795] [val/loss_avg_len_512: 6.009] [val/perplexity_len_512: 407.195]
87
+ [2025-10-28 02:44:38][utils:57][INFO] [P: 29.00%] [S: 608174080/2097152000] [T: 0:56:11] [ETA: 2:17:34] [loss: 5.931] [tokens/s: 176978.071] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
88
+ [2025-10-28 02:45:46][utils:57][INFO] [P: 30.00%] [S: 629145600/2097152000] [T: 0:57:19] [ETA: 2:13:45] [loss: 5.915] [tokens/s: 193613.347] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
89
+ [2025-10-28 02:45:46][logger:171][INFO] [step: 629145600] [train_eval/train_token_count: 629145600] [train_eval/train_batch_count: 300] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3439.312] [train_eval/train_update_time: 2027.637] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.993] [train_eval/perplexity_len_2048: 400.721] [train_eval/loss_avg_len_1024: 6.003] [train_eval/perplexity_len_1024: 404.814] [train_eval/loss_avg_len_512: 6.021] [train_eval/perplexity_len_512: 411.889]
90
+ [2025-10-28 02:45:46][train:194][INFO] Running validation...
91
+ [2025-10-28 02:47:27][logger:171][INFO] [step: 629145600] [val/train_token_count: 629145600] [val/train_batch_count: 300] [val/train_flop_count: 0] [val/train_total_time: 3439.312] [val/train_update_time: 2027.637] [val/loss: 5.937] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.627] [val/val_tokens_per_second: 407047.134] [val/loss_avg_len_2048: 5.937] [val/perplexity_len_2048: 378.618] [val/loss_avg_len_1024: 5.952] [val/perplexity_len_1024: 384.436] [val/loss_avg_len_512: 5.974] [val/perplexity_len_512: 392.897]
92
+ [2025-10-28 02:47:27][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000629145600.pt...
93
+ [2025-10-28 02:47:27][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000629145600.pt.
94
+ [2025-10-28 02:47:27][logger:171][INFO] [step: 629145600] [checkpoint/checkpoint_time: 0.424]
95
+ [2025-10-28 02:48:35][utils:57][INFO] [P: 31.00%] [S: 650117120/2097152000] [T: 1:00:07] [ETA: 2:13:50] [loss: 5.956] [tokens/s: 176935.217] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
96
+ [2025-10-28 02:49:42][utils:57][INFO] [P: 32.00%] [S: 671088640/2097152000] [T: 1:01:15] [ETA: 2:10:10] [loss: 5.876] [tokens/s: 193535.378] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
97
+ [2025-10-28 02:49:42][train:194][INFO] Running validation...
98
+ [2025-10-28 02:51:23][logger:171][INFO] [step: 671088640] [val/train_token_count: 671088640] [val/train_batch_count: 320] [val/train_flop_count: 0] [val/train_total_time: 3675.684] [val/train_update_time: 2162.691] [val/loss: 5.899] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.480] [val/val_tokens_per_second: 407644.252] [val/loss_avg_len_2048: 5.899] [val/perplexity_len_2048: 364.765] [val/loss_avg_len_1024: 5.915] [val/perplexity_len_1024: 370.535] [val/loss_avg_len_512: 5.938] [val/perplexity_len_512: 379.101]
99
+ [2025-10-28 02:52:30][utils:57][INFO] [P: 33.00%] [S: 692060160/2097152000] [T: 1:04:03] [ETA: 2:10:04] [loss: 5.916] [tokens/s: 176959.550] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
100
+ [2025-10-28 02:53:38][utils:57][INFO] [P: 34.00%] [S: 713031680/2097152000] [T: 1:05:11] [ETA: 2:06:32] [loss: 5.878] [tokens/s: 193631.925] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
101
+ [2025-10-28 02:53:38][train:194][INFO] Running validation...
102
+ [2025-10-28 02:55:18][logger:171][INFO] [step: 713031680] [val/train_token_count: 713031680] [val/train_batch_count: 340] [val/train_flop_count: 0] [val/train_total_time: 3911.438] [val/train_update_time: 2297.710] [val/loss: 5.866] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.256] [val/val_tokens_per_second: 408554.595] [val/loss_avg_len_2048: 5.866] [val/perplexity_len_2048: 352.944] [val/loss_avg_len_1024: 5.882] [val/perplexity_len_1024: 358.687] [val/loss_avg_len_512: 5.906] [val/perplexity_len_512: 367.231]
103
+ [2025-10-28 02:56:26][utils:57][INFO] [P: 35.00%] [S: 734003200/2097152000] [T: 1:07:59] [ETA: 2:06:15] [loss: 5.850] [tokens/s: 177072.118] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
104
+ [2025-10-28 02:56:26][logger:171][INFO] [step: 734003200] [train_eval/train_token_count: 734003200] [train_eval/train_batch_count: 350] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4079.355] [train_eval/train_update_time: 2365.248] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.894] [train_eval/perplexity_len_2048: 362.793] [train_eval/loss_avg_len_1024: 5.908] [train_eval/perplexity_len_1024: 367.821] [train_eval/loss_avg_len_512: 5.928] [train_eval/perplexity_len_512: 375.441]
105
+ [2025-10-28 02:57:34][utils:57][INFO] [P: 36.00%] [S: 754974720/2097152000] [T: 1:09:07] [ETA: 2:02:52] [loss: 5.787] [tokens/s: 193731.211] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
106
+ [2025-10-28 02:57:34][train:194][INFO] Running validation...
107
+ [2025-10-28 02:59:14][logger:171][INFO] [step: 754974720] [val/train_token_count: 754974720] [val/train_batch_count: 360] [val/train_flop_count: 0] [val/train_total_time: 4147.001] [val/train_update_time: 2432.774] [val/loss: 5.838] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.454] [val/val_tokens_per_second: 407750.244] [val/loss_avg_len_2048: 5.838] [val/perplexity_len_2048: 343.229] [val/loss_avg_len_1024: 5.855] [val/perplexity_len_1024: 348.990] [val/loss_avg_len_512: 5.879] [val/perplexity_len_512: 357.566]
108
+ [2025-10-28 03:00:22][utils:57][INFO] [P: 37.00%] [S: 775946240/2097152000] [T: 1:11:55] [ETA: 2:02:27] [loss: 5.819] [tokens/s: 177128.887] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
109
+ [2025-10-28 03:01:29][utils:57][INFO] [P: 38.00%] [S: 796917760/2097152000] [T: 1:13:02] [ETA: 1:59:10] [loss: 5.792] [tokens/s: 193690.683] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
110
+ [2025-10-28 03:01:29][train:194][INFO] Running validation...
111
+ [2025-10-28 03:03:10][logger:171][INFO] [step: 796917760] [val/train_token_count: 796917760] [val/train_batch_count: 380] [val/train_flop_count: 0] [val/train_total_time: 4382.723] [val/train_update_time: 2567.802] [val/loss: 5.807] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.155] [val/val_tokens_per_second: 408964.937] [val/loss_avg_len_2048: 5.807] [val/perplexity_len_2048: 332.631] [val/loss_avg_len_1024: 5.824] [val/perplexity_len_1024: 338.388] [val/loss_avg_len_512: 5.850] [val/perplexity_len_512: 347.116]
112
+ [2025-10-28 03:04:17][utils:57][INFO] [P: 39.00%] [S: 817889280/2097152000] [T: 1:15:50] [ETA: 1:58:37] [loss: 5.811] [tokens/s: 177137.275] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
113
+ [2025-10-28 03:05:25][utils:57][INFO] [P: 40.00%] [S: 838860800/2097152000] [T: 1:16:58] [ETA: 1:55:27] [loss: 5.713] [tokens/s: 193852.270] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
114
+ [2025-10-28 03:05:25][logger:171][INFO] [step: 838860800] [train_eval/train_token_count: 838860800] [train_eval/train_batch_count: 400] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4618.154] [train_eval/train_update_time: 2702.834] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.817] [train_eval/perplexity_len_2048: 336.075] [train_eval/loss_avg_len_1024: 5.830] [train_eval/perplexity_len_1024: 340.317] [train_eval/loss_avg_len_512: 5.851] [train_eval/perplexity_len_512: 347.695]
115
+ [2025-10-28 03:05:25][train:194][INFO] Running validation...
116
+ [2025-10-28 03:07:05][logger:171][INFO] [step: 838860800] [val/train_token_count: 838860800] [val/train_batch_count: 400] [val/train_flop_count: 0] [val/train_total_time: 4618.154] [val/train_update_time: 2702.834] [val/loss: 5.782] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.075] [val/val_tokens_per_second: 409291.934] [val/loss_avg_len_2048: 5.782] [val/perplexity_len_2048: 324.540] [val/loss_avg_len_1024: 5.800] [val/perplexity_len_1024: 330.279] [val/loss_avg_len_512: 5.826] [val/perplexity_len_512: 339.046]
117
+ [2025-10-28 03:07:05][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000838860800.pt...
118
+ [2025-10-28 03:07:05][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000838860800.pt.
119
+ [2025-10-28 03:07:05][logger:171][INFO] [step: 838860800] [checkpoint/checkpoint_time: 0.425]
120
+ [2025-10-28 03:08:13][utils:57][INFO] [P: 41.00%] [S: 859832320/2097152000] [T: 1:19:46] [ETA: 1:54:47] [loss: 5.736] [tokens/s: 177219.607] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
121
+ [2025-10-28 03:09:21][utils:57][INFO] [P: 42.00%] [S: 880803840/2097152000] [T: 1:20:53] [ETA: 1:51:43] [loss: 5.749] [tokens/s: 193860.014] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
122
+ [2025-10-28 03:09:21][train:194][INFO] Running validation...
123
+ [2025-10-28 03:11:00][logger:171][INFO] [step: 880803840] [val/train_token_count: 880803840] [val/train_batch_count: 420] [val/train_flop_count: 0] [val/train_total_time: 4853.921] [val/train_update_time: 2837.866] [val/loss: 5.759] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.822] [val/val_tokens_per_second: 410332.277] [val/loss_avg_len_2048: 5.759] [val/perplexity_len_2048: 316.982] [val/loss_avg_len_1024: 5.777] [val/perplexity_len_1024: 322.744] [val/loss_avg_len_512: 5.804] [val/perplexity_len_512: 331.528]
124
+ [2025-10-28 03:12:08][utils:57][INFO] [P: 43.00%] [S: 901775360/2097152000] [T: 1:23:41] [ETA: 1:50:56] [loss: 5.747] [tokens/s: 177330.961] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
125
+ [2025-10-28 03:13:16][utils:57][INFO] [P: 44.00%] [S: 922746880/2097152000] [T: 1:24:49] [ETA: 1:47:56] [loss: 5.780] [tokens/s: 193937.170] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
126
+ [2025-10-28 03:13:16][train:194][INFO] Running validation...
127
+ [2025-10-28 03:14:56][logger:171][INFO] [step: 922746880] [val/train_token_count: 922746880] [val/train_batch_count: 440] [val/train_flop_count: 0] [val/train_total_time: 5089.012] [val/train_update_time: 2972.891] [val/loss: 5.737] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.327] [val/val_tokens_per_second: 408266.010] [val/loss_avg_len_2048: 5.737] [val/perplexity_len_2048: 310.286] [val/loss_avg_len_1024: 5.756] [val/perplexity_len_1024: 316.032] [val/loss_avg_len_512: 5.784] [val/perplexity_len_512: 324.904]
128
+ [2025-10-28 03:16:04][utils:57][INFO] [P: 45.00%] [S: 943718400/2097152000] [T: 1:27:36] [ETA: 1:47:05] [loss: 5.723] [tokens/s: 177321.705] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
129
+ [2025-10-28 03:16:04][logger:171][INFO] [step: 943718400] [train_eval/train_token_count: 943718400] [train_eval/train_batch_count: 450] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5256.981] [train_eval/train_update_time: 3040.416] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.752] [train_eval/perplexity_len_2048: 314.970] [train_eval/loss_avg_len_1024: 5.769] [train_eval/perplexity_len_1024: 320.169] [train_eval/loss_avg_len_512: 5.795] [train_eval/perplexity_len_512: 328.749]
130
+ [2025-10-28 03:17:11][utils:57][INFO] [P: 46.00%] [S: 964689920/2097152000] [T: 1:28:44] [ETA: 1:44:10] [loss: 5.698] [tokens/s: 193963.049] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
131
+ [2025-10-28 03:17:11][train:194][INFO] Running validation...
132
+ [2025-10-28 03:18:51][logger:171][INFO] [step: 964689920] [val/train_token_count: 964689920] [val/train_batch_count: 460] [val/train_flop_count: 0] [val/train_total_time: 5324.629] [val/train_update_time: 3107.943] [val/loss: 5.716] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.807] [val/val_tokens_per_second: 410393.901] [val/loss_avg_len_2048: 5.716] [val/perplexity_len_2048: 303.782] [val/loss_avg_len_1024: 5.735] [val/perplexity_len_1024: 309.495] [val/loss_avg_len_512: 5.763] [val/perplexity_len_512: 318.364]
133
+ [2025-10-28 03:19:59][utils:57][INFO] [P: 47.00%] [S: 985661440/2097152000] [T: 1:31:32] [ETA: 1:43:13] [loss: 5.713] [tokens/s: 177415.998] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
134
+ [2025-10-28 03:21:06][utils:57][INFO] [P: 48.00%] [S: 1006632960/2097152000] [T: 1:32:39] [ETA: 1:40:23] [loss: 5.699] [tokens/s: 194019.719] [batches/s: 0.093] [MFU: 0.000] [TFLOPS: 0.000]
135
+ [2025-10-28 03:21:06][train:194][INFO] Running validation...
136
+ [2025-10-28 03:22:47][logger:171][INFO] [step: 1006632960] [val/train_token_count: 1006632960] [val/train_batch_count: 480] [val/train_flop_count: 0] [val/train_total_time: 5559.744] [val/train_update_time: 3242.999] [val/loss: 5.701] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.360] [val/val_tokens_per_second: 408130.422] [val/loss_avg_len_2048: 5.701] [val/perplexity_len_2048: 299.093] [val/loss_avg_len_1024: 5.720] [val/perplexity_len_1024: 304.853] [val/loss_avg_len_512: 5.749] [val/perplexity_len_512: 313.834]
137
+ [2025-10-28 03:23:54][utils:57][INFO] [P: 49.00%] [S: 1027604480/2097152000] [T: 1:35:27] [ETA: 1:39:21] [loss: 5.691] [tokens/s: 177382.867] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
138
+ [2025-10-28 03:25:02][utils:57][INFO] [P: 50.00%] [S: 1048576000/2097152000] [T: 1:36:35] [ETA: 1:36:35] [loss: 5.680] [tokens/s: 194044.503] [batches/s: 0.093] [MFU: 0.000] [TFLOPS: 0.000]
139
+ [2025-10-28 03:25:02][logger:171][INFO] [step: 1048576000] [train_eval/train_token_count: 1048576000] [train_eval/train_batch_count: 500] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5795.387] [train_eval/train_update_time: 3378.042] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.703] [train_eval/perplexity_len_2048: 299.686] [train_eval/loss_avg_len_1024: 5.718] [train_eval/perplexity_len_1024: 304.318] [train_eval/loss_avg_len_512: 5.746] [train_eval/perplexity_len_512: 312.890]
140
+ [2025-10-28 03:25:02][train:194][INFO] Running validation...
141
+ [2025-10-28 03:26:42][logger:171][INFO] [step: 1048576000] [val/train_token_count: 1048576000] [val/train_batch_count: 500] [val/train_flop_count: 0] [val/train_total_time: 5795.387] [val/train_update_time: 3378.042] [val/loss: 5.679] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.290] [val/val_tokens_per_second: 408416.518] [val/loss_avg_len_2048: 5.679] [val/perplexity_len_2048: 292.634] [val/loss_avg_len_1024: 5.698] [val/perplexity_len_1024: 298.414] [val/loss_avg_len_512: 5.728] [val/perplexity_len_512: 307.462]
142
+ [2025-10-28 03:26:42][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001048576000.pt...
143
+ [2025-10-28 03:26:43][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001048576000.pt.
144
+ [2025-10-28 03:26:43][logger:171][INFO] [step: 1048576000] [checkpoint/checkpoint_time: 0.435]
145
+ [2025-10-28 03:27:50][utils:57][INFO] [P: 51.00%] [S: 1069547520/2097152000] [T: 1:39:23] [ETA: 1:35:29] [loss: 5.678] [tokens/s: 177347.327] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
146
+ [2025-10-28 03:28:58][utils:57][INFO] [P: 52.00%] [S: 1090519040/2097152000] [T: 1:40:31] [ETA: 1:32:47] [loss: 5.672] [tokens/s: 193874.283] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
147
+ [2025-10-28 03:28:58][train:194][INFO] Running validation...
148
+ [2025-10-28 03:30:38][logger:171][INFO] [step: 1090519040] [val/train_token_count: 1090519040] [val/train_batch_count: 520] [val/train_flop_count: 0] [val/train_total_time: 6031.410] [val/train_update_time: 3513.095] [val/loss: 5.663] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.906] [val/val_tokens_per_second: 409985.253] [val/loss_avg_len_2048: 5.663] [val/perplexity_len_2048: 287.938] [val/loss_avg_len_1024: 5.683] [val/perplexity_len_1024: 293.773] [val/loss_avg_len_512: 5.713] [val/perplexity_len_512: 302.903]
149
+ [2025-10-28 03:31:46][utils:57][INFO] [P: 53.00%] [S: 1111490560/2097152000] [T: 1:43:18] [ETA: 1:31:37] [loss: 5.642] [tokens/s: 177326.588] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
150
+ [2025-10-28 03:32:53][utils:57][INFO] [P: 54.00%] [S: 1132462080/2097152000] [T: 1:44:26] [ETA: 1:28:58] [loss: 5.654] [tokens/s: 193938.037] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
151
+ [2025-10-28 03:32:53][train:194][INFO] Running validation...
152
+ [2025-10-28 03:34:33][logger:171][INFO] [step: 1132462080] [val/train_token_count: 1132462080] [val/train_batch_count: 540] [val/train_flop_count: 0] [val/train_total_time: 6266.654] [val/train_update_time: 3648.162] [val/loss: 5.651] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.816] [val/val_tokens_per_second: 410355.068] [val/loss_avg_len_2048: 5.651] [val/perplexity_len_2048: 284.515] [val/loss_avg_len_1024: 5.671] [val/perplexity_len_1024: 290.340] [val/loss_avg_len_512: 5.702] [val/perplexity_len_512: 299.459]
153
+ [2025-10-28 03:35:41][utils:57][INFO] [P: 55.00%] [S: 1153433600/2097152000] [T: 1:47:14] [ETA: 1:27:44] [loss: 5.601] [tokens/s: 177398.714] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
154
+ [2025-10-28 03:35:41][logger:171][INFO] [step: 1153433600] [train_eval/train_token_count: 1153433600] [train_eval/train_batch_count: 550] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6434.099] [train_eval/train_update_time: 3715.676] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.656] [train_eval/perplexity_len_2048: 285.901] [train_eval/loss_avg_len_1024: 5.670] [train_eval/perplexity_len_1024: 289.963] [train_eval/loss_avg_len_512: 5.696] [train_eval/perplexity_len_512: 297.761]
155
+ [2025-10-28 03:36:48][utils:57][INFO] [P: 56.00%] [S: 1174405120/2097152000] [T: 1:48:21] [ETA: 1:25:08] [loss: 5.638] [tokens/s: 193941.089] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
156
+ [2025-10-28 03:36:48][train:194][INFO] Running validation...
157
+ [2025-10-28 03:38:29][logger:171][INFO] [step: 1174405120] [val/train_token_count: 1174405120] [val/train_batch_count: 560] [val/train_flop_count: 0] [val/train_total_time: 6501.730] [val/train_update_time: 3783.195] [val/loss: 5.633] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.261] [val/val_tokens_per_second: 408531.878] [val/loss_avg_len_2048: 5.633] [val/perplexity_len_2048: 279.424] [val/loss_avg_len_1024: 5.653] [val/perplexity_len_1024: 285.247] [val/loss_avg_len_512: 5.685] [val/perplexity_len_512: 294.396]
158
+ [2025-10-28 03:39:36][utils:57][INFO] [P: 57.00%] [S: 1195376640/2097152000] [T: 1:51:09] [ETA: 1:23:51] [loss: 5.594] [tokens/s: 177336.460] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
159
+ [2025-10-28 03:40:44][utils:57][INFO] [P: 58.00%] [S: 1216348160/2097152000] [T: 1:52:17] [ETA: 1:21:18] [loss: 5.651] [tokens/s: 193966.303] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
160
+ [2025-10-28 03:40:44][train:194][INFO] Running validation...
161
+ [2025-10-28 03:42:24][logger:171][INFO] [step: 1216348160] [val/train_token_count: 1216348160] [val/train_batch_count: 580] [val/train_flop_count: 0] [val/train_total_time: 6737.261] [val/train_update_time: 3918.230] [val/loss: 5.620] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.222] [val/val_tokens_per_second: 408691.590] [val/loss_avg_len_2048: 5.620] [val/perplexity_len_2048: 275.776] [val/loss_avg_len_1024: 5.641] [val/perplexity_len_1024: 281.647] [val/loss_avg_len_512: 5.673] [val/perplexity_len_512: 290.917]
162
+ [2025-10-28 03:43:32][utils:57][INFO] [P: 59.00%] [S: 1237319680/2097152000] [T: 1:55:05] [ETA: 1:19:58] [loss: 5.652] [tokens/s: 177362.441] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
163
+ [2025-10-28 03:44:39][utils:57][INFO] [P: 60.00%] [S: 1258291200/2097152000] [T: 1:56:12] [ETA: 1:17:28] [loss: 5.643] [tokens/s: 194070.780] [batches/s: 0.093] [MFU: 0.000] [TFLOPS: 0.000]
164
+ [2025-10-28 03:44:39][logger:171][INFO] [step: 1258291200] [train_eval/train_token_count: 1258291200] [train_eval/train_batch_count: 600] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6972.699] [train_eval/train_update_time: 4053.228] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.618] [train_eval/perplexity_len_2048: 275.425] [train_eval/loss_avg_len_1024: 5.633] [train_eval/perplexity_len_1024: 279.583] [train_eval/loss_avg_len_512: 5.662] [train_eval/perplexity_len_512: 287.863]
165
+ [2025-10-28 03:44:39][train:194][INFO] Running validation...
166
+ [2025-10-28 03:46:20][logger:171][INFO] [step: 1258291200] [val/train_token_count: 1258291200] [val/train_batch_count: 600] [val/train_flop_count: 0] [val/train_total_time: 6972.699] [val/train_update_time: 4053.228] [val/loss: 5.607] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.519] [val/val_tokens_per_second: 407485.827] [val/loss_avg_len_2048: 5.607] [val/perplexity_len_2048: 272.312] [val/loss_avg_len_1024: 5.628] [val/perplexity_len_1024: 278.191] [val/loss_avg_len_512: 5.661] [val/perplexity_len_512: 287.474]
167
+ [2025-10-28 03:46:20][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001258291200.pt...
168
+ [2025-10-28 03:46:20][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001258291200.pt.
169
+ [2025-10-28 03:46:20][logger:171][INFO] [step: 1258291200] [checkpoint/checkpoint_time: 0.547]
170
+ [2025-10-28 03:47:28][utils:57][INFO] [P: 61.00%] [S: 1279262720/2097152000] [T: 1:59:01] [ETA: 1:16:05] [loss: 5.618] [tokens/s: 177320.138] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
171
+ [2025-10-28 03:48:36][utils:57][INFO] [P: 62.00%] [S: 1300234240/2097152000] [T: 2:00:08] [ETA: 1:13:38] [loss: 5.589] [tokens/s: 193873.657] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
172
+ [2025-10-28 03:48:36][train:194][INFO] Running validation...
173
+ [2025-10-28 03:50:16][logger:171][INFO] [step: 1300234240] [val/train_token_count: 1300234240] [val/train_batch_count: 620] [val/train_flop_count: 0] [val/train_total_time: 7208.983] [val/train_update_time: 4188.229] [val/loss: 5.596] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.085] [val/val_tokens_per_second: 409252.574] [val/loss_avg_len_2048: 5.596] [val/perplexity_len_2048: 269.274] [val/loss_avg_len_1024: 5.617] [val/perplexity_len_1024: 275.188] [val/loss_avg_len_512: 5.651] [val/perplexity_len_512: 284.560]
174
+ [2025-10-28 03:51:23][utils:57][INFO] [P: 63.00%] [S: 1321205760/2097152000] [T: 2:02:56] [ETA: 1:12:12] [loss: 5.613] [tokens/s: 177308.237] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
175
+ [2025-10-28 03:52:31][utils:57][INFO] [P: 64.00%] [S: 1342177280/2097152000] [T: 2:04:04] [ETA: 1:09:47] [loss: 5.598] [tokens/s: 193846.873] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
176
+ [2025-10-28 03:52:31][train:194][INFO] Running validation...
177
+ [2025-10-28 03:54:11][logger:171][INFO] [step: 1342177280] [val/train_token_count: 1342177280] [val/train_batch_count: 640] [val/train_flop_count: 0] [val/train_total_time: 7444.285] [val/train_update_time: 4323.227] [val/loss: 5.585] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.351] [val/val_tokens_per_second: 408168.991] [val/loss_avg_len_2048: 5.585] [val/perplexity_len_2048: 266.352] [val/loss_avg_len_1024: 5.607] [val/perplexity_len_1024: 272.291] [val/loss_avg_len_512: 5.641] [val/perplexity_len_512: 281.717]
178
+ [2025-10-28 03:55:19][utils:57][INFO] [P: 65.00%] [S: 1363148800/2097152000] [T: 2:06:52] [ETA: 1:08:18] [loss: 5.568] [tokens/s: 177245.482] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
179
+ [2025-10-28 03:55:19][logger:171][INFO] [step: 1363148800] [train_eval/train_token_count: 1363148800] [train_eval/train_batch_count: 650] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7612.232] [train_eval/train_update_time: 4390.719] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.597] [train_eval/perplexity_len_2048: 269.491] [train_eval/loss_avg_len_1024: 5.617] [train_eval/perplexity_len_1024: 274.991] [train_eval/loss_avg_len_512: 5.647] [train_eval/perplexity_len_512: 283.339]
180
+ [2025-10-28 03:56:26][utils:57][INFO] [P: 66.00%] [S: 1384120320/2097152000] [T: 2:07:59] [ETA: 1:05:56] [loss: 5.591] [tokens/s: 193842.688] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
181
+ [2025-10-28 03:56:26][train:194][INFO] Running validation...
182
+ [2025-10-28 03:58:06][logger:171][INFO] [step: 1384120320] [val/train_token_count: 1384120320] [val/train_batch_count: 660] [val/train_flop_count: 0] [val/train_total_time: 7679.832] [val/train_update_time: 4458.219] [val/loss: 5.575] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.008] [val/val_tokens_per_second: 409566.848] [val/loss_avg_len_2048: 5.575] [val/perplexity_len_2048: 263.717] [val/loss_avg_len_1024: 5.597] [val/perplexity_len_1024: 269.670] [val/loss_avg_len_512: 5.632] [val/perplexity_len_512: 279.145]
183
+ [2025-10-28 03:59:14][utils:57][INFO] [P: 67.00%] [S: 1405091840/2097152000] [T: 2:10:47] [ETA: 1:04:25] [loss: 5.548] [tokens/s: 177291.556] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
184
+ [2025-10-28 04:00:22][utils:57][INFO] [P: 68.00%] [S: 1426063360/2097152000] [T: 2:11:55] [ETA: 1:02:04] [loss: 5.566] [tokens/s: 193892.423] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
185
+ [2025-10-28 04:00:22][train:194][INFO] Running validation...
186
+ [2025-10-28 04:02:02][logger:171][INFO] [step: 1426063360] [val/train_token_count: 1426063360] [val/train_batch_count: 680] [val/train_flop_count: 0] [val/train_total_time: 7915.051] [val/train_update_time: 4593.218] [val/loss: 5.566] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.562] [val/val_tokens_per_second: 407309.717] [val/loss_avg_len_2048: 5.566] [val/perplexity_len_2048: 261.397] [val/loss_avg_len_1024: 5.589] [val/perplexity_len_1024: 267.359] [val/loss_avg_len_512: 5.624] [val/perplexity_len_512: 276.891]
187
+ [2025-10-28 04:03:10][utils:57][INFO] [P: 69.00%] [S: 1447034880/2097152000] [T: 2:14:43] [ETA: 1:00:31] [loss: 5.576] [tokens/s: 177245.079] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
188
+ [2025-10-28 04:04:17][utils:57][INFO] [P: 70.00%] [S: 1468006400/2097152000] [T: 2:15:50] [ETA: 0:58:13] [loss: 5.572] [tokens/s: 193982.886] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
189
+ [2025-10-28 04:04:17][logger:171][INFO] [step: 1468006400] [train_eval/train_token_count: 1468006400] [train_eval/train_batch_count: 700] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 8150.835] [train_eval/train_update_time: 4728.230] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.567] [train_eval/perplexity_len_2048: 261.654] [train_eval/loss_avg_len_1024: 5.587] [train_eval/perplexity_len_1024: 266.970] [train_eval/loss_avg_len_512: 5.621] [train_eval/perplexity_len_512: 276.270]
190
+ [2025-10-28 04:04:17][train:194][INFO] Running validation...
191
+ [2025-10-28 04:05:57][logger:171][INFO] [step: 1468006400] [val/train_token_count: 1468006400] [val/train_batch_count: 700] [val/train_flop_count: 0] [val/train_total_time: 8150.835] [val/train_update_time: 4728.230] [val/loss: 5.558] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.887] [val/val_tokens_per_second: 410062.622] [val/loss_avg_len_2048: 5.558] [val/perplexity_len_2048: 259.375] [val/loss_avg_len_1024: 5.581] [val/perplexity_len_1024: 265.356] [val/loss_avg_len_512: 5.616] [val/perplexity_len_512: 274.916]
192
+ [2025-10-28 04:05:57][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001468006400.pt...
193
+ [2025-10-28 04:05:58][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001468006400.pt.
194
+ [2025-10-28 04:05:58][logger:171][INFO] [step: 1468006400] [checkpoint/checkpoint_time: 0.542]
195
+ [2025-10-28 04:07:06][utils:57][INFO] [P: 71.00%] [S: 1488977920/2097152000] [T: 2:18:38] [ETA: 0:56:37] [loss: 5.558] [tokens/s: 177340.233] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
196
+ [2025-10-28 04:08:13][utils:57][INFO] [P: 72.00%] [S: 1509949440/2097152000] [T: 2:19:46] [ETA: 0:54:21] [loss: 5.551] [tokens/s: 193920.123] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
197
+ [2025-10-28 04:08:13][train:194][INFO] Running validation...
198
+ [2025-10-28 04:09:53][logger:171][INFO] [step: 1509949440] [val/train_token_count: 1509949440] [val/train_batch_count: 720] [val/train_flop_count: 0] [val/train_total_time: 8386.480] [val/train_update_time: 4863.236] [val/loss: 5.550] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.086] [val/val_tokens_per_second: 409249.749] [val/loss_avg_len_2048: 5.550] [val/perplexity_len_2048: 257.289] [val/loss_avg_len_1024: 5.573] [val/perplexity_len_1024: 263.280] [val/loss_avg_len_512: 5.609] [val/perplexity_len_512: 272.875]
199
+ [2025-10-28 04:11:01][utils:57][INFO] [P: 73.00%] [S: 1530920960/2097152000] [T: 2:22:34] [ETA: 0:52:43] [loss: 5.573] [tokens/s: 177341.769] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
200
+ [2025-10-28 04:12:08][utils:57][INFO] [P: 74.00%] [S: 1551892480/2097152000] [T: 2:23:41] [ETA: 0:50:29] [loss: 5.564] [tokens/s: 193966.117] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
201
+ [2025-10-28 04:12:08][train:194][INFO] Running validation...
202
+ [2025-10-28 04:13:49][logger:171][INFO] [step: 1551892480] [val/train_token_count: 1551892480] [val/train_batch_count: 740] [val/train_flop_count: 0] [val/train_total_time: 8621.792] [val/train_update_time: 4998.251] [val/loss: 5.543] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.185] [val/val_tokens_per_second: 408845.489] [val/loss_avg_len_2048: 5.543] [val/perplexity_len_2048: 255.557] [val/loss_avg_len_1024: 5.567] [val/perplexity_len_1024: 261.566] [val/loss_avg_len_512: 5.603] [val/perplexity_len_512: 271.196]
203
+ [2025-10-28 04:14:56][utils:57][INFO] [P: 75.00%] [S: 1572864000/2097152000] [T: 2:26:29] [ETA: 0:48:49] [loss: 5.561] [tokens/s: 177362.861] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
204
+ [2025-10-28 04:14:56][logger:171][INFO] [step: 1572864000] [train_eval/train_token_count: 1572864000] [train_eval/train_batch_count: 750] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 8789.584] [train_eval/train_update_time: 5065.749] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.549] [train_eval/perplexity_len_2048: 256.889] [train_eval/loss_avg_len_1024: 5.570] [train_eval/perplexity_len_1024: 262.327] [train_eval/loss_avg_len_512: 5.604] [train_eval/perplexity_len_512: 271.396]
205
+ [2025-10-28 04:16:04][utils:57][INFO] [P: 76.00%] [S: 1593835520/2097152000] [T: 2:27:37] [ETA: 0:46:37] [loss: 5.501] [tokens/s: 193929.065] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
206
+ [2025-10-28 04:16:04][train:194][INFO] Running validation...
207
+ [2025-10-28 04:17:44][logger:171][INFO] [step: 1593835520] [val/train_token_count: 1593835520] [val/train_batch_count: 760] [val/train_flop_count: 0] [val/train_total_time: 8857.200] [val/train_update_time: 5133.254] [val/loss: 5.538] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.439] [val/val_tokens_per_second: 407810.770] [val/loss_avg_len_2048: 5.538] [val/perplexity_len_2048: 254.103] [val/loss_avg_len_1024: 5.561] [val/perplexity_len_1024: 260.106] [val/loss_avg_len_512: 5.598] [val/perplexity_len_512: 269.763]
208
+ [2025-10-28 04:18:52][utils:57][INFO] [P: 77.00%] [S: 1614807040/2097152000] [T: 2:30:25] [ETA: 0:44:55] [loss: 5.579] [tokens/s: 177295.539] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
209
+ [2025-10-28 04:19:59][utils:57][INFO] [P: 78.00%] [S: 1635778560/2097152000] [T: 2:31:32] [ETA: 0:42:44] [loss: 5.490] [tokens/s: 193947.109] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
210
+ [2025-10-28 04:19:59][train:194][INFO] Running validation...
211
+ [2025-10-28 04:21:40][logger:171][INFO] [step: 1635778560] [val/train_token_count: 1635778560] [val/train_batch_count: 780] [val/train_flop_count: 0] [val/train_total_time: 9092.876] [val/train_update_time: 5268.263] [val/loss: 5.532] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.486] [val/val_tokens_per_second: 407619.115] [val/loss_avg_len_2048: 5.532] [val/perplexity_len_2048: 252.762] [val/loss_avg_len_1024: 5.556] [val/perplexity_len_1024: 258.793] [val/loss_avg_len_512: 5.593] [val/perplexity_len_512: 268.502]
212
+ [2025-10-28 04:22:48][utils:57][INFO] [P: 79.00%] [S: 1656750080/2097152000] [T: 2:34:20] [ETA: 0:41:01] [loss: 5.528] [tokens/s: 177302.413] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
213
+ [2025-10-28 04:23:55][utils:57][INFO] [P: 80.00%] [S: 1677721600/2097152000] [T: 2:35:28] [ETA: 0:38:52] [loss: 5.501] [tokens/s: 193938.822] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
214
+ [2025-10-28 04:23:55][logger:171][INFO] [step: 1677721600] [train_eval/train_token_count: 1677721600] [train_eval/train_batch_count: 800] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 9328.580] [train_eval/train_update_time: 5403.267] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.534] [train_eval/perplexity_len_2048: 253.146] [train_eval/loss_avg_len_1024: 5.557] [train_eval/perplexity_len_1024: 259.015] [train_eval/loss_avg_len_512: 5.591] [train_eval/perplexity_len_512: 267.937]
215
+ [2025-10-28 04:23:55][train:194][INFO] Running validation...
216
+ [2025-10-28 04:25:36][logger:171][INFO] [step: 1677721600] [val/train_token_count: 1677721600] [val/train_batch_count: 800] [val/train_flop_count: 0] [val/train_total_time: 9328.580] [val/train_update_time: 5403.267] [val/loss: 5.528] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.730] [val/val_tokens_per_second: 406631.809] [val/loss_avg_len_2048: 5.528] [val/perplexity_len_2048: 251.598] [val/loss_avg_len_1024: 5.552] [val/perplexity_len_1024: 257.636] [val/loss_avg_len_512: 5.589] [val/perplexity_len_512: 267.351]
217
+ [2025-10-28 04:25:36][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001677721600.pt...
218
+ [2025-10-28 04:25:36][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001677721600.pt.
219
+ [2025-10-28 04:25:36][logger:171][INFO] [step: 1677721600] [checkpoint/checkpoint_time: 0.541]
220
+ [2025-10-28 04:26:44][utils:57][INFO] [P: 81.00%] [S: 1698693120/2097152000] [T: 2:38:17] [ETA: 0:37:07] [loss: 5.472] [tokens/s: 177175.592] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
221
+ [2025-10-28 04:27:52][utils:57][INFO] [P: 82.00%] [S: 1719664640/2097152000] [T: 2:39:25] [ETA: 0:34:59] [loss: 5.507] [tokens/s: 193722.586] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
222
+ [2025-10-28 04:27:52][train:194][INFO] Running validation...
223
+ [2025-10-28 04:29:32][logger:171][INFO] [step: 1719664640] [val/train_token_count: 1719664640] [val/train_batch_count: 820] [val/train_flop_count: 0] [val/train_total_time: 9565.069] [val/train_update_time: 5538.269] [val/loss: 5.524] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.263] [val/val_tokens_per_second: 408527.292] [val/loss_avg_len_2048: 5.524] [val/perplexity_len_2048: 250.618] [val/loss_avg_len_1024: 5.548] [val/perplexity_len_1024: 256.657] [val/loss_avg_len_512: 5.585] [val/perplexity_len_512: 266.386]
224
+ [2025-10-28 04:30:40][utils:57][INFO] [P: 83.00%] [S: 1740636160/2097152000] [T: 2:42:12] [ETA: 0:33:13] [loss: 5.533] [tokens/s: 177147.677] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
225
+ [2025-10-28 04:31:47][utils:57][INFO] [P: 84.00%] [S: 1761607680/2097152000] [T: 2:43:20] [ETA: 0:31:06] [loss: 5.483] [tokens/s: 193706.192] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
226
+ [2025-10-28 04:31:47][train:194][INFO] Running validation...
227
+ [2025-10-28 04:33:28][logger:171][INFO] [step: 1761607680] [val/train_token_count: 1761607680] [val/train_batch_count: 840] [val/train_flop_count: 0] [val/train_total_time: 9800.571] [val/train_update_time: 5673.296] [val/loss: 5.521] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.787] [val/val_tokens_per_second: 406402.771] [val/loss_avg_len_2048: 5.521] [val/perplexity_len_2048: 249.779] [val/loss_avg_len_1024: 5.544] [val/perplexity_len_1024: 255.825] [val/loss_avg_len_512: 5.582] [val/perplexity_len_512: 265.578]
228
+ [2025-10-28 04:34:36][utils:57][INFO] [P: 85.00%] [S: 1782579200/2097152000] [T: 2:46:08] [ETA: 0:29:19] [loss: 5.566] [tokens/s: 177052.208] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
229
+ [2025-10-28 04:34:36][logger:171][INFO] [step: 1782579200] [train_eval/train_token_count: 1782579200] [train_eval/train_batch_count: 850] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 9968.994] [train_eval/train_update_time: 5740.817] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.516] [train_eval/perplexity_len_2048: 248.681] [train_eval/loss_avg_len_1024: 5.533] [train_eval/perplexity_len_1024: 252.830] [train_eval/loss_avg_len_512: 5.568] [train_eval/perplexity_len_512: 261.814]
230
+ [2025-10-28 04:35:43][utils:57][INFO] [P: 86.00%] [S: 1803550720/2097152000] [T: 2:47:16] [ETA: 0:27:13] [loss: 5.534] [tokens/s: 193639.040] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
231
+ [2025-10-28 04:35:43][train:194][INFO] Running validation...
232
+ [2025-10-28 04:37:23][logger:171][INFO] [step: 1803550720] [val/train_token_count: 1803550720] [val/train_batch_count: 860] [val/train_flop_count: 0] [val/train_total_time: 10036.600] [val/train_update_time: 5808.321] [val/loss: 5.518] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.266] [val/val_tokens_per_second: 408513.536] [val/loss_avg_len_2048: 5.518] [val/perplexity_len_2048: 249.091] [val/loss_avg_len_1024: 5.542] [val/perplexity_len_1024: 255.147] [val/loss_avg_len_512: 5.579] [val/perplexity_len_512: 264.910]
233
+ [2025-10-28 04:38:31][utils:57][INFO] [P: 87.00%] [S: 1824522240/2097152000] [T: 2:50:04] [ETA: 0:25:24] [loss: 5.472] [tokens/s: 177078.559] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
234
+ [2025-10-28 04:39:39][utils:57][INFO] [P: 88.00%] [S: 1845493760/2097152000] [T: 2:51:12] [ETA: 0:23:20] [loss: 5.504] [tokens/s: 193683.764] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
235
+ [2025-10-28 04:39:39][train:194][INFO] Running validation...
236
+ [2025-10-28 04:41:19][logger:171][INFO] [step: 1845493760] [val/train_token_count: 1845493760] [val/train_batch_count: 880] [val/train_flop_count: 0] [val/train_total_time: 10272.080] [val/train_update_time: 5943.330] [val/loss: 5.516] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.354] [val/val_tokens_per_second: 408153.153] [val/loss_avg_len_2048: 5.516] [val/perplexity_len_2048: 248.572] [val/loss_avg_len_1024: 5.540] [val/perplexity_len_1024: 254.629] [val/loss_avg_len_512: 5.577] [val/perplexity_len_512: 264.398]
237
+ [2025-10-28 04:42:27][utils:57][INFO] [P: 89.00%] [S: 1866465280/2097152000] [T: 2:54:00] [ETA: 0:21:30] [loss: 5.567] [tokens/s: 177102.242] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
238
+ [2025-10-28 04:43:34][utils:57][INFO] [P: 90.00%] [S: 1887436800/2097152000] [T: 2:55:07] [ETA: 0:19:27] [loss: 5.464] [tokens/s: 193850.283] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
239
+ [2025-10-28 04:43:34][logger:171][INFO] [step: 1887436800] [train_eval/train_token_count: 1887436800] [train_eval/train_batch_count: 900] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 10507.650] [train_eval/train_update_time: 6078.328] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.516] [train_eval/perplexity_len_2048: 248.575] [train_eval/loss_avg_len_1024: 5.536] [train_eval/perplexity_len_1024: 253.742] [train_eval/loss_avg_len_512: 5.572] [train_eval/perplexity_len_512: 262.860]
240
+ [2025-10-28 04:43:34][train:194][INFO] Running validation...
241
+ [2025-10-28 04:45:15][logger:171][INFO] [step: 1887436800] [val/train_token_count: 1887436800] [val/train_batch_count: 900] [val/train_flop_count: 0] [val/train_total_time: 10507.650] [val/train_update_time: 6078.328] [val/loss: 5.514] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.790] [val/val_tokens_per_second: 406389.591] [val/loss_avg_len_2048: 5.514] [val/perplexity_len_2048: 248.160] [val/loss_avg_len_1024: 5.538] [val/perplexity_len_1024: 254.220] [val/loss_avg_len_512: 5.576] [val/perplexity_len_512: 264.001]
242
+ [2025-10-28 04:45:15][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001887436800.pt...
243
+ [2025-10-28 04:45:16][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001887436800.pt.
244
+ [2025-10-28 04:45:16][logger:171][INFO] [step: 1887436800] [checkpoint/checkpoint_time: 0.573]
245
+ [2025-10-28 04:46:55][utils:57][INFO] [P: 91.00%] [S: 1908408320/2097152000] [T: 2:58:27] [ETA: 0:17:39] [loss: 5.510] [tokens/s: 172480.392] [batches/s: 0.082] [MFU: 0.000] [TFLOPS: 0.000]
246
+ [2025-10-28 04:50:06][utils:57][INFO] [P: 92.00%] [S: 1929379840/2097152000] [T: 3:01:39] [ETA: 0:15:47] [loss: 5.545] [tokens/s: 169208.921] [batches/s: 0.081] [MFU: 0.000] [TFLOPS: 0.000]
247
+ [2025-10-28 04:50:06][train:194][INFO] Running validation...
248
+ [2025-10-28 04:54:23][logger:171][INFO] [step: 1929379840] [val/train_token_count: 1929379840] [val/train_batch_count: 920] [val/train_flop_count: 0] [val/train_total_time: 10899.097] [val/train_update_time: 6367.991] [val/loss: 5.513] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 256.654] [val/val_tokens_per_second: 159592.450] [val/loss_avg_len_2048: 5.513] [val/perplexity_len_2048: 247.864] [val/loss_avg_len_1024: 5.537] [val/perplexity_len_1024: 253.923] [val/loss_avg_len_512: 5.575] [val/perplexity_len_512: 263.702]
249
+ [2025-10-28 04:56:33][utils:57][INFO] [P: 93.00%] [S: 1950351360/2097152000] [T: 3:08:05] [ETA: 0:14:09] [loss: 5.524] [tokens/s: 134270.643] [batches/s: 0.064] [MFU: 0.000] [TFLOPS: 0.000]
250
+ [2025-10-28 04:59:38][utils:57][INFO] [P: 94.00%] [S: 1971322880/2097152000] [T: 3:11:11] [ETA: 0:12:12] [loss: 5.470] [tokens/s: 132805.321] [batches/s: 0.063] [MFU: 0.000] [TFLOPS: 0.000]
251
+ [2025-10-28 04:59:38][train:194][INFO] Running validation...
252
+ [2025-10-28 05:04:08][logger:171][INFO] [step: 1971322880] [val/train_token_count: 1971322880] [val/train_batch_count: 940] [val/train_flop_count: 0] [val/train_total_time: 11471.464] [val/train_update_time: 6683.227] [val/loss: 5.512] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 269.618] [val/val_tokens_per_second: 151918.872] [val/loss_avg_len_2048: 5.512] [val/perplexity_len_2048: 247.668] [val/loss_avg_len_1024: 5.536] [val/perplexity_len_1024: 253.726] [val/loss_avg_len_512: 5.574] [val/perplexity_len_512: 263.505]
253
+ [2025-10-28 05:06:18][utils:57][INFO] [P: 95.00%] [S: 1992294400/2097152000] [T: 3:17:51] [ETA: 0:10:24] [loss: 5.509] [tokens/s: 109514.305] [batches/s: 0.052] [MFU: 0.000] [TFLOPS: 0.000]
254
+ [2025-10-28 05:06:18][logger:171][INFO] [step: 1992294400] [train_eval/train_token_count: 1992294400] [train_eval/train_batch_count: 950] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 11871.571] [train_eval/train_update_time: 6813.441] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.506] [train_eval/perplexity_len_2048: 246.222] [train_eval/loss_avg_len_1024: 5.530] [train_eval/perplexity_len_1024: 252.035] [train_eval/loss_avg_len_512: 5.564] [train_eval/perplexity_len_512: 260.951]
255
+ [2025-10-28 05:09:01][utils:57][INFO] [P: 96.00%] [S: 2013265920/2097152000] [T: 3:20:34] [ETA: 0:08:21] [loss: 5.492] [tokens/s: 109797.220] [batches/s: 0.052] [MFU: 0.000] [TFLOPS: 0.000]
256
+ [2025-10-28 05:09:01][train:194][INFO] Running validation...
257
+ [2025-10-28 05:13:46][logger:171][INFO] [step: 2013265920] [val/train_token_count: 2013265920] [val/train_batch_count: 960] [val/train_flop_count: 0] [val/train_total_time: 12034.565] [val/train_update_time: 6976.259] [val/loss: 5.512] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 284.595] [val/val_tokens_per_second: 143923.791] [val/loss_avg_len_2048: 5.512] [val/perplexity_len_2048: 247.558] [val/loss_avg_len_1024: 5.536] [val/perplexity_len_1024: 253.618] [val/loss_avg_len_512: 5.574] [val/perplexity_len_512: 263.405]
258
+ [2025-10-28 05:15:56][utils:57][INFO] [P: 97.00%] [S: 2034237440/2097152000] [T: 3:27:29] [ETA: 0:06:25] [loss: 5.528] [tokens/s: 92744.415] [batches/s: 0.044] [MFU: 0.000] [TFLOPS: 0.000]
259
+ [2025-10-28 05:18:13][utils:57][INFO] [P: 98.00%] [S: 2055208960/2097152000] [T: 3:29:46] [ETA: 0:04:16] [loss: 5.510] [tokens/s: 94077.610] [batches/s: 0.045] [MFU: 0.000] [TFLOPS: 0.000]
260
+ [2025-10-28 05:18:13][train:194][INFO] Running validation...
261
+ [2025-10-28 05:23:14][logger:171][INFO] [step: 2055208960] [val/train_token_count: 2055208960] [val/train_batch_count: 980] [val/train_flop_count: 0] [val/train_total_time: 12586.090] [val/train_update_time: 7242.766] [val/loss: 5.511] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 301.208] [val/val_tokens_per_second: 135985.772] [val/loss_avg_len_2048: 5.511] [val/perplexity_len_2048: 247.507] [val/loss_avg_len_1024: 5.536] [val/perplexity_len_1024: 253.567] [val/loss_avg_len_512: 5.573] [val/perplexity_len_512: 263.350]
262
+ [2025-10-28 05:23:14][train:854][INFO] Training finished with 2055208960 tokens!
metrics/jsonlines/checkpoint.jsonl CHANGED
@@ -1,9 +1,9 @@
1
- {"step": 209715200, "checkpoint/checkpoint_time": 0.4586985750356689}
2
- {"step": 419430400, "checkpoint/checkpoint_time": 0.452107127988711}
3
- {"step": 629145600, "checkpoint/checkpoint_time": 0.45928702398668975}
4
- {"step": 838860800, "checkpoint/checkpoint_time": 0.4599621079978533}
5
- {"step": 1048576000, "checkpoint/checkpoint_time": 0.4533659809967503}
6
- {"step": 1258291200, "checkpoint/checkpoint_time": 0.4462293910328299}
7
- {"step": 1468006400, "checkpoint/checkpoint_time": 0.4543691629660316}
8
- {"step": 1677721600, "checkpoint/checkpoint_time": 0.5127520990208723}
9
- {"step": 1887436800, "checkpoint/checkpoint_time": 0.4454628659877926}
 
1
+ {"step": 209715200, "checkpoint/checkpoint_time": 0.4554534360067919}
2
+ {"step": 419430400, "checkpoint/checkpoint_time": 0.44249288097489625}
3
+ {"step": 629145600, "checkpoint/checkpoint_time": 0.42379301704932004}
4
+ {"step": 838860800, "checkpoint/checkpoint_time": 0.4248116289963946}
5
+ {"step": 1048576000, "checkpoint/checkpoint_time": 0.43460876401513815}
6
+ {"step": 1258291200, "checkpoint/checkpoint_time": 0.5474804400000721}
7
+ {"step": 1468006400, "checkpoint/checkpoint_time": 0.5417233019834384}
8
+ {"step": 1677721600, "checkpoint/checkpoint_time": 0.5410387290176004}
9
+ {"step": 1887436800, "checkpoint/checkpoint_time": 0.5728158770361915}
metrics/jsonlines/norm.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
metrics/jsonlines/throughput.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
metrics/jsonlines/train.jsonl CHANGED
@@ -1,98 +1,98 @@
1
- {"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 59.67119211301906, "train/update_time": 59.47943267202936, "train/lr": 0.0009000000000000001, "train/loss": 9.761818885803223, "train/global_grad_norm": 1.2346482276916504}
2
- {"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 115.56162341398885, "train/update_time": 115.25218190002488, "train/lr": 0.0009997960964140947, "train/loss": 8.126625061035156, "train/global_grad_norm": 0.962837278842926}
3
- {"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 261.80084426596295, "train/update_time": 171.00324709707638, "train/lr": 0.0009990914580222257, "train/loss": 7.519778728485107, "train/global_grad_norm": 0.5695855021476746}
4
- {"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 317.6585605319706, "train/update_time": 226.74485654599266, "train/lr": 0.0009978842768382998, "train/loss": 7.193304061889648, "train/global_grad_norm": 0.4217643439769745}
5
- {"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 464.0044742010068, "train/update_time": 282.4727703850367, "train/lr": 0.0009961757683914405, "train/loss": 6.9472150802612305, "train/global_grad_norm": 0.26760002970695496}
6
- {"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 519.8498190940008, "train/update_time": 338.1969051870401, "train/lr": 0.00099396765300483, "train/loss": 6.68041467666626, "train/global_grad_norm": 0.31579363346099854}
7
- {"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 665.9225179139758, "train/update_time": 393.9277793511865, "train/lr": 0.0009912621540634887, "train/loss": 6.480125904083252, "train/global_grad_norm": 0.26012396812438965}
8
- {"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 721.7491259319941, "train/update_time": 449.6537483881111, "train/lr": 0.000988061995775515, "train/loss": 6.281551837921143, "train/global_grad_norm": 0.39679110050201416}
9
- {"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 867.7238940689713, "train/update_time": 505.3731428361498, "train/lr": 0.0009843704004290394, "train/loss": 6.122912406921387, "train/global_grad_norm": 1.23171067237854}
10
- {"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 923.5610087590176, "train/update_time": 561.0979154942906, "train/lr": 0.0009801910851476522, "train/loss": 5.9722723960876465, "train/global_grad_norm": 0.3574962913990021}
11
- {"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 1070.0474769069697, "train/update_time": 616.8178668163018, "train/lr": 0.0009755282581475768, "train/loss": 5.849911212921143, "train/global_grad_norm": 0.38126564025878906}
12
- {"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 1125.8911167309852, "train/update_time": 672.5490376223461, "train/lr": 0.0009703866145003512, "train/loss": 5.7178874015808105, "train/global_grad_norm": 0.6952179670333862}
13
- {"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 1271.9843903059955, "train/update_time": 728.276082833414, "train/lr": 0.0009647713314052896, "train/loss": 5.644232749938965, "train/global_grad_norm": 0.34717857837677}
14
- {"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 1327.8194525539875, "train/update_time": 784.0098267712165, "train/lr": 0.0009586880629764817, "train/loss": 5.570384502410889, "train/global_grad_norm": 0.6765910983085632}
15
- {"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 1473.913084711996, "train/update_time": 839.7419353383593, "train/lr": 0.0009521429345495787, "train/loss": 5.444611072540283, "train/global_grad_norm": 0.4169935882091522}
16
- {"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 1529.7566971820197, "train/update_time": 895.4798201125232, "train/lr": 0.0009451425365140996, "train/loss": 5.40510368347168, "train/global_grad_norm": 0.709697961807251}
17
- {"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 1675.8973924720194, "train/update_time": 951.2102684524143, "train/lr": 0.000937693917677468, "train/loss": 5.298379421234131, "train/global_grad_norm": 0.35993462800979614}
18
- {"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 1731.7409806579817, "train/update_time": 1006.941321704362, "train/lr": 0.0009298045781674596, "train/loss": 5.267183303833008, "train/global_grad_norm": 0.45855849981307983}
19
- {"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 1877.79561357497, "train/update_time": 1062.6691502483445, "train/lr": 0.0009214824618802108, "train/loss": 5.240725994110107, "train/global_grad_norm": 0.45877301692962646}
20
- {"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 1933.6392927800189, "train/update_time": 1118.3960400532233, "train/lr": 0.000912735948481387, "train/loss": 5.148595809936523, "train/global_grad_norm": 0.5232999920845032}
21
- {"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 2080.2760781620163, "train/update_time": 1174.1330189242726, "train/lr": 0.0009035738449685707, "train/loss": 5.102267742156982, "train/global_grad_norm": 0.40673965215682983}
22
- {"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 2136.1286090469803, "train/update_time": 1229.8719967252691, "train/lr": 0.0008940053768033609, "train/loss": 5.072765827178955, "train/global_grad_norm": 0.540256679058075}
23
- {"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 2282.3240791389835, "train/update_time": 1285.6013174692634, "train/lr": 0.0008840401786221159, "train/loss": 5.013406276702881, "train/global_grad_norm": 0.4202441871166229}
24
- {"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 2338.156839519972, "train/update_time": 1341.322897736216, "train/lr": 0.0008736882845346905, "train/loss": 4.965211868286133, "train/global_grad_norm": 0.5850781798362732}
25
- {"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 2484.2231900609913, "train/update_time": 1397.0756743992679, "train/lr": 0.0008629601180209381, "train/loss": 4.961833477020264, "train/global_grad_norm": 0.6340895295143127}
26
- {"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 2540.0900395850185, "train/update_time": 1452.8315300212707, "train/lr": 0.0008518664814351503, "train/loss": 4.912302017211914, "train/global_grad_norm": 0.5044277310371399}
27
- {"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 2686.6876707019983, "train/update_time": 1508.5666796893347, "train/lr": 0.0008404185451290017, "train/loss": 4.897612571716309, "train/global_grad_norm": 0.4688912034034729}
28
- {"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 2742.5622889249935, "train/update_time": 1564.3145829213317, "train/lr": 0.0008286278362039527, "train/loss": 4.848834037780762, "train/global_grad_norm": 0.6365319490432739}
29
- {"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 2889.1227126879967, "train/update_time": 1620.060068657389, "train/lr": 0.0008165062269044352, "train/loss": 4.8169732093811035, "train/global_grad_norm": 0.4134746789932251}
30
- {"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 2944.979424642981, "train/update_time": 1675.8085315313656, "train/lr": 0.0008040659226635089, "train/loss": 4.79654598236084, "train/global_grad_norm": 0.5643511414527893}
31
- {"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 3091.6305087410146, "train/update_time": 1731.5482600294054, "train/lr": 0.0007913194498130252, "train/loss": 4.810868740081787, "train/global_grad_norm": 0.47013285756111145}
32
- {"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 3147.4849796229973, "train/update_time": 1787.300771905575, "train/lr": 0.000778279642970672, "train/loss": 4.74250602722168, "train/global_grad_norm": 0.5142323970794678}
33
- {"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 3293.557014766964, "train/update_time": 1843.0341175765498, "train/lr": 0.0007649596321166025, "train/loss": 4.759753704071045, "train/global_grad_norm": 0.5028547644615173}
34
- {"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 3349.424384585989, "train/update_time": 1898.7890577405924, "train/lr": 0.0007513728293726579, "train/loss": 4.724730491638184, "train/global_grad_norm": 0.5188063383102417}
35
- {"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 3496.0525263110176, "train/update_time": 1954.5468697096221, "train/lr": 0.0007375329154974975, "train/loss": 4.704092502593994, "train/global_grad_norm": 0.4179239571094513}
36
- {"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 3551.9233703140053, "train/update_time": 2010.2989615525585, "train/lr": 0.0007234538261112341, "train/loss": 4.630825042724609, "train/global_grad_norm": 0.4399227201938629}
37
- {"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 3698.38593041501, "train/update_time": 2066.043420936505, "train/lr": 0.0007091497376634464, "train/loss": 4.655548095703125, "train/global_grad_norm": 0.45650508999824524}
38
- {"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 3754.2459046120057, "train/update_time": 2121.7969342375873, "train/lr": 0.0006946350531586958, "train/loss": 4.63443660736084, "train/global_grad_norm": 0.4673406481742859}
39
- {"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 3902.067099667969, "train/update_time": 2177.5441309445887, "train/lr": 0.0006799243876539214, "train/loss": 4.639521598815918, "train/global_grad_norm": 0.5377744436264038}
40
- {"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 3957.9235017200117, "train/update_time": 2233.291398033558, "train/lr": 0.0006650325535423166, "train/loss": 4.547835826873779, "train/global_grad_norm": 0.5047109127044678}
41
- {"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 4104.85487911501, "train/update_time": 2289.03639949864, "train/lr": 0.0006499745456385053, "train/loss": 4.572357654571533, "train/global_grad_norm": 0.6879011392593384}
42
- {"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 4160.708882857987, "train/update_time": 2344.7830602055765, "train/lr": 0.0006347655260800339, "train/loss": 4.565418720245361, "train/global_grad_norm": 0.428315132856369}
43
- {"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 4307.030373459973, "train/update_time": 2400.5389171724673, "train/lr": 0.0006194208090603844, "train/loss": 4.560233116149902, "train/global_grad_norm": 0.45447441935539246}
44
- {"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 4362.870210377965, "train/update_time": 2456.286583611334, "train/lr": 0.0006039558454088796, "train/loss": 4.5870771408081055, "train/global_grad_norm": 0.7089611887931824}
45
- {"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 4509.209549701016, "train/update_time": 2512.043382478296, "train/lr": 0.0005883862070330078, "train/loss": 4.5283427238464355, "train/global_grad_norm": 0.4208521842956543}
46
- {"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 4565.060955744993, "train/update_time": 2567.792047406314, "train/lr": 0.0005727275712388317, "train/loss": 4.496908187866211, "train/global_grad_norm": 0.6397818922996521}
47
- {"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 4711.463092761987, "train/update_time": 2623.54389296734, "train/lr": 0.0005569957049452703, "train/loss": 4.518903732299805, "train/global_grad_norm": 0.5339348316192627}
48
- {"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 4767.3171031199745, "train/update_time": 2679.296893617313, "train/lr": 0.0005412064488081482, "train/loss": 4.495401382446289, "train/global_grad_norm": 0.47157326340675354}
49
- {"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 4913.884267903981, "train/update_time": 2735.0481502541807, "train/lr": 0.0005253757012699972, "train/loss": 4.490736484527588, "train/global_grad_norm": 0.5239655375480652}
50
- {"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 4969.727964510967, "train/update_time": 2790.797476610227, "train/lr": 0.0005095194025516734, "train/loss": 4.4643659591674805, "train/global_grad_norm": 0.5247243642807007}
51
- {"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 5116.911609567003, "train/update_time": 2846.5413487541373, "train/lr": 0.0004936535186019053, "train/loss": 4.463287353515625, "train/global_grad_norm": 0.4336317479610443}
52
- {"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 5172.765950003988, "train/update_time": 2902.285505968146, "train/lr": 0.00047779402502093696, "train/loss": 4.457107067108154, "train/global_grad_norm": 0.6947441101074219}
53
- {"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 5319.3806196419755, "train/update_time": 2958.0443075241055, "train/lr": 0.0004619568909744525, "train/loss": 4.4143757820129395, "train/global_grad_norm": 0.45258453488349915}
54
- {"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 5375.2188087760005, "train/update_time": 3013.786068893096, "train/lr": 0.00044615806311398067, "train/loss": 4.424180030822754, "train/global_grad_norm": 0.4154273271560669}
55
- {"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 5521.543946499005, "train/update_time": 3069.52293490601, "train/lr": 0.0004304134495199673, "train/loss": 4.3700270652771, "train/global_grad_norm": 0.3898273706436157}
56
- {"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 5577.396705480001, "train/update_time": 3125.264993761957, "train/lr": 0.0004147389036836882, "train/loss": 4.413632869720459, "train/global_grad_norm": 0.5425747036933899}
57
- {"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 5723.710059185978, "train/update_time": 3181.0052098479937, "train/lr": 0.0003991502085441259, "train/loss": 4.3622026443481445, "train/global_grad_norm": 0.45439326763153076}
58
- {"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 5779.562335913011, "train/update_time": 3236.749040101946, "train/lr": 0.0003836630605958888, "train/loss": 4.410221576690674, "train/global_grad_norm": 0.4280547499656677}
59
- {"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 5925.651880743972, "train/update_time": 3292.4961628898745, "train/lr": 0.00036829305408417155, "train/loss": 4.391324520111084, "train/global_grad_norm": 0.42996275424957275}
60
- {"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 5981.512405745976, "train/update_time": 3348.2465934828506, "train/lr": 0.000353055665302672, "train/loss": 4.390552997589111, "train/global_grad_norm": 0.6177342534065247}
61
- {"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 6128.563732119976, "train/update_time": 3403.9983463209355, "train/lr": 0.0003379662370102746, "train/loss": 4.355296611785889, "train/global_grad_norm": 0.445901095867157}
62
- {"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 6184.421642497007, "train/update_time": 3459.7488345169113, "train/lr": 0.00032303996298219405, "train/loss": 4.329927444458008, "train/global_grad_norm": 0.4848615527153015}
63
- {"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 6330.95437408, "train/update_time": 3515.502175346832, "train/lr": 0.00030829187271113034, "train/loss": 4.3402838706970215, "train/global_grad_norm": 0.42915236949920654}
64
- {"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 6386.8215373010025, "train/update_time": 3571.2574781817966, "train/lr": 0.0002937368162738445, "train/loss": 4.330328464508057, "train/global_grad_norm": 0.44172123074531555}
65
- {"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 6532.972345999966, "train/update_time": 3627.0106727198, "train/lr": 0.0002793894493783894, "train/loss": 4.3035969734191895, "train/global_grad_norm": 0.4424532651901245}
66
- {"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 6588.8636812510085, "train/update_time": 3682.7761907348176, "train/lr": 0.00026526421860705474, "train/loss": 4.325634956359863, "train/global_grad_norm": 0.4446793496608734}
67
- {"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 6734.952371816966, "train/update_time": 3738.5206979417126, "train/lr": 0.0002513753468698824, "train/loss": 4.269580841064453, "train/global_grad_norm": 0.4529637098312378}
68
- {"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 6790.794109267998, "train/update_time": 3794.261904676736, "train/lr": 0.00023773681908340283, "train/loss": 4.283663749694824, "train/global_grad_norm": 0.445527583360672}
69
- {"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 6936.7826500849915, "train/update_time": 3850.002778201713, "train/lr": 0.00022436236808900823, "train/loss": 4.284794807434082, "train/global_grad_norm": 0.37836042046546936}
70
- {"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 6992.639220207988, "train/update_time": 3905.7538149688044, "train/lr": 0.00021126546082514682, "train/loss": 4.279749870300293, "train/global_grad_norm": 0.3362836241722107}
71
- {"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 7139.175099594984, "train/update_time": 3961.498215056723, "train/lr": 0.00019845928476725522, "train/loss": 4.276471138000488, "train/global_grad_norm": 0.3601376414299011}
72
- {"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 7195.047693797969, "train/update_time": 4017.255656591733, "train/lr": 0.0001859567346490913, "train/loss": 4.2520365715026855, "train/global_grad_norm": 0.3764491081237793}
73
- {"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 7341.3063276839675, "train/update_time": 4072.9893345796154, "train/lr": 0.00017377039947882782, "train/loss": 4.269729137420654, "train/global_grad_norm": 0.3962520360946655}
74
- {"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 7397.135954166006, "train/update_time": 4128.71436746855, "train/lr": 0.00016191254986299043, "train/loss": 4.254550933837891, "train/global_grad_norm": 0.357697457075119}
75
- {"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 7543.42272252898, "train/update_time": 4184.450604122656, "train/lr": 0.00015039512565099468, "train/loss": 4.237186431884766, "train/global_grad_norm": 0.34904253482818604}
76
- {"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 7599.255373338994, "train/update_time": 4240.1828456086805, "train/lr": 0.00013922972391273224, "train/loss": 4.198566436767578, "train/global_grad_norm": 0.3618724048137665}
77
- {"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 7745.557092848001, "train/update_time": 4295.924490743666, "train/lr": 0.00012842758726130281, "train/loss": 4.263113975524902, "train/global_grad_norm": 0.3145442306995392}
78
- {"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 7801.41744280397, "train/update_time": 4351.678691691719, "train/lr": 0.00011799959253265679, "train/loss": 4.1848530769348145, "train/global_grad_norm": 0.3598962128162384}
79
- {"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 7947.527445982967, "train/update_time": 4407.426374787698, "train/lr": 0.00010795623983354214, "train/loss": 4.2140374183654785, "train/global_grad_norm": 0.3123509883880615}
80
- {"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 8003.3749305050005, "train/update_time": 4463.172267011658, "train/lr": 9.830764196878872e-05, "train/loss": 4.1917405128479, "train/global_grad_norm": 0.31881648302078247}
81
- {"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 8150.418857850018, "train/update_time": 4518.926882733707, "train/lr": 8.906351425856951e-05, "train/loss": 4.167685508728027, "train/global_grad_norm": 0.29552316665649414}
82
- {"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 8206.275687849964, "train/update_time": 4574.681749307667, "train/lr": 8.02331647558977e-05, "train/loss": 4.179322242736816, "train/global_grad_norm": 0.281093567609787}
83
- {"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 8352.533770180016, "train/update_time": 4630.437151424645, "train/lr": 7.182548487420554e-05, "train/loss": 4.211834907531738, "train/global_grad_norm": 0.29659828543663025}
84
- {"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 8408.397738418018, "train/update_time": 4686.193155970657, "train/lr": 6.384894043444556e-05, "train/loss": 4.1608757972717285, "train/global_grad_norm": 0.29815351963043213}
85
- {"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 8554.563296968001, "train/update_time": 4741.9453661507, "train/lr": 5.6311563140726166e-05, "train/loss": 4.230018138885498, "train/global_grad_norm": 0.2653578221797943}
86
- {"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 8610.412180389976, "train/update_time": 4797.692331016588, "train/lr": 4.922094249306547e-05, "train/loss": 4.209297180175781, "train/global_grad_norm": 0.2605638802051544}
87
- {"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 8756.645623472985, "train/update_time": 4853.441611350689, "train/lr": 4.2584218145409916e-05, "train/loss": 4.1548752784729, "train/global_grad_norm": 0.2570478022098541}
88
- {"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 8812.488870778994, "train/update_time": 4909.191867132671, "train/lr": 3.6408072716606236e-05, "train/loss": 4.172904968261719, "train/global_grad_norm": 0.2740459740161896}
89
- {"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 8958.696731061966, "train/update_time": 4964.9637988246395, "train/lr": 3.069872506157217e-05, "train/loss": 4.228043079376221, "train/global_grad_norm": 0.25757673382759094}
90
- {"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 9014.539005896018, "train/update_time": 5020.712582220614, "train/lr": 2.5461924009435368e-05, "train/loss": 4.143199920654297, "train/global_grad_norm": 0.2552241086959839}
91
- {"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 9161.434383540007, "train/update_time": 5076.4567962596775, "train/lr": 2.0702942574950812e-05, "train/loss": 4.177771091461182, "train/global_grad_norm": 0.24890665709972382}
92
- {"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 9217.298647498013, "train/update_time": 5132.212492840539, "train/lr": 1.642657264902142e-05, "train/loss": 4.206305027008057, "train/global_grad_norm": 0.23305842280387878}
93
- {"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 9363.553384587984, "train/update_time": 5187.936918072519, "train/lr": 1.2637120173670358e-05, "train/loss": 4.190739154815674, "train/global_grad_norm": 0.22044338285923004}
94
- {"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 9419.395047847007, "train/update_time": 5243.674068749533, "train/lr": 9.338400806321978e-06, "train/loss": 4.147926330566406, "train/global_grad_norm": 0.22512836754322052}
95
- {"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 9565.561509469, "train/update_time": 5299.418035702605, "train/lr": 6.533736077758867e-06, "train/loss": 4.170260429382324, "train/global_grad_norm": 0.22401364147663116}
96
- {"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 9621.420487532974, "train/update_time": 5355.161261588568, "train/lr": 4.2259500476214406e-06, "train/loss": 4.168946266174316, "train/global_grad_norm": 0.215094193816185}
97
- {"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 9767.543393198983, "train/update_time": 5410.899587089545, "train/lr": 2.417366460819359e-06, "train/loss": 4.192867755889893, "train/global_grad_norm": 0.21194864809513092}
98
- {"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 9823.39123856998, "train/update_time": 5466.646158660587, "train/lr": 1.1098064077174619e-06, "train/loss": 4.168134689331055, "train/global_grad_norm": 0.20849043130874634}
 
1
+ {"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 69.9843489350751, "train/update_time": 69.80232589994557, "train/lr": 0.0009000000000000001, "train/loss": 9.773597717285156, "train/global_grad_norm": 1.234387755393982}
2
+ {"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 137.6325364280492, "train/update_time": 137.34443728171755, "train/lr": 0.0009997960964140947, "train/loss": 8.196192741394043, "train/global_grad_norm": 0.9726490378379822}
3
+ {"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 306.0479498610366, "train/update_time": 204.88854807184543, "train/lr": 0.0009990914580222257, "train/loss": 7.71175479888916, "train/global_grad_norm": 0.40494367480278015}
4
+ {"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 373.6941461900715, "train/update_time": 272.43619964295067, "train/lr": 0.0009978842768382998, "train/loss": 7.524440765380859, "train/global_grad_norm": 0.5722494721412659}
5
+ {"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 541.7261677470524, "train/update_time": 339.9664637759561, "train/lr": 0.0009961757683914405, "train/loss": 7.3614912033081055, "train/global_grad_norm": 0.20995093882083893}
6
+ {"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 609.3696456589969, "train/update_time": 407.4915405898355, "train/lr": 0.00099396765300483, "train/loss": 7.195133209228516, "train/global_grad_norm": 0.15112251043319702}
7
+ {"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 776.8270515420008, "train/update_time": 474.9853536799783, "train/lr": 0.0009912621540634887, "train/loss": 7.097200393676758, "train/global_grad_norm": 0.3083263337612152}
8
+ {"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 844.4406513640424, "train/update_time": 542.4893449847586, "train/lr": 0.000988061995775515, "train/loss": 6.98495626449585, "train/global_grad_norm": 0.2213425636291504}
9
+ {"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 1012.7395372070605, "train/update_time": 610.0022338308627, "train/lr": 0.0009843704004290394, "train/loss": 6.863668441772461, "train/global_grad_norm": 0.3979201912879944}
10
+ {"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 1080.3595280270092, "train/update_time": 677.5131666237721, "train/lr": 0.0009801910851476522, "train/loss": 6.781008720397949, "train/global_grad_norm": 0.3312043845653534}
11
+ {"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 1248.9534793440253, "train/update_time": 745.0211975647835, "train/lr": 0.0009755282581475768, "train/loss": 6.697778701782227, "train/global_grad_norm": 0.3066418468952179}
12
+ {"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 1316.5735572939739, "train/update_time": 812.5169301189017, "train/lr": 0.0009703866145003512, "train/loss": 6.592831134796143, "train/global_grad_norm": 0.3352743089199066}
13
+ {"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 1485.0034688690212, "train/update_time": 880.0041609148029, "train/lr": 0.0009647713314052896, "train/loss": 6.561812877655029, "train/global_grad_norm": 0.4492341876029968}
14
+ {"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 1552.605858018971, "train/update_time": 947.4958640788682, "train/lr": 0.0009586880629764817, "train/loss": 6.504783630371094, "train/global_grad_norm": 0.2978050410747528}
15
+ {"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 1720.6480030510575, "train/update_time": 1014.9766604538308, "train/lr": 0.0009521429345495787, "train/loss": 6.405797481536865, "train/global_grad_norm": 0.5476596355438232}
16
+ {"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 1788.244720379007, "train/update_time": 1082.4689192509977, "train/lr": 0.0009451425365140996, "train/loss": 6.375470161437988, "train/global_grad_norm": 0.332520067691803}
17
+ {"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 1956.1968261280563, "train/update_time": 1149.9545339199249, "train/lr": 0.000937693917677468, "train/loss": 6.300417423248291, "train/global_grad_norm": 0.7200642824172974}
18
+ {"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 2023.8116153230658, "train/update_time": 1217.4519352857023, "train/lr": 0.0009298045781674596, "train/loss": 6.269016265869141, "train/global_grad_norm": 0.29608145356178284}
19
+ {"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 2191.784361746977, "train/update_time": 1284.9417869256577, "train/lr": 0.0009214824618802108, "train/loss": 6.286498546600342, "train/global_grad_norm": 0.6701951026916504}
20
+ {"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 2259.3751640570117, "train/update_time": 1352.4327959185466, "train/lr": 0.000912735948481387, "train/loss": 6.202700614929199, "train/global_grad_norm": 0.309689998626709}
21
+ {"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 2427.812098467024, "train/update_time": 1419.9271084357752, "train/lr": 0.0009035738449685707, "train/loss": 6.156064510345459, "train/global_grad_norm": 0.29083284735679626}
22
+ {"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 2495.436698611011, "train/update_time": 1487.4421038717264, "train/lr": 0.0008940053768033609, "train/loss": 6.145916938781738, "train/global_grad_norm": 0.4619344472885132}
23
+ {"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 2663.784864959074, "train/update_time": 1554.9646949897287, "train/lr": 0.0008840401786221159, "train/loss": 6.098778247833252, "train/global_grad_norm": 0.38272926211357117}
24
+ {"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 2731.4342436430743, "train/update_time": 1622.4905407206388, "train/lr": 0.0008736882845346905, "train/loss": 6.057912349700928, "train/global_grad_norm": 0.46633926033973694}
25
+ {"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 2900.0770314050606, "train/update_time": 1690.0012620057678, "train/lr": 0.0008629601180209381, "train/loss": 6.051210880279541, "train/global_grad_norm": 0.3780326843261719}
26
+ {"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 2967.7298853070242, "train/update_time": 1757.5320481728995, "train/lr": 0.0008518664814351503, "train/loss": 6.018684387207031, "train/global_grad_norm": 0.4844052195549011}
27
+ {"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 3136.192894882057, "train/update_time": 1825.0568903158419, "train/lr": 0.0008404185451290017, "train/loss": 6.0103936195373535, "train/global_grad_norm": 0.3090081512928009}
28
+ {"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 3203.840811592061, "train/update_time": 1892.5856667858316, "train/lr": 0.0008286278362039527, "train/loss": 5.959850311279297, "train/global_grad_norm": 0.3670918345451355}
29
+ {"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 3371.6769031099975, "train/update_time": 1960.1043883458478, "train/lr": 0.0008165062269044352, "train/loss": 5.930871486663818, "train/global_grad_norm": 0.5652802586555481}
30
+ {"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 3439.312252484029, "train/update_time": 2027.637110557058, "train/lr": 0.0008040659226635089, "train/loss": 5.914786338806152, "train/global_grad_norm": 0.3531936705112457}
31
+ {"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 3607.989511896041, "train/update_time": 2095.1480140229687, "train/lr": 0.0007913194498130252, "train/loss": 5.9558491706848145, "train/global_grad_norm": 0.31624099612236023}
32
+ {"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 3675.6837699849857, "train/update_time": 2162.6913318177685, "train/lr": 0.000778279642970672, "train/loss": 5.875686168670654, "train/global_grad_norm": 0.45401549339294434}
33
+ {"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 3843.81031891203, "train/update_time": 2230.2011163331335, "train/lr": 0.0007649596321166025, "train/loss": 5.9156365394592285, "train/global_grad_norm": 0.3709852695465088}
34
+ {"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 3911.4375108770328, "train/update_time": 2297.7100352901034, "train/lr": 0.0007513728293726579, "train/loss": 5.878309726715088, "train/global_grad_norm": 0.3850482106208801}
35
+ {"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 4079.354542599991, "train/update_time": 2365.248016706202, "train/lr": 0.0007375329154974975, "train/loss": 5.850311756134033, "train/global_grad_norm": 0.3051223158836365}
36
+ {"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 4147.001440979075, "train/update_time": 2432.7737647151807, "train/lr": 0.0007234538261112341, "train/loss": 5.786566734313965, "train/global_grad_norm": 0.3867356479167938}
37
+ {"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 4315.090891462984, "train/update_time": 2500.2843339033425, "train/lr": 0.0007091497376634464, "train/loss": 5.8186936378479, "train/global_grad_norm": 0.3156871199607849}
38
+ {"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 4382.723289367976, "train/update_time": 2567.801604798413, "train/lr": 0.0006946350531586958, "train/loss": 5.791891098022461, "train/global_grad_norm": 0.36183497309684753}
39
+ {"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 4550.521985811065, "train/update_time": 2635.3166619893163, "train/lr": 0.0006799243876539214, "train/loss": 5.8107523918151855, "train/global_grad_norm": 0.49777495861053467}
40
+ {"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 4618.154446462984, "train/update_time": 2702.834149704431, "train/lr": 0.0006650325535423166, "train/loss": 5.713278293609619, "train/global_grad_norm": 0.3702220022678375}
41
+ {"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 4786.292578452965, "train/update_time": 2770.3491651542718, "train/lr": 0.0006499745456385053, "train/loss": 5.73615026473999, "train/global_grad_norm": 0.4145315885543823}
42
+ {"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 4853.921031603008, "train/update_time": 2837.8658607284306, "train/lr": 0.0006347655260800339, "train/loss": 5.749305248260498, "train/global_grad_norm": 0.38909024000167847}
43
+ {"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 5021.374068334, "train/update_time": 2905.373775637592, "train/lr": 0.0006194208090603844, "train/loss": 5.747371673583984, "train/global_grad_norm": 0.4385876953601837}
44
+ {"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 5089.012363151065, "train/update_time": 2972.8906648436096, "train/lr": 0.0006039558454088796, "train/loss": 5.779524326324463, "train/global_grad_norm": 0.4157894253730774}
45
+ {"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 5256.981477431022, "train/update_time": 3040.4158616837813, "train/lr": 0.0005883862070330078, "train/loss": 5.7228264808654785, "train/global_grad_norm": 0.3932007849216461}
46
+ {"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 5324.629123619059, "train/update_time": 3107.9427982217167, "train/lr": 0.0005727275712388317, "train/loss": 5.6978583335876465, "train/global_grad_norm": 0.45659518241882324}
47
+ {"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 5492.093443386024, "train/update_time": 3175.472754184855, "train/lr": 0.0005569957049452703, "train/loss": 5.713237285614014, "train/global_grad_norm": 0.351923406124115}
48
+ {"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 5559.744059987017, "train/update_time": 3242.99941329984, "train/lr": 0.0005412064488081482, "train/loss": 5.698825836181641, "train/global_grad_norm": 0.591474175453186}
49
+ {"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 5727.743233548012, "train/update_time": 3310.5120258319657, "train/lr": 0.0005253757012699972, "train/loss": 5.690817356109619, "train/global_grad_norm": 0.28637227416038513}
50
+ {"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 5795.387424343033, "train/update_time": 3378.042265718919, "train/lr": 0.0005095194025516734, "train/loss": 5.679786682128906, "train/global_grad_norm": 0.34895920753479004}
51
+ {"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 5963.751827322063, "train/update_time": 3445.5543855928117, "train/lr": 0.0004936535186019053, "train/loss": 5.677963733673096, "train/global_grad_norm": 0.4498981237411499}
52
+ {"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 6031.410010787076, "train/update_time": 3513.0946629439713, "train/lr": 0.00047779402502093696, "train/loss": 5.6720685958862305, "train/global_grad_norm": 0.3658086061477661}
53
+ {"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 6198.96821509907, "train/update_time": 3580.623477048124, "train/lr": 0.0004619568909744525, "train/loss": 5.642226696014404, "train/global_grad_norm": 0.36008358001708984}
54
+ {"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 6266.654250354972, "train/update_time": 3648.1618733102223, "train/lr": 0.00044615806311398067, "train/loss": 5.654401779174805, "train/global_grad_norm": 0.49052467942237854}
55
+ {"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 6434.099346054019, "train/update_time": 3715.6762271750486, "train/lr": 0.0004304134495199673, "train/loss": 5.601498126983643, "train/global_grad_norm": 0.36875054240226746}
56
+ {"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 6501.729716927046, "train/update_time": 3783.1945504179457, "train/lr": 0.0004147389036836882, "train/loss": 5.638054847717285, "train/global_grad_norm": 0.3628014028072357}
57
+ {"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 6669.622392687015, "train/update_time": 3850.7147646707017, "train/lr": 0.0003991502085441259, "train/loss": 5.593812465667725, "train/global_grad_norm": 0.40251973271369934}
58
+ {"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 6737.260854291031, "train/update_time": 3918.229618334677, "train/lr": 0.0003836630605958888, "train/loss": 5.651180267333984, "train/global_grad_norm": 0.41545599699020386}
59
+ {"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 6905.097716190037, "train/update_time": 3985.7243089615367, "train/lr": 0.00036829305408417155, "train/loss": 5.651818752288818, "train/global_grad_norm": 0.2855614423751831}
60
+ {"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 6972.698619629024, "train/update_time": 4053.2279500714503, "train/lr": 0.000353055665302672, "train/loss": 5.643282890319824, "train/global_grad_norm": 0.3338489830493927}
61
+ {"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 7141.385740551981, "train/update_time": 4120.727246251656, "train/lr": 0.0003379662370102746, "train/loss": 5.617847442626953, "train/global_grad_norm": 0.33251237869262695}
62
+ {"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 7208.982709518052, "train/update_time": 4188.228603066411, "train/lr": 0.00032303996298219405, "train/loss": 5.588885307312012, "train/global_grad_norm": 0.3960968255996704}
63
+ {"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 7376.684962157044, "train/update_time": 4255.729584518704, "train/lr": 0.00030829187271113034, "train/loss": 5.613091945648193, "train/global_grad_norm": 0.36357083916664124}
64
+ {"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 7444.285434685065, "train/update_time": 4323.2266572538065, "train/lr": 0.0002937368162738445, "train/loss": 5.598479270935059, "train/global_grad_norm": 0.35109350085258484}
65
+ {"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 7612.232262870064, "train/update_time": 4390.719008370885, "train/lr": 0.0002793894493783894, "train/loss": 5.568142890930176, "train/global_grad_norm": 0.38976845145225525}
66
+ {"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 7679.831617571064, "train/update_time": 4458.218750593718, "train/lr": 0.00026526421860705474, "train/loss": 5.5909423828125, "train/global_grad_norm": 0.3374827802181244}
67
+ {"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 7847.450622586999, "train/update_time": 4525.717398178764, "train/lr": 0.0002513753468698824, "train/loss": 5.548323631286621, "train/global_grad_norm": 0.35127025842666626}
68
+ {"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 7915.0508393970085, "train/update_time": 4593.218119100784, "train/lr": 0.00023773681908340283, "train/loss": 5.565690517425537, "train/global_grad_norm": 0.32669878005981445}
69
+ {"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 8083.225679086056, "train/update_time": 4660.71875171666, "train/lr": 0.00022436236808900823, "train/loss": 5.576150894165039, "train/global_grad_norm": 0.33521243929862976}
70
+ {"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 8150.834610218997, "train/update_time": 4728.229743778473, "train/lr": 0.00021126546082514682, "train/loss": 5.5722150802612305, "train/global_grad_norm": 0.3350072205066681}
71
+ {"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 8318.88102039101, "train/update_time": 4795.734360322589, "train/lr": 0.00019845928476725522, "train/loss": 5.558324337005615, "train/global_grad_norm": 0.29043614864349365}
72
+ {"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 8386.480246171006, "train/update_time": 4863.235517622321, "train/lr": 0.0001859567346490913, "train/loss": 5.551104545593262, "train/global_grad_norm": 0.31684166193008423}
73
+ {"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 8554.17625815398, "train/update_time": 4930.740559646045, "train/lr": 0.00017377039947882782, "train/loss": 5.5726318359375, "train/global_grad_norm": 0.32063278555870056}
74
+ {"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 8621.791595970979, "train/update_time": 4998.250562901143, "train/lr": 0.00016191254986299043, "train/loss": 5.5637898445129395, "train/global_grad_norm": 0.34598803520202637}
75
+ {"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 8789.583836077014, "train/update_time": 5065.748505146126, "train/lr": 0.00015039512565099468, "train/loss": 5.560715675354004, "train/global_grad_norm": 0.327943354845047}
76
+ {"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 8857.199786913, "train/update_time": 5133.253916564281, "train/lr": 0.00013922972391273224, "train/loss": 5.501072883605957, "train/global_grad_norm": 0.29274871945381165}
77
+ {"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 9025.246275466983, "train/update_time": 5200.750415898627, "train/lr": 0.00012842758726130281, "train/loss": 5.579266548156738, "train/global_grad_norm": 0.2941227853298187}
78
+ {"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 9092.876169198076, "train/update_time": 5268.2632564986125, "train/lr": 0.00011799959253265679, "train/loss": 5.490416526794434, "train/global_grad_norm": 0.2656475007534027}
79
+ {"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 9260.972570382059, "train/update_time": 5335.757341830526, "train/lr": 0.00010795623983354214, "train/loss": 5.527663230895996, "train/global_grad_norm": 0.26042425632476807}
80
+ {"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 9328.580250757048, "train/update_time": 5403.266901573632, "train/lr": 9.830764196878872e-05, "train/loss": 5.501196384429932, "train/global_grad_norm": 0.258548378944397}
81
+ {"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 9497.46972536901, "train/update_time": 5470.770781763713, "train/lr": 8.906351425856951e-05, "train/loss": 5.472145080566406, "train/global_grad_norm": 0.2814105153083801}
82
+ {"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 9565.069234885043, "train/update_time": 5538.269277713727, "train/lr": 8.02331647558977e-05, "train/loss": 5.506821155548096, "train/global_grad_norm": 0.26193347573280334}
83
+ {"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 9732.95014503505, "train/update_time": 5605.777275754837, "train/lr": 7.182548487420554e-05, "train/loss": 5.532543182373047, "train/global_grad_norm": 0.22665195167064667}
84
+ {"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 9800.571015445981, "train/update_time": 5673.296408364549, "train/lr": 6.384894043444556e-05, "train/loss": 5.483277320861816, "train/global_grad_norm": 0.2179127037525177}
85
+ {"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 9968.994457261055, "train/update_time": 5740.817306284327, "train/lr": 5.6311563140726166e-05, "train/loss": 5.565979957580566, "train/global_grad_norm": 0.2223806530237198}
86
+ {"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 10036.600267508999, "train/update_time": 5808.320603007567, "train/lr": 4.922094249306547e-05, "train/loss": 5.534444808959961, "train/global_grad_norm": 0.2236238718032837}
87
+ {"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 10204.47946456098, "train/update_time": 5875.824670242728, "train/lr": 4.2584218145409916e-05, "train/loss": 5.472100734710693, "train/global_grad_norm": 0.21749918162822723}
88
+ {"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 10272.080307441996, "train/update_time": 5943.3299956357805, "train/lr": 3.6408072716606236e-05, "train/loss": 5.504454135894775, "train/global_grad_norm": 0.20975066721439362}
89
+ {"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 10440.040362586034, "train/update_time": 6010.818879780243, "train/lr": 3.069872506157217e-05, "train/loss": 5.567077159881592, "train/global_grad_norm": 0.20836369693279266}
90
+ {"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 10507.649754095008, "train/update_time": 6078.3277447193395, "train/lr": 2.5461924009435368e-05, "train/loss": 5.464453220367432, "train/global_grad_norm": 0.2269560843706131}
91
+ {"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 10707.957879410009, "train/update_time": 6177.141915226472, "train/lr": 2.0702942574950812e-05, "train/loss": 5.509556770324707, "train/global_grad_norm": 0.2095288634300232}
92
+ {"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 10899.09726011497, "train/update_time": 6367.991133535514, "train/lr": 1.642657264902142e-05, "train/loss": 5.54543924331665, "train/global_grad_norm": 0.20035825669765472}
93
+ {"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 11285.98193423904, "train/update_time": 6497.941741914721, "train/lr": 1.2637120173670358e-05, "train/loss": 5.523913383483887, "train/global_grad_norm": 0.18863900005817413}
94
+ {"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 11471.464317596983, "train/update_time": 6683.226909449557, "train/lr": 9.338400806321978e-06, "train/loss": 5.4696879386901855, "train/global_grad_norm": 0.19305512309074402}
95
+ {"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 11871.571419561049, "train/update_time": 6813.441040770616, "train/lr": 6.533736077758867e-06, "train/loss": 5.509196758270264, "train/global_grad_norm": 0.18883228302001953}
96
+ {"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 12034.565373313962, "train/update_time": 6976.259414628497, "train/lr": 4.2259500476214406e-06, "train/loss": 5.492060661315918, "train/global_grad_norm": 0.18720072507858276}
97
+ {"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 12449.846788060968, "train/update_time": 7106.677357654553, "train/lr": 2.417366460819359e-06, "train/loss": 5.52823543548584, "train/global_grad_norm": 0.1909027099609375}
98
+ {"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 12586.090243482962, "train/update_time": 7242.765970333596, "train/lr": 1.1098064077174619e-06, "train/loss": 5.510114669799805, "train/global_grad_norm": 0.17430830001831055}
metrics/jsonlines/train_eval.jsonl CHANGED
@@ -1,19 +1,19 @@
1
- {"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 464.0044742010068, "train_eval/train_update_time": 282.4727703850367, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.262765104495848, "train_eval/perplexity_len_2048": 3876.7990479882474, "train_eval/loss_avg_len_1024": 8.26361274068222, "train_eval/perplexity_len_1024": 3880.086556257262, "train_eval/loss_avg_len_512": 8.264419558200608, "train_eval/perplexity_len_512": 3883.218341283336}
2
- {"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 923.5610087590176, "train_eval/train_update_time": 561.0979154942906, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.399099997472659, "train_eval/perplexity_len_2048": 601.3036194924304, "train_eval/loss_avg_len_1024": 6.403366397288846, "train_eval/perplexity_len_1024": 603.8745014496265, "train_eval/loss_avg_len_512": 6.409683007578133, "train_eval/perplexity_len_512": 607.7010139099035}
3
- {"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1473.913084711996, "train_eval/train_update_time": 839.7419353383593, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.693106125889135, "train_eval/perplexity_len_2048": 296.8141323485163, "train_eval/loss_avg_len_1024": 5.698990526291018, "train_eval/perplexity_len_1024": 298.5658544105799, "train_eval/loss_avg_len_512": 5.710699294427177, "train_eval/perplexity_len_512": 302.08223886516663}
4
- {"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1933.6392927800189, "train_eval/train_update_time": 1118.3960400532233, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.296922603823786, "train_eval/perplexity_len_2048": 199.7212419010431, "train_eval/loss_avg_len_1024": 5.305337436088958, "train_eval/perplexity_len_1024": 201.40895359804367, "train_eval/loss_avg_len_512": 5.320490509328956, "train_eval/perplexity_len_512": 204.48415878511435}
5
- {"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2484.2231900609913, "train_eval/train_update_time": 1397.0756743992679, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.045304426316234, "train_eval/perplexity_len_2048": 155.29156684287375, "train_eval/loss_avg_len_1024": 5.053415232166262, "train_eval/perplexity_len_1024": 156.55622837075202, "train_eval/loss_avg_len_512": 5.070610678311423, "train_eval/perplexity_len_512": 159.27156133906544}
6
- {"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2944.979424642981, "train_eval/train_update_time": 1675.8085315313656, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.881278812076035, "train_eval/perplexity_len_2048": 131.79910244606154, "train_eval/loss_avg_len_1024": 4.889370008184379, "train_eval/perplexity_len_1024": 132.86984076618447, "train_eval/loss_avg_len_512": 4.908291251527554, "train_eval/perplexity_len_512": 135.40783867497828}
7
- {"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3496.0525263110176, "train_eval/train_update_time": 1954.5468697096221, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.752543167285239, "train_eval/perplexity_len_2048": 115.87860879757943, "train_eval/loss_avg_len_1024": 4.763826194274043, "train_eval/perplexity_len_1024": 117.19347414945925, "train_eval/loss_avg_len_512": 4.785651780011176, "train_eval/perplexity_len_512": 119.77940747075029}
8
- {"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3957.9235017200117, "train_eval/train_update_time": 2233.291398033558, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.653148743151705, "train_eval/perplexity_len_2048": 104.91481583709675, "train_eval/loss_avg_len_1024": 4.6641259991965125, "train_eval/perplexity_len_1024": 106.07283695212364, "train_eval/loss_avg_len_512": 4.687856853806879, "train_eval/perplexity_len_512": 108.62014133645553}
9
- {"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4509.209549701016, "train_eval/train_update_time": 2512.043382478296, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.56949279251452, "train_eval/perplexity_len_2048": 96.49515429403105, "train_eval/loss_avg_len_1024": 4.584133220926888, "train_eval/perplexity_len_1024": 97.91827683495046, "train_eval/loss_avg_len_512": 4.612269650588205, "train_eval/perplexity_len_512": 100.7124725543258}
10
- {"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4969.727964510967, "train_eval/train_update_time": 2790.797476610227, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.500111393837432, "train_eval/perplexity_len_2048": 90.02715921272548, "train_eval/loss_avg_len_1024": 4.5146006559921075, "train_eval/perplexity_len_1024": 91.34108222421936, "train_eval/loss_avg_len_512": 4.545015140839531, "train_eval/perplexity_len_512": 94.16185288811836}
11
- {"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5521.543946499005, "train_eval/train_update_time": 3069.52293490601, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.434400985772954, "train_eval/perplexity_len_2048": 84.30161189591196, "train_eval/loss_avg_len_1024": 4.448710203694063, "train_eval/perplexity_len_1024": 85.51657387892722, "train_eval/loss_avg_len_512": 4.479653784418624, "train_eval/perplexity_len_512": 88.20412974467796}
12
- {"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5981.512405745976, "train_eval/train_update_time": 3348.2465934828506, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.3785154093765595, "train_eval/perplexity_len_2048": 79.71959454765936, "train_eval/loss_avg_len_1024": 4.3939165947328, "train_eval/perplexity_len_1024": 80.95687412946403, "train_eval/loss_avg_len_512": 4.429078622167507, "train_eval/perplexity_len_512": 83.85411997858606}
13
- {"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6532.972345999966, "train_eval/train_update_time": 3627.0106727198, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.332845308472933, "train_eval/perplexity_len_2048": 76.16067919713348, "train_eval/loss_avg_len_1024": 4.354710251906472, "train_eval/perplexity_len_1024": 77.84426684093609, "train_eval/loss_avg_len_512": 4.39427122400477, "train_eval/perplexity_len_512": 80.98558889804535}
14
- {"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6992.639220207988, "train_eval/train_update_time": 3905.7538149688044, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.286469663051848, "train_eval/perplexity_len_2048": 72.70932644602816, "train_eval/loss_avg_len_1024": 4.309569447513205, "train_eval/perplexity_len_1024": 74.40844530144317, "train_eval/loss_avg_len_512": 4.354222116721867, "train_eval/perplexity_len_512": 77.80627758807101}
15
- {"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7543.42272252898, "train_eval/train_update_time": 4184.450604122656, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.249375205130373, "train_eval/perplexity_len_2048": 70.06162452534807, "train_eval/loss_avg_len_1024": 4.273874875287056, "train_eval/perplexity_len_1024": 71.7993106682297, "train_eval/loss_avg_len_512": 4.321713214736082, "train_eval/perplexity_len_512": 75.31755296428902}
16
- {"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8003.3749305050005, "train_eval/train_update_time": 4463.172267011658, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.2222635463871026, "train_eval/perplexity_len_2048": 68.18765565818079, "train_eval/loss_avg_len_1024": 4.249677161750205, "train_eval/perplexity_len_1024": 70.08278329102363, "train_eval/loss_avg_len_512": 4.299936973010299, "train_eval/perplexity_len_512": 73.69514876983682}
17
- {"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8554.563296968001, "train_eval/train_update_time": 4741.9453661507, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.195244031412512, "train_eval/perplexity_len_2048": 66.36992594802435, "train_eval/loss_avg_len_1024": 4.218284104051746, "train_eval/perplexity_len_1024": 67.9168460075776, "train_eval/loss_avg_len_512": 4.2707039155407625, "train_eval/perplexity_len_512": 71.57199853357183}
18
- {"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9014.539005896018, "train_eval/train_update_time": 5020.712582220614, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.188622776587208, "train_eval/perplexity_len_2048": 65.93192541236388, "train_eval/loss_avg_len_1024": 4.215066284977402, "train_eval/perplexity_len_1024": 67.69865312590402, "train_eval/loss_avg_len_512": 4.268878927308142, "train_eval/perplexity_len_512": 71.4414995941971}
19
- {"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9565.561509469, "train_eval/train_update_time": 5299.418035702605, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.176184563983006, "train_eval/perplexity_len_2048": 65.11692916224894, "train_eval/loss_avg_len_1024": 4.205971465967996, "train_eval/perplexity_len_1024": 67.08573753155041, "train_eval/loss_avg_len_512": 4.259276238732308, "train_eval/perplexity_len_512": 70.75875247262296}
 
1
+ {"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 541.7261677470524, "train_eval/train_update_time": 339.9664637759561, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.431279315594583, "train_eval/perplexity_len_2048": 4588.366308362686, "train_eval/loss_avg_len_1024": 8.43578569962643, "train_eval/perplexity_len_1024": 4609.089908206309, "train_eval/loss_avg_len_512": 8.435858583673834, "train_eval/perplexity_len_512": 4609.4258495759}
2
+ {"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1080.3595280270092, "train_eval/train_update_time": 677.5131666237721, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 7.043303485368378, "train_eval/perplexity_len_2048": 1145.164398741174, "train_eval/loss_avg_len_1024": 7.050072871062438, "train_eval/perplexity_len_1024": 1152.9427558858404, "train_eval/loss_avg_len_512": 7.055086852745153, "train_eval/perplexity_len_512": 1158.7381064947092}
3
+ {"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1720.6480030510575, "train_eval/train_update_time": 1014.9766604538308, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.58473962082513, "train_eval/perplexity_len_2048": 723.9625183603853, "train_eval/loss_avg_len_1024": 6.593009713915525, "train_eval/perplexity_len_1024": 729.9745816768931, "train_eval/loss_avg_len_512": 6.6036262488353525, "train_eval/perplexity_len_512": 737.7656662966635}
4
+ {"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2259.3751640570117, "train_eval/train_update_time": 1352.4327959185466, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.304370796141812, "train_eval/perplexity_len_2048": 546.9573322289459, "train_eval/loss_avg_len_1024": 6.315740904568084, "train_eval/perplexity_len_1024": 553.2117859284505, "train_eval/loss_avg_len_512": 6.329661400500045, "train_eval/perplexity_len_512": 560.9666186754808}
5
+ {"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2900.0770314050606, "train_eval/train_update_time": 1690.0012620057678, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.118539933643442, "train_eval/perplexity_len_2048": 454.2010464652805, "train_eval/loss_avg_len_1024": 6.128848941022952, "train_eval/perplexity_len_1024": 458.90762680798815, "train_eval/loss_avg_len_512": 6.144599953355937, "train_eval/perplexity_len_512": 466.1931127637189}
6
+ {"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3439.312252484029, "train_eval/train_update_time": 2027.637110557058, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.993265189037047, "train_eval/perplexity_len_2048": 400.7209056232889, "train_eval/loss_avg_len_1024": 6.003427277760493, "train_eval/perplexity_len_1024": 404.8138281180551, "train_eval/loss_avg_len_512": 6.020754306166491, "train_eval/perplexity_len_512": 411.8891691184047}
7
+ {"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4079.354542599991, "train_eval/train_update_time": 2365.248016706202, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.893832671793534, "train_eval/perplexity_len_2048": 362.7930900146717, "train_eval/loss_avg_len_1024": 5.90759678772476, "train_eval/perplexity_len_1024": 367.82114011647366, "train_eval/loss_avg_len_512": 5.928100219366606, "train_eval/perplexity_len_512": 375.4405811366837}
8
+ {"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4618.154446462984, "train_eval/train_update_time": 2702.834149704431, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.817333242833847, "train_eval/perplexity_len_2048": 336.074628131044, "train_eval/loss_avg_len_1024": 5.829878006978615, "train_eval/perplexity_len_1024": 340.31716022069065, "train_eval/loss_avg_len_512": 5.851326766426645, "train_eval/perplexity_len_512": 347.6953851520663}
9
+ {"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5256.981477431022, "train_eval/train_update_time": 3040.4158616837813, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.752478665695435, "train_eval/perplexity_len_2048": 314.9703998548185, "train_eval/loss_avg_len_1024": 5.7688500021151405, "train_eval/perplexity_len_1024": 320.1693268063645, "train_eval/loss_avg_len_512": 5.79529527088307, "train_eval/perplexity_len_512": 328.74923973070185}
10
+ {"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5795.387424343033, "train_eval/train_update_time": 3378.042265718919, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.702735505279561, "train_eval/perplexity_len_2048": 299.68607355137283, "train_eval/loss_avg_len_1024": 5.718074424200022, "train_eval/perplexity_len_1024": 304.3183703229844, "train_eval/loss_avg_len_512": 5.745852673280315, "train_eval/perplexity_len_512": 312.89030732338205}
11
+ {"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6434.099346054019, "train_eval/train_update_time": 3715.6762271750486, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.655646991200555, "train_eval/perplexity_len_2048": 285.9013985897084, "train_eval/loss_avg_len_1024": 5.669753373692364, "train_eval/perplexity_len_1024": 289.96301306531365, "train_eval/loss_avg_len_512": 5.69629149336979, "train_eval/perplexity_len_512": 297.7611018557276}
12
+ {"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6972.698619629024, "train_eval/train_update_time": 4053.2279500714503, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.61831548873808, "train_eval/perplexity_len_2048": 275.42503567083156, "train_eval/loss_avg_len_1024": 5.633299176973669, "train_eval/perplexity_len_1024": 279.58299150262565, "train_eval/loss_avg_len_512": 5.662484543039026, "train_eval/perplexity_len_512": 287.8629627292343}
13
+ {"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7612.232262870064, "train_eval/train_update_time": 4390.719008370885, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.596533485618165, "train_eval/perplexity_len_2048": 269.49059333919547, "train_eval/loss_avg_len_1024": 5.6167390749404875, "train_eval/perplexity_len_1024": 274.9911938913255, "train_eval/loss_avg_len_512": 5.646642419948766, "train_eval/perplexity_len_512": 283.33853513529573}
14
+ {"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8150.834610218997, "train_eval/train_update_time": 4728.229743778473, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.5670235770304135, "train_eval/perplexity_len_2048": 261.6541456709636, "train_eval/loss_avg_len_1024": 5.5871347326369865, "train_eval/perplexity_len_1024": 266.9695835538501, "train_eval/loss_avg_len_512": 5.621378581559547, "train_eval/perplexity_len_512": 276.26998153360597}
15
+ {"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8789.583836077014, "train_eval/train_update_time": 5065.748505146126, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.54864313980328, "train_eval/perplexity_len_2048": 256.8887571941226, "train_eval/loss_avg_len_1024": 5.569592643196739, "train_eval/perplexity_len_1024": 262.3272166956895, "train_eval/loss_avg_len_512": 5.603579787398194, "train_eval/perplexity_len_512": 271.39621128174645}
16
+ {"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9328.580250757048, "train_eval/train_update_time": 5403.266901573632, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.533965866914477, "train_eval/perplexity_len_2048": 253.1458657141699, "train_eval/loss_avg_len_1024": 5.5568862003808315, "train_eval/perplexity_len_1024": 259.0150583561873, "train_eval/loss_avg_len_512": 5.590751695272338, "train_eval/perplexity_len_512": 267.9369509736206}
17
+ {"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9968.994457261055, "train_eval/train_update_time": 5740.817306284327, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.516171709689115, "train_eval/perplexity_len_2048": 248.68118876485585, "train_eval/loss_avg_len_1024": 5.532715844693376, "train_eval/perplexity_len_1024": 252.82962545170352, "train_eval/loss_avg_len_512": 5.567636160671682, "train_eval/perplexity_len_512": 261.8144798242977}
18
+ {"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 10507.649754095008, "train_eval/train_update_time": 6078.3277447193395, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.515744624709568, "train_eval/perplexity_len_2048": 248.5750034411309, "train_eval/loss_avg_len_1024": 5.536319641943482, "train_eval/perplexity_len_1024": 253.74241592866284, "train_eval/loss_avg_len_512": 5.571619798068714, "train_eval/perplexity_len_512": 262.8595339535372}
19
+ {"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 11871.571419561049, "train_eval/train_update_time": 6813.441040770616, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.506232261902841, "train_eval/perplexity_len_2048": 246.22167840950056, "train_eval/loss_avg_len_1024": 5.529569865919621, "train_eval/perplexity_len_1024": 252.03547865612154, "train_eval/loss_avg_len_512": 5.564332565377117, "train_eval/perplexity_len_512": 260.9509778565561}
metrics/jsonlines/val.jsonl CHANGED
@@ -1,49 +1,49 @@
1
- {"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 115.56162341398885, "val/train_update_time": 115.25218190002488, "val/loss": 8.017322944736389, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.36243360501248, "val/val_tokens_per_second": 453285.711394651, "val/loss_avg_len_2048": 8.017322944736389, "val/perplexity_len_2048": 3033.046820927388, "val/loss_avg_len_1024": 8.016116743054521, "val/perplexity_len_1024": 3029.3905602879668, "val/loss_avg_len_512": 8.016581874255465, "val/perplexity_len_512": 3030.799952108046}
2
- {"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 317.6585605319706, "val/train_update_time": 226.74485654599266, "val/loss": 7.168872293418506, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.48027557897149, "val/val_tokens_per_second": 452695.34976438014, "val/loss_avg_len_2048": 7.168872293418506, "val/perplexity_len_2048": 1298.379585700498, "val/loss_avg_len_1024": 7.169298829473462, "val/perplexity_len_1024": 1298.933509532663, "val/loss_avg_len_512": 7.17260874950029, "val/perplexity_len_512": 1303.2399987050917}
3
- {"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 519.8498190940008, "val/train_update_time": 338.1969051870401, "val/loss": 6.680456670384901, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.21118094300618, "val/val_tokens_per_second": 454045.7133121647, "val/loss_avg_len_2048": 6.680456670384901, "val/perplexity_len_2048": 796.6828504192507, "val/loss_avg_len_1024": 6.681968356456887, "val/perplexity_len_1024": 797.8880955346282, "val/loss_avg_len_512": 6.6880630861138926, "val/perplexity_len_512": 802.7658569931743}
4
- {"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 721.7491259319941, "val/train_update_time": 449.6537483881111, "val/loss": 6.256492450360163, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.13445069699083, "val/val_tokens_per_second": 454432.236323236, "val/loss_avg_len_2048": 6.256492450360163, "val/perplexity_len_2048": 521.3869384996046, "val/loss_avg_len_1024": 6.25937858268139, "val/perplexity_len_1024": 522.8939037992483, "val/loss_avg_len_512": 6.268213871597686, "val/perplexity_len_512": 527.5342919101196}
5
- {"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 923.5610087590176, "val/train_update_time": 561.0979154942906, "val/loss": 5.9596897887737725, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.17932575498708, "val/val_tokens_per_second": 454206.1016433674, "val/loss_avg_len_2048": 5.9596897887737725, "val/perplexity_len_2048": 387.48990187397294, "val/loss_avg_len_1024": 5.963750460020918, "val/perplexity_len_1024": 389.0665699760066, "val/loss_avg_len_512": 5.9747771193729715, "val/perplexity_len_512": 393.38041444619915}
6
- {"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 1125.8911167309852, "val/train_update_time": 672.5490376223461, "val/loss": 5.729621500730747, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.24560184002621, "val/val_tokens_per_second": 453872.5341164848, "val/loss_avg_len_2048": 5.729621500730747, "val/perplexity_len_2048": 307.8527242916948, "val/loss_avg_len_1024": 5.73466736189276, "val/perplexity_len_1024": 309.4100320720618, "val/loss_avg_len_512": 5.747293829907757, "val/perplexity_len_512": 313.34155634560165}
7
- {"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 1327.8194525539875, "val/train_update_time": 784.0098267712165, "val/loss": 5.54191019657671, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.24336050899001, "val/val_tokens_per_second": 453883.80673079635, "val/loss_avg_len_2048": 5.54191019657671, "val/perplexity_len_2048": 255.1649494383086, "val/loss_avg_len_1024": 5.5479404277496975, "val/perplexity_len_1024": 256.70830177953565, "val/loss_avg_len_512": 5.5618576472472405, "val/perplexity_len_512": 260.3059440825885}
8
- {"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 1529.7566971820197, "val/train_update_time": 895.4798201125232, "val/loss": 5.395747513790498, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.28668313001981, "val/val_tokens_per_second": 453666.01784467406, "val/loss_avg_len_2048": 5.395747513790498, "val/perplexity_len_2048": 220.46688755473716, "val/loss_avg_len_1024": 5.40283216586914, "val/perplexity_len_1024": 222.03436470678773, "val/loss_avg_len_512": 5.417992734318786, "val/perplexity_len_512": 225.42617783355703}
9
- {"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 1731.7409806579817, "val/train_update_time": 1006.941321704362, "val/loss": 5.257520105597726, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.21052933402825, "val/val_tokens_per_second": 454048.99297658267, "val/loss_avg_len_2048": 5.257520105597726, "val/perplexity_len_2048": 192.0047489041577, "val/loss_avg_len_1024": 5.265500482419599, "val/perplexity_len_1024": 193.54314949562067, "val/loss_avg_len_512": 5.282038998350409, "val/perplexity_len_512": 196.77068168657516}
10
- {"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 1933.6392927800189, "val/train_update_time": 1118.3960400532233, "val/loss": 5.150704617314763, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.31206437497167, "val/val_tokens_per_second": 453538.51983646286, "val/loss_avg_len_2048": 5.150704617314763, "val/perplexity_len_2048": 172.55303134546992, "val/loss_avg_len_1024": 5.1593652144801805, "val/perplexity_len_1024": 174.0539336132167, "val/loss_avg_len_512": 5.177391785788723, "val/perplexity_len_512": 177.21998000419174}
11
- {"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 2136.1286090469803, "val/train_update_time": 1229.8719967252691, "val/loss": 5.0635993114376445, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.34318823303329, "val/val_tokens_per_second": 453382.27265509864, "val/loss_avg_len_2048": 5.0635993114376445, "val/perplexity_len_2048": 158.15875569300152, "val/loss_avg_len_1024": 5.0730407805304045, "val/perplexity_len_1024": 159.6590781757551, "val/loss_avg_len_512": 5.092240632939898, "val/perplexity_len_512": 162.75412606608745}
12
- {"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 2338.156839519972, "val/train_update_time": 1341.322897736216, "val/loss": 4.98549556239089, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.20560409803875, "val/val_tokens_per_second": 454073.78410196304, "val/loss_avg_len_2048": 4.98549556239089, "val/perplexity_len_2048": 146.2760459748089, "val/loss_avg_len_1024": 4.995756369349081, "val/perplexity_len_1024": 147.78468292514813, "val/loss_avg_len_512": 5.016161771441624, "val/perplexity_len_512": 150.8312664737655}
13
- {"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 2540.0900395850185, "val/train_update_time": 1452.8315300212707, "val/loss": 4.916477123672562, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.74230563105084, "val/val_tokens_per_second": 451388.1338494888, "val/loss_avg_len_2048": 4.916477123672562, "val/perplexity_len_2048": 136.5208190724984, "val/loss_avg_len_1024": 4.927128413101426, "val/perplexity_len_1024": 137.98271353908035, "val/loss_avg_len_512": 4.948208645739966, "val/perplexity_len_512": 140.92229592495846}
14
- {"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 2742.5622889249935, "val/train_update_time": 1564.3145829213317, "val/loss": 4.863091215804801, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.69826148403808, "val/val_tokens_per_second": 451607.3332586261, "val/loss_avg_len_2048": 4.863091215804801, "val/perplexity_len_2048": 129.42366084863215, "val/loss_avg_len_1024": 4.874493102245079, "val/perplexity_len_1024": 130.9077795303594, "val/loss_avg_len_512": 4.896728463353682, "val/perplexity_len_512": 133.85116361495074}
15
- {"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 2944.979424642981, "val/train_update_time": 1675.8085315313656, "val/loss": 4.811523659892753, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.33663047896698, "val/val_tokens_per_second": 453415.18476867134, "val/loss_avg_len_2048": 4.811523659892753, "val/perplexity_len_2048": 122.91876129597873, "val/loss_avg_len_1024": 4.8232065941833895, "val/perplexity_len_1024": 124.36323452041226, "val/loss_avg_len_512": 4.846166890252475, "val/perplexity_len_512": 127.2516841422241}
16
- {"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 3147.4849796229973, "val/train_update_time": 1787.300771905575, "val/loss": 4.760587245357363, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.22133365698392, "val/val_tokens_per_second": 453994.61900804366, "val/loss_avg_len_2048": 4.760587245357363, "val/perplexity_len_2048": 116.8145045362375, "val/loss_avg_len_1024": 4.77283736684951, "val/perplexity_len_1024": 118.25429722128945, "val/loss_avg_len_512": 4.796683278769628, "val/perplexity_len_512": 121.10806894510807}
17
- {"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 3349.424384585989, "val/train_update_time": 1898.7890577405924, "val/loss": 4.719228506370658, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.74322453403147, "val/val_tokens_per_second": 451383.562908752, "val/loss_avg_len_2048": 4.719228506370658, "val/perplexity_len_2048": 112.08174894828639, "val/loss_avg_len_1024": 4.73204894817751, "val/perplexity_len_1024": 113.52793706523403, "val/loss_avg_len_512": 4.756577379063424, "val/perplexity_len_512": 116.3470318696743}
18
- {"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 3551.9233703140053, "val/train_update_time": 2010.2989615525585, "val/loss": 4.676367494543736, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.61035839398392, "val/val_tokens_per_second": 452045.44740791514, "val/loss_avg_len_2048": 4.676367494543736, "val/perplexity_len_2048": 107.37930735283909, "val/loss_avg_len_1024": 4.689829182334012, "val/perplexity_len_1024": 108.8345873493113, "val/loss_avg_len_512": 4.7154578478252525, "val/perplexity_len_512": 111.65992272494637}
19
- {"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 3754.2459046120057, "val/train_update_time": 2121.7969342375873, "val/loss": 4.640193889026716, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 91.96643112896709, "val/val_tokens_per_second": 445379.90109196096, "val/loss_avg_len_2048": 4.640193889026716, "val/perplexity_len_2048": 103.56442564245071, "val/loss_avg_len_1024": 4.654089609145093, "val/perplexity_len_1024": 105.01357307089665, "val/loss_avg_len_512": 4.680419558078237, "val/perplexity_len_512": 107.81529786259468}
20
- {"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 3957.9235017200117, "val/train_update_time": 2233.291398033558, "val/loss": 4.608071265847772, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.61327432200778, "val/val_tokens_per_second": 452030.90062105615, "val/loss_avg_len_2048": 4.608071265847772, "val/perplexity_len_2048": 100.29052920641857, "val/loss_avg_len_1024": 4.622682944629249, "val/perplexity_len_1024": 101.76670061160816, "val/loss_avg_len_512": 4.650229472655617, "val/perplexity_len_512": 104.60898772530528}
21
- {"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 4160.708882857987, "val/train_update_time": 2344.7830602055765, "val/loss": 4.577349349257373, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.45394109800691, "val/val_tokens_per_second": 452827.1460899621, "val/loss_avg_len_2048": 4.577349349257373, "val/perplexity_len_2048": 97.25625986875657, "val/loss_avg_len_1024": 4.592617217212357, "val/perplexity_len_1024": 98.75254910922418, "val/loss_avg_len_512": 4.621059415361099, "val/perplexity_len_512": 101.60161344282939}
22
- {"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 4362.870210377965, "val/train_update_time": 2456.286583611334, "val/loss": 4.549797477854183, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.47601472598035, "val/val_tokens_per_second": 452716.66887686495, "val/loss_avg_len_2048": 4.549797477854183, "val/perplexity_len_2048": 94.61324509708179, "val/loss_avg_len_1024": 4.565505847024965, "val/perplexity_len_1024": 96.11119928630141, "val/loss_avg_len_512": 4.594841379802302, "val/perplexity_len_512": 98.97243527526068}
23
- {"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 4565.060955744993, "val/train_update_time": 2567.792047406314, "val/loss": 4.5204342533537885, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.54267534095561, "val/val_tokens_per_second": 452383.3633781789, "val/loss_avg_len_2048": 4.5204342533537885, "val/perplexity_len_2048": 91.87548655482323, "val/loss_avg_len_1024": 4.536794685186399, "val/perplexity_len_1024": 93.39097238779135, "val/loss_avg_len_512": 4.567255290885735, "val/perplexity_len_512": 96.27948759639831}
24
- {"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 4767.3171031199745, "val/train_update_time": 2679.296893617313, "val/loss": 4.492667575135734, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.70578284398653, "val/val_tokens_per_second": 451569.8857971491, "val/loss_avg_len_2048": 4.492667575135734, "val/perplexity_len_2048": 89.35950140608207, "val/loss_avg_len_1024": 4.509617001681402, "val/perplexity_len_1024": 90.88700227462165, "val/loss_avg_len_512": 4.5411422349753785, "val/perplexity_len_512": 93.7978781707472}
25
- {"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 4969.727964510967, "val/train_update_time": 2790.797476610227, "val/loss": 4.4686831552135295, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.87361066404264, "val/val_tokens_per_second": 450735.9144276554, "val/loss_avg_len_2048": 4.4686831552135295, "val/perplexity_len_2048": 87.24176347672332, "val/loss_avg_len_1024": 4.486252108311607, "val/perplexity_len_1024": 88.78805350190629, "val/loss_avg_len_512": 4.51881691169506, "val/perplexity_len_512": 91.72701260192157}
26
- {"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 5172.765950003988, "val/train_update_time": 2902.285505968146, "val/loss": 4.4460520937834875, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.76087000104599, "val/val_tokens_per_second": 451295.80621613644, "val/loss_avg_len_2048": 4.4460520937834875, "val/perplexity_len_2048": 85.28956326961926, "val/loss_avg_len_1024": 4.464425405966584, "val/perplexity_len_1024": 86.87109958090244, "val/loss_avg_len_512": 4.498216118935217, "val/perplexity_len_512": 89.85669458704189}
27
- {"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 5375.2188087760005, "val/train_update_time": 3013.786068893096, "val/loss": 4.420050854198402, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.47328536800342, "val/val_tokens_per_second": 452730.3262326961, "val/loss_avg_len_2048": 4.420050854198402, "val/perplexity_len_2048": 83.10051126077735, "val/loss_avg_len_1024": 4.439237378784268, "val/perplexity_len_1024": 84.71031515065614, "val/loss_avg_len_512": 4.474183511526417, "val/perplexity_len_512": 87.7229463868}
28
- {"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 5577.396705480001, "val/train_update_time": 3125.264993761957, "val/loss": 4.398202258219151, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.4617071620305, "val/val_tokens_per_second": 452788.27124757314, "val/loss_avg_len_2048": 4.398202258219151, "val/perplexity_len_2048": 81.30457257597935, "val/loss_avg_len_1024": 4.418109852228035, "val/perplexity_len_1024": 82.93936944356574, "val/loss_avg_len_512": 4.454481553460378, "val/perplexity_len_512": 86.01154689490046}
29
- {"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 5779.562335913011, "val/train_update_time": 3236.749040101946, "val/loss": 4.376139390771115, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.2175632059807, "val/val_tokens_per_second": 454013.5927467024, "val/loss_avg_len_2048": 4.376139390771115, "val/perplexity_len_2048": 79.53040415674566, "val/loss_avg_len_1024": 4.396944615813718, "val/perplexity_len_1024": 81.2023847690807, "val/loss_avg_len_512": 4.435031853418239, "val/perplexity_len_512": 84.35481183459801}
30
- {"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 5981.512405745976, "val/train_update_time": 3348.2465934828506, "val/loss": 4.355563867319981, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.72774724900955, "val/val_tokens_per_second": 451460.5646228822, "val/loss_avg_len_2048": 4.355563867319981, "val/perplexity_len_2048": 77.9107442760098, "val/loss_avg_len_1024": 4.377231672070688, "val/perplexity_len_1024": 79.61732119023712, "val/loss_avg_len_512": 4.4167310255174534, "val/perplexity_len_512": 82.82508923002905}
31
- {"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 6184.421642497007, "val/train_update_time": 3459.7488345169113, "val/loss": 4.335396474500792, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.66217868198873, "val/val_tokens_per_second": 451787.0692659326, "val/loss_avg_len_2048": 4.335396474500792, "val/perplexity_len_2048": 76.35522578937196, "val/loss_avg_len_1024": 4.358097750052391, "val/perplexity_len_1024": 78.10841129235861, "val/loss_avg_len_512": 4.399293633644097, "val/perplexity_len_512": 81.3933548269956}
32
- {"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 6386.8215373010025, "val/train_update_time": 3571.2574781817966, "val/loss": 4.31625511668981, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.28017372795148, "val/val_tokens_per_second": 453698.72817733017, "val/loss_avg_len_2048": 4.31625511668981, "val/perplexity_len_2048": 74.90758222363428, "val/loss_avg_len_1024": 4.339928405715012, "val/perplexity_len_1024": 76.70204771345033, "val/loss_avg_len_512": 4.382818944332655, "val/perplexity_len_512": 80.06340988951868}
33
- {"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 6588.8636812510085, "val/train_update_time": 3682.7761907348176, "val/loss": 4.2996819401991555, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.226537807961, "val/val_tokens_per_second": 453968.4331807084, "val/loss_avg_len_2048": 4.2996819401991555, "val/perplexity_len_2048": 73.67635648530485, "val/loss_avg_len_1024": 4.324086923091021, "val/perplexity_len_1024": 75.49654722507492, "val/loss_avg_len_512": 4.368286226595659, "val/perplexity_len_512": 78.90828483582929}
34
- {"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 6790.794109267998, "val/train_update_time": 3794.261904676736, "val/loss": 4.283236857734924, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.13755302602658, "val/val_tokens_per_second": 454416.59580189723, "val/loss_avg_len_2048": 4.283236857734924, "val/perplexity_len_2048": 72.47465088349, "val/loss_avg_len_1024": 4.308458720062673, "val/perplexity_len_1024": 74.32584368113129, "val/loss_avg_len_512": 4.3540999811033725, "val/perplexity_len_512": 77.7967752505338}
35
- {"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 6992.639220207988, "val/train_update_time": 3905.7538149688044, "val/loss": 4.267898958559893, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.21615546499379, "val/val_tokens_per_second": 454020.67721555196, "val/loss_avg_len_2048": 4.267898958559893, "val/perplexity_len_2048": 71.371523450084, "val/loss_avg_len_1024": 4.29405301307696, "val/perplexity_len_1024": 73.26280266811683, "val/loss_avg_len_512": 4.3412228272167965, "val/perplexity_len_512": 76.80139677915308}
36
- {"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 7195.047693797969, "val/train_update_time": 4017.255656591733, "val/loss": 4.25385695036254, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.40624964801827, "val/val_tokens_per_second": 453066.0231949778, "val/loss_avg_len_2048": 4.25385695036254, "val/perplexity_len_2048": 70.3763275596729, "val/loss_avg_len_1024": 4.280959435730102, "val/perplexity_len_1024": 72.30978332653642, "val/loss_avg_len_512": 4.3297005797375, "val/perplexity_len_512": 75.92155071492496}
37
- {"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 7397.135954166006, "val/train_update_time": 4128.71436746855, "val/loss": 4.241492192271679, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.44096185202943, "val/val_tokens_per_second": 452892.13163184514, "val/loss_avg_len_2048": 4.241492192271679, "val/perplexity_len_2048": 69.51149901038498, "val/loss_avg_len_1024": 4.26916799461185, "val/perplexity_len_1024": 71.46215398096777, "val/loss_avg_len_512": 4.318984106020722, "val/perplexity_len_512": 75.11228340295357}
38
- {"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 7599.255373338994, "val/train_update_time": 4240.1828456086805, "val/loss": 4.230574601543066, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.44047201605281, "val/val_tokens_per_second": 452894.58454761014, "val/loss_avg_len_2048": 4.230574601543066, "val/perplexity_len_2048": 68.75672854774076, "val/loss_avg_len_1024": 4.259278581639892, "val/perplexity_len_1024": 70.75891825403492, "val/loss_avg_len_512": 4.310739070640505, "val/perplexity_len_512": 74.49552605583897}
39
- {"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 7801.41744280397, "val/train_update_time": 4351.678691691719, "val/loss": 4.220437906114012, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.25361566705396, "val/val_tokens_per_second": 453832.23372570076, "val/loss_avg_len_2048": 4.220437906114012, "val/perplexity_len_2048": 68.06328309221031, "val/loss_avg_len_1024": 4.2494930887183635, "val/perplexity_len_1024": 70.0698841278538, "val/loss_avg_len_512": 4.301655115112848, "val/perplexity_len_512": 73.82187634450761}
40
- {"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 8003.3749305050005, "val/train_update_time": 4463.172267011658, "val/loss": 4.212082446529414, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.65977515699342, "val/val_tokens_per_second": 451799.04681067786, "val/loss_avg_len_2048": 4.212082446529414, "val/perplexity_len_2048": 67.49695235274098, "val/loss_avg_len_1024": 4.241754451114219, "val/perplexity_len_1024": 69.52973140635773, "val/loss_avg_len_512": 4.295026430321672, "val/perplexity_len_512": 73.33415266465448}
41
- {"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 8206.275687849964, "val/train_update_time": 4574.681749307667, "val/loss": 4.20452369724242, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.39738410396967, "val/val_tokens_per_second": 453110.45674607414, "val/loss_avg_len_2048": 4.20452369724242, "val/perplexity_len_2048": 66.9886831719058, "val/loss_avg_len_1024": 4.23449458040651, "val/perplexity_len_1024": 69.02678242730825, "val/loss_avg_len_512": 4.288312720157765, "val/perplexity_len_512": 72.84345745438628}
42
- {"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 8408.397738418018, "val/train_update_time": 4686.193155970657, "val/loss": 4.198343608177501, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.30731243803166, "val/val_tokens_per_second": 453562.3848634241, "val/loss_avg_len_2048": 4.198343608177501, "val/perplexity_len_2048": 66.57596377846592, "val/loss_avg_len_1024": 4.228689314186201, "val/perplexity_len_1024": 68.62722447126744, "val/loss_avg_len_512": 4.283119718784839, "val/perplexity_len_512": 72.46616177619002}
43
- {"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 8610.412180389976, "val/train_update_time": 4797.692331016588, "val/loss": 4.193367760937754, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.37212692195317, "val/val_tokens_per_second": 453237.0919561705, "val/loss_avg_len_2048": 4.193367760937754, "val/perplexity_len_2048": 66.24551476656741, "val/loss_avg_len_1024": 4.224034787101439, "val/perplexity_len_1024": 68.30853943562786, "val/loss_avg_len_512": 4.2790641182546505, "val/perplexity_len_512": 72.17286312516411}
44
- {"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 8812.488870778994, "val/train_update_time": 4909.191867132671, "val/loss": 4.189421963141696, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.32052933302475, "val/val_tokens_per_second": 453496.01361363375, "val/loss_avg_len_2048": 4.189421963141696, "val/perplexity_len_2048": 65.98463838160738, "val/loss_avg_len_1024": 4.22020412462051, "val/perplexity_len_1024": 68.04737301604953, "val/loss_avg_len_512": 4.2755115066579545, "val/perplexity_len_512": 71.9169158844202}
45
- {"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 9014.539005896018, "val/train_update_time": 5020.712582220614, "val/loss": 4.186491362652555, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.58312447200296, "val/val_tokens_per_second": 452181.35539870604, "val/loss_avg_len_2048": 4.186491362652555, "val/perplexity_len_2048": 65.79154684336461, "val/loss_avg_len_1024": 4.217514740810637, "val/perplexity_len_1024": 67.86461337831629, "val/loss_avg_len_512": 4.273214724269416, "val/perplexity_len_512": 71.75192792183111}
46
- {"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 9217.298647498013, "val/train_update_time": 5132.212492840539, "val/loss": 4.184350719433324, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.41417535103392, "val/val_tokens_per_second": 453026.30744540226, "val/loss_avg_len_2048": 4.184350719433324, "val/perplexity_len_2048": 65.65086124728785, "val/loss_avg_len_1024": 4.215543735674023, "val/perplexity_len_1024": 67.73098361249181, "val/loss_avg_len_512": 4.271550920667592, "val/perplexity_len_512": 71.63264606402603}
47
- {"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 9419.395047847007, "val/train_update_time": 5243.674068749533, "val/loss": 4.183023557277815, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.3052227080334, "val/val_tokens_per_second": 453572.88063424785, "val/loss_avg_len_2048": 4.183023557277815, "val/perplexity_len_2048": 65.56378970057509, "val/loss_avg_len_1024": 4.214286808201578, "val/perplexity_len_1024": 67.64590415900594, "val/loss_avg_len_512": 4.270415190260951, "val/perplexity_len_512": 71.5513368711842}
48
- {"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 9621.420487532974, "val/train_update_time": 5355.161261588568, "val/loss": 4.18224084139755, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.24478351400467, "val/val_tokens_per_second": 453876.649763846, "val/loss_avg_len_2048": 4.18224084139755, "val/perplexity_len_2048": 65.51249195960324, "val/loss_avg_len_1024": 4.213500716115302, "val/perplexity_len_1024": 67.59274914418302, "val/loss_avg_len_512": 4.269666781060863, "val/perplexity_len_512": 71.4978072259293}
49
- {"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 9823.39123856998, "val/train_update_time": 5466.646158660587, "val/loss": 4.1819093990348515, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.3263800109853, "val/val_tokens_per_second": 453466.6394802773, "val/loss_avg_len_2048": 4.1819093990348515, "val/perplexity_len_2048": 65.49078194249032, "val/loss_avg_len_1024": 4.213224817466876, "val/perplexity_len_1024": 67.57410296839636, "val/loss_avg_len_512": 4.269427753666788, "val/perplexity_len_512": 71.48071933370453}
 
1
+ {"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 137.6325364280492, "val/train_update_time": 137.34443728171755, "val/loss": 8.097969826221465, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.75021567591466, "val/val_tokens_per_second": 406549.99818319886, "val/loss_avg_len_2048": 8.097969826221465, "val/perplexity_len_2048": 3287.7865172400466, "val/loss_avg_len_1024": 8.100544073104858, "val/perplexity_len_1024": 3296.26099445374, "val/loss_avg_len_512": 8.100835843849183, "val/perplexity_len_512": 3297.22288729685}
2
+ {"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 373.6941461900715, "val/train_update_time": 272.43619964295067, "val/loss": 7.509516383218766, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.37394046399277, "val/val_tokens_per_second": 408074.04601888295, "val/loss_avg_len_2048": 7.509516383218766, "val/perplexity_len_2048": 1825.3305688289142, "val/loss_avg_len_1024": 7.512931452941896, "val/perplexity_len_1024": 1831.5748562589274, "val/loss_avg_len_512": 7.514602960872651, "val/perplexity_len_512": 1834.638908237129}
3
+ {"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 609.3696456589969, "val/train_update_time": 407.4915405898355, "val/loss": 7.194160730051995, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.84995853202417, "val/val_tokens_per_second": 410215.49334808375, "val/loss_avg_len_2048": 7.194160730051995, "val/perplexity_len_2048": 1331.6322575366655, "val/loss_avg_len_1024": 7.199122499632836, "val/perplexity_len_1024": 1338.255928941527, "val/loss_avg_len_512": 7.204208867883683, "val/perplexity_len_512": 1345.0801319021539}
4
+ {"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 844.4406513640424, "val/train_update_time": 542.4893449847586, "val/loss": 6.966771374350786, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.66927708103321, "val/val_tokens_per_second": 406876.86638525734, "val/loss_avg_len_2048": 6.966771374350786, "val/perplexity_len_2048": 1060.7923144461163, "val/loss_avg_len_1024": 6.972864169692993, "val/perplexity_len_1024": 1067.2752344159362, "val/loss_avg_len_512": 6.979901323080063, "val/perplexity_len_512": 1074.8123025952734}
5
+ {"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 1080.3595280270092, "val/train_update_time": 677.5131666237721, "val/loss": 6.773546469050646, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.50302975601517, "val/val_tokens_per_second": 407549.90271871403, "val/loss_avg_len_2048": 6.773546469050646, "val/perplexity_len_2048": 874.407460577292, "val/loss_avg_len_1024": 6.781000710988045, "val/perplexity_len_1024": 880.9498593575831, "val/loss_avg_len_512": 6.790085806655885, "val/perplexity_len_512": 888.9898398032801}
6
+ {"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 1316.5735572939739, "val/train_update_time": 812.5169301189017, "val/loss": 6.612147194027901, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.81014677300118, "val/val_tokens_per_second": 406308.30636752775, "val/loss_avg_len_2048": 6.612147194027901, "val/perplexity_len_2048": 744.0789866334422, "val/loss_avg_len_1024": 6.620804714360833, "val/perplexity_len_1024": 750.5488315856857, "val/loss_avg_len_512": 6.631752208733559, "val/perplexity_len_512": 758.8106010204962}
7
+ {"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 1552.605858018971, "val/train_update_time": 947.4958640788682, "val/loss": 6.478030627672375, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.42867044196464, "val/val_tokens_per_second": 407851.6604844412, "val/loss_avg_len_2048": 6.478030627672375, "val/perplexity_len_2048": 650.6882362125918, "val/loss_avg_len_1024": 6.487826892974973, "val/perplexity_len_1024": 657.0938752461975, "val/loss_avg_len_512": 6.500485957857967, "val/perplexity_len_512": 665.4649423985985}
8
+ {"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 1788.244720379007, "val/train_update_time": 1082.4689192509977, "val/loss": 6.371274175237306, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.33702367299702, "val/val_tokens_per_second": 408224.18784805224, "val/loss_avg_len_2048": 6.371274175237306, "val/perplexity_len_2048": 584.8024952309793, "val/loss_avg_len_1024": 6.382026187753306, "val/perplexity_len_1024": 591.1242237288146, "val/loss_avg_len_512": 6.395917515824736, "val/perplexity_len_512": 599.3930235893379}
9
+ {"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 2023.8116153230658, "val/train_update_time": 1217.4519352857023, "val/loss": 6.273945900231414, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.3569782380946, "val/val_tokens_per_second": 408143.0182445644, "val/loss_avg_len_2048": 6.273945900231414, "val/perplexity_len_2048": 530.5668165325196, "val/loss_avg_len_1024": 6.285613833794371, "val/perplexity_len_1024": 536.7936916369549, "val/loss_avg_len_512": 6.301188182660939, "val/perplexity_len_512": 545.2193455867996}
10
+ {"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 2259.3751640570117, "val/train_update_time": 1352.4327959185466, "val/loss": 6.199569064133987, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.38574551208876, "val/val_tokens_per_second": 408026.0577939073, "val/loss_avg_len_2048": 6.199569064133987, "val/perplexity_len_2048": 492.53674360509217, "val/loss_avg_len_1024": 6.211747290783562, "val/perplexity_len_1024": 498.57164028604257, "val/loss_avg_len_512": 6.228355393150077, "val/perplexity_len_512": 506.92111164444407}
11
+ {"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 2495.436698611011, "val/train_update_time": 1487.4421038717264, "val/loss": 6.132018646597304, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.70304475398734, "val/val_tokens_per_second": 406740.43272537884, "val/loss_avg_len_2048": 6.132018646597304, "val/perplexity_len_2048": 460.3645366385984, "val/loss_avg_len_1024": 6.144993824179471, "val/perplexity_len_1024": 466.37676879497087, "val/loss_avg_len_512": 6.16288100191094, "val/perplexity_len_512": 474.7939886785471}
12
+ {"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 2731.4342436430743, "val/train_update_time": 1622.4905407206388, "val/loss": 6.0769569249046045, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.99657162395306, "val/val_tokens_per_second": 405558.3208557709, "val/loss_avg_len_2048": 6.0769569249046045, "val/perplexity_len_2048": 435.7013035210223, "val/loss_avg_len_1024": 6.090529228214921, "val/perplexity_len_1024": 441.65508563953284, "val/loss_avg_len_512": 6.1094287188325085, "val/perplexity_len_512": 450.081518613422}
13
+ {"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 2967.7298853070242, "val/train_update_time": 1757.5320481728995, "val/loss": 6.024966034000274, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.81495576910675, "val/val_tokens_per_second": 406288.92496674176, "val/loss_avg_len_2048": 6.024966034000274, "val/perplexity_len_2048": 413.6275925097331, "val/loss_avg_len_1024": 6.039175203947909, "val/perplexity_len_1024": 419.54685155414825, "val/loss_avg_len_512": 6.058873298077286, "val/perplexity_len_512": 427.89305723099136}
14
+ {"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 3203.840811592061, "val/train_update_time": 1892.5856667858316, "val/loss": 5.973690415629559, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.20035982609261, "val/val_tokens_per_second": 408780.96716508834, "val/loss_avg_len_2048": 5.973690415629559, "val/perplexity_len_2048": 392.9531586695076, "val/loss_avg_len_1024": 5.9884474193267065, "val/perplexity_len_1024": 398.7949676698734, "val/loss_avg_len_512": 6.00929208433833, "val/perplexity_len_512": 407.19495852955055}
15
+ {"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 3439.312252484029, "val/train_update_time": 2027.637110557058, "val/loss": 5.9365289360866305, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.62716725701466, "val/val_tokens_per_second": 407047.13365708606, "val/loss_avg_len_2048": 5.9365289360866305, "val/perplexity_len_2048": 378.6184372510177, "val/loss_avg_len_1024": 5.951778036109708, "val/perplexity_len_1024": 384.4362733161468, "val/loss_avg_len_512": 5.973547525303998, "val/perplexity_len_512": 392.89701347613305}
16
+ {"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 3675.6837699849857, "val/train_update_time": 2162.6913318177685, "val/loss": 5.899252319797839, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.47976837505121, "val/val_tokens_per_second": 407644.2517971631, "val/loss_avg_len_2048": 5.899252319797839, "val/perplexity_len_2048": 364.76463858479445, "val/loss_avg_len_1024": 5.914946995119518, "val/perplexity_len_1024": 370.53466203833915, "val/loss_avg_len_512": 5.937802961644158, "val/perplexity_len_512": 379.10111422273775}
17
+ {"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 3911.4375108770328, "val/train_update_time": 2297.7100352901034, "val/loss": 5.866309138357454, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.25587885000277, "val/val_tokens_per_second": 408554.5952001683, "val/loss_avg_len_2048": 5.866309138357454, "val/perplexity_len_2048": 352.9439062000189, "val/loss_avg_len_1024": 5.882450300783432, "val/perplexity_len_1024": 358.6870569906692, "val/loss_avg_len_512": 5.905991363677662, "val/perplexity_len_512": 367.2311049681368}
18
+ {"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 4147.001440979075, "val/train_update_time": 2432.7737647151807, "val/loss": 5.83839783823652, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.45364938897546, "val/val_tokens_per_second": 407750.24351176294, "val/loss_avg_len_2048": 5.83839783823652, "val/perplexity_len_2048": 343.2289915421645, "val/loss_avg_len_1024": 5.855042658005282, "val/perplexity_len_1024": 348.9897869446356, "val/loss_avg_len_512": 5.879319992480893, "val/perplexity_len_512": 357.56601144266216}
19
+ {"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 4382.723289367976, "val/train_update_time": 2567.801604798413, "val/loss": 5.807034053320264, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.15528536809143, "val/val_tokens_per_second": 408964.9372917616, "val/loss_avg_len_2048": 5.807034053320264, "val/perplexity_len_2048": 332.6310950837973, "val/loss_avg_len_1024": 5.8241930332270915, "val/perplexity_len_1024": 338.38795503388445, "val/loss_avg_len_512": 5.849660384528758, "val/perplexity_len_512": 347.11647433348577}
20
+ {"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 4618.154446462984, "val/train_update_time": 2702.834149704431, "val/loss": 5.7824100695554, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.07526794599835, "val/val_tokens_per_second": 409291.9343678644, "val/loss_avg_len_2048": 5.7824100695554, "val/perplexity_len_2048": 324.5404136083224, "val/loss_avg_len_1024": 5.799937409455178, "val/perplexity_len_1024": 330.27888692721285, "val/loss_avg_len_512": 5.826135484898323, "val/perplexity_len_512": 339.0458960851378}
21
+ {"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 4853.921031603008, "val/train_update_time": 2837.8658607284306, "val/loss": 5.758844941980299, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.82154056197032, "val/val_tokens_per_second": 410332.2766750086, "val/loss_avg_len_2048": 5.758844941980299, "val/perplexity_len_2048": 316.9819848005806, "val/loss_avg_len_1024": 5.776860027853982, "val/perplexity_len_1024": 322.7441899478597, "val/loss_avg_len_512": 5.803712826554431, "val/perplexity_len_512": 331.5281843098571}
22
+ {"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 5089.012363151065, "val/train_update_time": 2972.8906648436096, "val/loss": 5.737493131028349, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.32674534106627, "val/val_tokens_per_second": 408266.0098337112, "val/loss_avg_len_2048": 5.737493131028349, "val/perplexity_len_2048": 310.2855898704255, "val/loss_avg_len_1024": 5.755842571069266, "val/perplexity_len_1024": 316.0317145557937, "val/loss_avg_len_512": 5.7835284965001055, "val/perplexity_len_512": 324.90359140760836}
23
+ {"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 5324.629123619059, "val/train_update_time": 3107.9427982217167, "val/loss": 5.716309683084255, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.80655148799997, "val/val_tokens_per_second": 410393.9008946195, "val/loss_avg_len_2048": 5.716309683084255, "val/perplexity_len_2048": 303.78180077501713, "val/loss_avg_len_1024": 5.734943081327493, "val/perplexity_len_1024": 309.4953541931483, "val/loss_avg_len_512": 5.763196059111692, "val/perplexity_len_512": 318.3642154880775}
24
+ {"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 5559.744059987017, "val/train_update_time": 3242.99941329984, "val/loss": 5.700755887414662, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.3600756629603, "val/val_tokens_per_second": 408130.42167839885, "val/loss_avg_len_2048": 5.700755887414662, "val/perplexity_len_2048": 299.0933964769242, "val/loss_avg_len_1024": 5.719829780167866, "val/perplexity_len_1024": 304.8530265091337, "val/loss_avg_len_512": 5.7488650611438565, "val/perplexity_len_512": 313.8342753728958}
25
+ {"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 5795.387424343033, "val/train_update_time": 3378.042265718919, "val/loss": 5.678921429768181, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.28977326198947, "val/val_tokens_per_second": 408416.5181329025, "val/loss_avg_len_2048": 5.678921429768181, "val/perplexity_len_2048": 292.633633722812, "val/loss_avg_len_1024": 5.698480519391351, "val/perplexity_len_1024": 298.4136225877653, "val/loss_avg_len_512": 5.728350201375177, "val/perplexity_len_512": 307.4615999924049}
26
+ {"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 6031.410010787076, "val/train_update_time": 3513.0946629439713, "val/loss": 5.662745923463191, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.90603236400057, "val/val_tokens_per_second": 409985.2534506138, "val/loss_avg_len_2048": 5.662745923463191, "val/perplexity_len_2048": 287.9382143067501, "val/loss_avg_len_1024": 5.68280623494921, "val/perplexity_len_1024": 293.7726694126168, "val/loss_avg_len_512": 5.713412875633244, "val/perplexity_len_512": 302.90307675268207}
27
+ {"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 6266.654250354972, "val/train_update_time": 3648.1618733102223, "val/loss": 5.650785900356039, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.81599653395824, "val/val_tokens_per_second": 410355.06754736515, "val/loss_avg_len_2048": 5.650785900356039, "val/perplexity_len_2048": 284.5149784059353, "val/loss_avg_len_1024": 5.6710522111513475, "val/perplexity_len_1024": 290.339872575047, "val/loss_avg_len_512": 5.7019783896925045, "val/perplexity_len_512": 299.45926242581703}
28
+ {"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 6501.729716927046, "val/train_update_time": 3783.1945504179457, "val/loss": 5.63273147483218, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.2614537080517, "val/val_tokens_per_second": 408531.8782557271, "val/loss_avg_len_2048": 5.63273147483218, "val/perplexity_len_2048": 279.4243166839047, "val/loss_avg_len_1024": 5.653354023430002, "val/perplexity_len_1024": 285.24658691511917, "val/loss_avg_len_512": 5.684925929842354, "val/perplexity_len_512": 294.3960382822156}
29
+ {"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 6737.260854291031, "val/train_update_time": 3918.229618334677, "val/loss": 5.619587636593296, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.2222726480104, "val/val_tokens_per_second": 408691.59037986683, "val/loss_avg_len_2048": 5.619587636593296, "val/perplexity_len_2048": 275.7756400022235, "val/loss_avg_len_1024": 5.640656256317569, "val/perplexity_len_1024": 281.647490784806, "val/loss_avg_len_512": 5.673037551006105, "val/perplexity_len_512": 290.9168684724483}
30
+ {"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 6972.698619629024, "val/train_update_time": 4053.2279500714503, "val/loss": 5.606948659898109, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.518833424896, "val/val_tokens_per_second": 407485.8273261181, "val/loss_avg_len_2048": 5.606948659898109, "val/perplexity_len_2048": 272.31205232418614, "val/loss_avg_len_1024": 5.6283089039244, "val/perplexity_len_1024": 278.1912714547652, "val/loss_avg_len_512": 5.661133850814554, "val/perplexity_len_512": 287.47441092978147}
31
+ {"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 7208.982709518052, "val/train_update_time": 4188.228603066411, "val/loss": 5.5957299809077465, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.08489269192796, "val/val_tokens_per_second": 409252.5744727456, "val/loss_avg_len_2048": 5.5957299809077465, "val/perplexity_len_2048": 269.27414334897264, "val/loss_avg_len_1024": 5.61745572001168, "val/perplexity_len_1024": 275.18833560693116, "val/loss_avg_len_512": 5.650942821657832, "val/perplexity_len_512": 284.5596283699001}
32
+ {"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 7444.285434685065, "val/train_update_time": 4323.2266572538065, "val/loss": 5.584819928003452, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.3505922720069, "val/val_tokens_per_second": 408168.99106061296, "val/loss_avg_len_2048": 5.584819928003452, "val/perplexity_len_2048": 266.35231582754994, "val/loss_avg_len_1024": 5.606869975394156, "val/perplexity_len_1024": 272.29062642838284, "val/loss_avg_len_512": 5.640904106772971, "val/perplexity_len_512": 281.7173058951559}
33
+ {"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 7679.831617571064, "val/train_update_time": 4458.218750593718, "val/loss": 5.574874783031576, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.00809445802588, "val/val_tokens_per_second": 409566.84778341826, "val/loss_avg_len_2048": 5.574874783031576, "val/perplexity_len_2048": 263.7165317847754, "val/loss_avg_len_1024": 5.597197585988651, "val/perplexity_len_1024": 269.6696215818559, "val/loss_avg_len_512": 5.631731116879685, "val/perplexity_len_512": 279.14493211217706}
34
+ {"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 7915.0508393970085, "val/train_update_time": 4593.218119100784, "val/loss": 5.566041899083901, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.56229530100245, "val/val_tokens_per_second": 407309.71660301485, "val/loss_avg_len_2048": 5.566041899083901, "val/perplexity_len_2048": 261.39741160224924, "val/loss_avg_len_1024": 5.588593836321857, "val/perplexity_len_1024": 267.3594041821592, "val/loss_avg_len_512": 5.623624696305592, "val/perplexity_len_512": 276.8912130304534}
35
+ {"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 8150.834610218997, "val/train_update_time": 4728.229743778473, "val/loss": 5.558274031948855, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.88718269299716, "val/val_tokens_per_second": 410062.62160672195, "val/loss_avg_len_2048": 5.558274031948855, "val/perplexity_len_2048": 259.3747771876053, "val/loss_avg_len_1024": 5.581072434914089, "val/perplexity_len_1024": 265.35603031714436, "val/loss_avg_len_512": 5.616466402893375, "val/perplexity_len_512": 274.9162217014239}
36
+ {"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 8386.480246171006, "val/train_update_time": 4863.235517622321, "val/loss": 5.550198807562748, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.08558377600275, "val/val_tokens_per_second": 409249.7486118562, "val/loss_avg_len_2048": 5.550198807562748, "val/perplexity_len_2048": 257.2887017612347, "val/loss_avg_len_1024": 5.5732162203862625, "val/perplexity_len_1024": 263.27950391543754, "val/loss_avg_len_512": 5.609014750022057, "val/perplexity_len_512": 272.87525517970744}
37
+ {"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 8621.791595970979, "val/train_update_time": 4998.250562901143, "val/loss": 5.543446067120804, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.18454685201868, "val/val_tokens_per_second": 408845.4885213135, "val/loss_avg_len_2048": 5.543446067120804, "val/perplexity_len_2048": 255.55715087625723, "val/loss_avg_len_1024": 5.5666869797617435, "val/perplexity_len_1024": 261.5660884209409, "val/loss_avg_len_512": 5.602842293289584, "val/perplexity_len_512": 271.19613196255517}
38
+ {"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 8857.199786913, "val/train_update_time": 5133.253916564281, "val/loss": 5.537739042675431, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.4387403059518, "val/val_tokens_per_second": 407810.76978095865, "val/loss_avg_len_2048": 5.537739042675431, "val/perplexity_len_2048": 254.1028338277623, "val/loss_avg_len_1024": 5.561090528252709, "val/perplexity_len_1024": 260.1063350202336, "val/loss_avg_len_512": 5.59754501928999, "val/perplexity_len_512": 269.7633300665345}
39
+ {"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 9092.876169198076, "val/train_update_time": 5268.2632564986125, "val/loss": 5.53244903423907, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.48596478602849, "val/val_tokens_per_second": 407619.11464172014, "val/loss_avg_len_2048": 5.53244903423907, "val/perplexity_len_2048": 252.76217686286913, "val/loss_avg_len_1024": 5.556028684659931, "val/perplexity_len_1024": 258.79304407566934, "val/loss_avg_len_512": 5.5928602047552936, "val/perplexity_len_512": 268.5024945930998}
40
+ {"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 9328.580250757048, "val/train_update_time": 5403.266901573632, "val/loss": 5.527832486407514, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.72994552296586, "val/val_tokens_per_second": 406631.8093129649, "val/loss_avg_len_2048": 5.527832486407514, "val/perplexity_len_2048": 251.5979775419659, "val/loss_avg_len_1024": 5.551547197632486, "val/perplexity_len_1024": 257.6358612923782, "val/loss_avg_len_512": 5.5885610032757045, "val/perplexity_len_512": 267.3506261026088}
41
+ {"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 9565.069234885043, "val/train_update_time": 5538.269277713727, "val/loss": 5.52392855829644, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.26257929799613, "val/val_tokens_per_second": 408527.2919047938, "val/loss_avg_len_2048": 5.52392855829644, "val/perplexity_len_2048": 250.61767189118544, "val/loss_avg_len_1024": 5.5477417929501565, "val/perplexity_len_1024": 256.65731564144943, "val/loss_avg_len_512": 5.584945974971866, "val/perplexity_len_512": 266.38589084546015}
42
+ {"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 9800.571015445981, "val/train_update_time": 5673.296408364549, "val/loss": 5.520575026244429, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.78671431797557, "val/val_tokens_per_second": 406402.77121023956, "val/loss_avg_len_2048": 5.520575026244429, "val/perplexity_len_2048": 249.77862516707805, "val/loss_avg_len_1024": 5.5444952042206666, "val/perplexity_len_1024": 255.8254060580028, "val/loss_avg_len_512": 5.581906709400076, "val/perplexity_len_512": 265.5775024543716}
43
+ {"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 10036.600267508999, "val/train_update_time": 5808.320603007567, "val/loss": 5.517816926866491, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.26595556503162, "val/val_tokens_per_second": 408513.53551838145, "val/loss_avg_len_2048": 5.517816926866491, "val/perplexity_len_2048": 249.09066007055384, "val/loss_avg_len_1024": 5.541841141779768, "val/perplexity_len_1024": 255.14732968290977, "val/loss_avg_len_512": 5.579388280792035, "val/perplexity_len_512": 264.90950597831915}
44
+ {"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 10272.080307441996, "val/train_update_time": 5943.3299956357805, "val/loss": 5.515734384438093, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.35448634903878, "val/val_tokens_per_second": 408153.1527901874, "val/loss_avg_len_2048": 5.515734384438093, "val/perplexity_len_2048": 248.57245797864684, "val/loss_avg_len_1024": 5.53980909948076, "val/perplexity_len_1024": 254.62938593653982, "val/loss_avg_len_512": 5.577455671829229, "val/perplexity_len_512": 264.3980338896991}
45
+ {"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 10507.649754095008, "val/train_update_time": 6078.3277447193395, "val/loss": 5.514074763978267, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.78998316905927, "val/val_tokens_per_second": 406389.59063318896, "val/loss_avg_len_2048": 5.514074763978267, "val/perplexity_len_2048": 248.16026417887818, "val/loss_avg_len_1024": 5.538199598210049, "val/perplexity_len_1024": 254.2198892474354, "val/loss_avg_len_512": 5.57595400317039, "val/perplexity_len_512": 264.00129360952474}
46
+ {"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 10899.09726011497, "val/train_update_time": 6367.991133535514, "val/loss": 5.512881924894559, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 256.65374484693166, "val/val_tokens_per_second": 159592.45022677752, "val/loss_avg_len_2048": 5.512881924894559, "val/perplexity_len_2048": 247.86442539585198, "val/loss_avg_len_1024": 5.5370327444368685, "val/perplexity_len_1024": 253.92342480941105, "val/loss_avg_len_512": 5.574820502644323, "val/perplexity_len_512": 263.70221753780015}
47
+ {"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 11471.464317596983, "val/train_update_time": 6683.226909449557, "val/loss": 5.5120874294808715, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 269.61758992495015, "val/val_tokens_per_second": 151918.87150761005, "val/loss_avg_len_2048": 5.5120874294808715, "val/perplexity_len_2048": 247.66757645480368, "val/loss_avg_len_1024": 5.536254765025649, "val/perplexity_len_1024": 253.72595443678495, "val/loss_avg_len_512": 5.574072946217401, "val/perplexity_len_512": 263.5051589156827}
48
+ {"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 12034.565373313962, "val/train_update_time": 6976.259414628497, "val/loss": 5.511645271555128, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 284.5950597979827, "val/val_tokens_per_second": 143923.79133030312, "val/loss_avg_len_2048": 5.511645271555128, "val/perplexity_len_2048": 247.55809247931202, "val/loss_avg_len_1024": 5.535829944479786, "val/perplexity_len_1024": 253.6181893303579, "val/loss_avg_len_512": 5.573694263508974, "val/perplexity_len_512": 263.40539295943375}
49
+ {"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 12586.090243482962, "val/train_update_time": 7242.765970333596, "val/loss": 5.511438396552292, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 301.2079832049785, "val/val_tokens_per_second": 135985.77157274692, "val/loss_avg_len_2048": 5.511438396552292, "val/perplexity_len_2048": 247.5068841952679, "val/loss_avg_len_1024": 5.535628381019627, "val/perplexity_len_1024": 253.5670743221894, "val/loss_avg_len_512": 5.573483581502771, "val/perplexity_len_512": 263.34990402826514}
metrics/npz/train_eval/step-000000104857600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0cc9ccb7aaac127061005cd6916da7a0fdc9141a6974c4ce9749f33bfaa724f2
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cf5d84bbb384a5ed35bff5e67720db17263aaff3e72d07276587986862d9174
3
  size 20540
metrics/npz/train_eval/step-000000209715200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0aeceb03367da089f0498adc7839eac6c2c50c55310f643abc99139b5d11b4f7
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18da0fdb5a808db6ae8ce810f5cec11532442f666351bd5b4d8e1a75c64d0bb0
3
  size 20540
metrics/npz/train_eval/step-000000314572800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0df27fda4881a26991e0c397b6e1e3a9d65baf937e9d001e99a68387e05b459
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88a5c5d61b855a58cddbd35d37b9601195794c15f09ddc3c5d20864caacb6a81
3
  size 20540
metrics/npz/train_eval/step-000000419430400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe26f11b78bfb02f809af362b6c54ee2cbbc05dcb799bf5ce26f93239ec8ef14
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:515dfb9054b9bbab1668e9065591fb7e7bab14730c9e82620a1ef456c43603e6
3
  size 20540
metrics/npz/train_eval/step-000000524288000.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:19c245a7419211259996f35cf65b3624bce3fb528b539153943e31fd247051f5
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d65701667108388525ef127f105b1fe131f7ad2337ccf74943aeb0e33ecb6e33
3
  size 20540
metrics/npz/train_eval/step-000000629145600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6d594540203718747de80aeed7d78f257f057ceabb17498738985fd7bf5cccf
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58d04f720b85c446a2d4e374352e3b2ce81ac5e180e02ac5ad5e7e8ae48125bb
3
  size 20540
metrics/npz/train_eval/step-000000734003200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9f47eed7277ce3e573dc8d3b1599962824dd5681d8412d943ac4821fc7ac6539
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45713c1b2c7ae72429b5954fa69c547976b917e01fb7d18f4afe19156a8060df
3
  size 20540
metrics/npz/train_eval/step-000000838860800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4f766de285d162dea7f84d75b92f8f0af452d47c378929950b89467ca5aad3ee
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4e61cd36e7f7c9bd5418a50d7f1110c844524ae015cd8fff2385390fbfd546e
3
  size 20540
metrics/npz/train_eval/step-000000943718400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:962ca33e9daf0eff241acee173d547c3784f67d895ab231209b622c4b8ddc50e
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efd19824b71399982fd4597c3ec748e3ed162c03f0238a7afadf4912a03ec426
3
  size 20540
metrics/npz/train_eval/step-000001048576000.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:40fa4c61ddc1addb7fda301757febf0bd1b923a5d6cd2610a9d495d0e1328db3
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0aabd48e8e5a8bdf87e2079537c6d3ee09db8794637c4f2f848a6c71d6d3a61
3
  size 20540
metrics/npz/train_eval/step-000001153433600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02de2b7a49c30cb6207648d3c5baf02cb2dcc10b956fd850354a2557eeaab7b0
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27308144d6b352e7fad503390e18ea8cfa021701a4cca346fea659dda63b9350
3
  size 20540
metrics/npz/train_eval/step-000001258291200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b56c36fa53729b512adff50d89e7641ff6aa48e7a2418ba2326ac4daa1f78999
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d53fff2a6858900eca70ec6aab7aa4540f97d46ad666f12122dc29f8e358549
3
  size 20540
metrics/npz/train_eval/step-000001363148800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f7d719cd57293966756c88f928ac561941e0c39898b31c6d25c8c56f5c21e64
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aae4b5aefc00c7c0049e6436ccdf6865d96185ef36041a0c887ea3c19bda2df7
3
  size 20540
metrics/npz/train_eval/step-000001468006400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d98de5240d31491fe46abe75f79278c60a12664aaaacb9ffabfee6c8daf8b4e6
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:980560c5c69a512265240185c1c0622bdb2c08532e5a76a06bdbe2eec0c759af
3
  size 20540
metrics/npz/train_eval/step-000001572864000.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d12803667e8453755efe60cbe4ce89f1cd74bf9b272052f150335069da474cd1
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6edc41fdebf362fbe6b1bd8131d0cdb29155c836d8edfb6b51673dcbd5a21ed0
3
  size 20540
metrics/npz/train_eval/step-000001677721600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5e8657c3c7d79428f8807bcd207213906927cd6a953b56ff4e6dc390d701763b
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07474999739945278b5cab74934b30351f702800be0f68693ec4edc70a5f02bb
3
  size 20540
metrics/npz/train_eval/step-000001782579200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:73db4b8038acb5e7a90551f60aa0c797607fab9282878271d42738f6ecfe28ee
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c69f954c643fb430cb55fc822dc547e5f1e5091eb54de1bf8c72d68b02871134
3
  size 20540
metrics/npz/train_eval/step-000001887436800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:76b602173aeb194e597e04a7368859aadbf5fcd083d8c6cd0a2fe2e54457a3ec
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9f3a686cee7527e44101251343da7f82ac0e60f195c420effec6e59f0b89f19
3
  size 20540
metrics/npz/train_eval/step-000001992294400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:35a775ddbac055759dc01b8e2b248726be2e1aa670cd5e7d41aaf344c6a78e04
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a421379bf34bc4da17cfb44894e6318dd0373d7f0796fb7a04f98ff97bd68d4
3
  size 20540
metrics/npz/val/step-000000041943040.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1cb48cd9dad7a056c937dd0f7b8f793ed49e2f3591d03fad8e00ac8fe953d864
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d460803409cd699e0c8d2c573ed45b4c5ad428d905ebb57bdd6599fb3104dd3a
3
  size 21142
metrics/npz/val/step-000000083886080.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9840ddd877c006b1a3adbcf06b51cdb4ee6460ccfd7a0b72b850f9dee5bba675
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb7c288497e770f3e8d3552e0ef8bf2483cfee12c60fa3c5017efbf1ff0e5088
3
  size 21142
metrics/npz/val/step-000000125829120.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f70834d868afc0c8b82a7d4af8ce24c59be911e829ecd7df4a4993d897ad2879
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1650e1457fc57890b94c5edf45edd1fe51088f8bc9855555661e17c5a1c089d0
3
  size 21142
metrics/npz/val/step-000000167772160.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a340a8edc847b8beb10b7fd1b6ba8cd5165493ab7ccfeef51f5646439ebd03e
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45feabea573f1f8d25f3e4833046698d6b89bb375f8c09f16fabf986100a830d
3
  size 21142
metrics/npz/val/step-000000209715200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:33c4ef6eef84fb214a21b6f4d5c761438d49470a2a6e81ab2194eba9f5e3de06
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:681de738dd504e8a33ad756187079d4be03cfe6ec2a26e5591b72bf9c606b821
3
  size 21142
metrics/npz/val/step-000000251658240.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f69f3a0b936ec88bc6005a43574a60bc91a8d147e74b1db3e8694db8ea5940ad
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb9faffd72382fa63f6da5f9ecbab8d65aba813c7bb323e9b6dae78b0627f8ca
3
  size 21142
metrics/npz/val/step-000000293601280.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71663e255c36887b4ddc18ece04f79564f440a87a64d87986934a0312b4d4f21
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0dca3d3ff0bc4173d37c70d15d7832157eda1299d9e9513c6aadf6f2083e86b4
3
  size 21142
metrics/npz/val/step-000000335544320.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:583ea9cf5717a91776de447a997a9602c8e124e32eaa39b81bc8770c3585d73b
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a6dc5f351ea6c0c2b785ef1133891cf55c9916bdad5ee9f6050bcaae818dfeb
3
  size 21142
metrics/npz/val/step-000000377487360.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:17b4c8432b92d6f010fa3f29e1682b2b024a8c3006e42ad0e4f31a4b438b30db
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80ade23d0ab226d2614f79d41c922defbdeb79a6f5acfe1a0dc05fb43c870080
3
  size 21142
metrics/npz/val/step-000000419430400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7e7adae8c8b3080833220d7ee882f2b757c56baf74c4ca9fa2a35b95fac4315a
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97c2596c96210e2712ccfdbb24164f6770b2f8fb0dc626a9b6cee9c365128123
3
  size 21142
metrics/npz/val/step-000000461373440.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c9923a7f84bfbe4751c87849440ce1b844c84f2482cdc0b2cb9c1c43828903e
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6277c7139d3759cebb5da02c17c213ab3e4f2c887282efa2288af945ea8ef32
3
  size 21142
metrics/npz/val/step-000000503316480.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0a71a11a871e7e11e59d5e2d4a3290bc20c839173c4bb53d02d5bb0b3a007c7
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fdb197bff11310adfb3639299c84d085bc32638ccd6672008cc8302ccb47195
3
  size 21142
metrics/npz/val/step-000000545259520.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5d37be56f811d3a807d84f3842d45c2fd1d9b6ca34d707593302d8712ed2c258
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22a003810f69405729520bb8a263994ed389e6c8c613cc97fd326067130b16fc
3
  size 21142
metrics/npz/val/step-000000587202560.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:efa731ed866f7a692cc2135c56d6581695ae0317f2260695e2eca31770c100e0
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73340fd8a624f2c43731a6613020584c9fc332ae4b24dcbd039350c33bec60b6
3
  size 21142