Lanni-ni commited on
Commit
cc8cf33
·
verified ·
1 Parent(s): eb064ae

add remote code + model files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .hydra/config.yaml +1 -1
  2. checkpoints/step-000000209715200.pt +2 -2
  3. checkpoints/step-000000419430400.pt +2 -2
  4. checkpoints/step-000000629145600.pt +2 -2
  5. checkpoints/step-000000838860800.pt +2 -2
  6. checkpoints/step-000001048576000.pt +2 -2
  7. checkpoints/step-000001258291200.pt +2 -2
  8. checkpoints/step-000001468006400.pt +2 -2
  9. checkpoints/step-000001677721600.pt +2 -2
  10. checkpoints/step-000001887436800.pt +2 -2
  11. config.yaml +1 -1
  12. decay_params.txt +13 -13
  13. logs/2025-10-26_21-16-14.log +258 -0
  14. metrics/jsonlines/checkpoint.jsonl +9 -9
  15. metrics/jsonlines/norm.jsonl +0 -0
  16. metrics/jsonlines/throughput.jsonl +0 -0
  17. metrics/jsonlines/train.jsonl +98 -98
  18. metrics/jsonlines/train_eval.jsonl +19 -19
  19. metrics/jsonlines/val.jsonl +49 -49
  20. metrics/npz/train_eval/step-000000104857600.npz +1 -1
  21. metrics/npz/train_eval/step-000000209715200.npz +1 -1
  22. metrics/npz/train_eval/step-000000314572800.npz +1 -1
  23. metrics/npz/train_eval/step-000000419430400.npz +1 -1
  24. metrics/npz/train_eval/step-000000524288000.npz +1 -1
  25. metrics/npz/train_eval/step-000000629145600.npz +1 -1
  26. metrics/npz/train_eval/step-000000734003200.npz +1 -1
  27. metrics/npz/train_eval/step-000000838860800.npz +1 -1
  28. metrics/npz/train_eval/step-000000943718400.npz +1 -1
  29. metrics/npz/train_eval/step-000001048576000.npz +1 -1
  30. metrics/npz/train_eval/step-000001153433600.npz +1 -1
  31. metrics/npz/train_eval/step-000001258291200.npz +1 -1
  32. metrics/npz/train_eval/step-000001363148800.npz +1 -1
  33. metrics/npz/train_eval/step-000001468006400.npz +1 -1
  34. metrics/npz/train_eval/step-000001572864000.npz +1 -1
  35. metrics/npz/train_eval/step-000001677721600.npz +1 -1
  36. metrics/npz/train_eval/step-000001782579200.npz +1 -1
  37. metrics/npz/train_eval/step-000001887436800.npz +1 -1
  38. metrics/npz/train_eval/step-000001992294400.npz +1 -1
  39. metrics/npz/val/step-000000041943040.npz +1 -1
  40. metrics/npz/val/step-000000083886080.npz +1 -1
  41. metrics/npz/val/step-000000125829120.npz +1 -1
  42. metrics/npz/val/step-000000167772160.npz +1 -1
  43. metrics/npz/val/step-000000209715200.npz +1 -1
  44. metrics/npz/val/step-000000251658240.npz +1 -1
  45. metrics/npz/val/step-000000293601280.npz +1 -1
  46. metrics/npz/val/step-000000335544320.npz +1 -1
  47. metrics/npz/val/step-000000377487360.npz +1 -1
  48. metrics/npz/val/step-000000419430400.npz +1 -1
  49. metrics/npz/val/step-000000461373440.npz +1 -1
  50. metrics/npz/val/step-000000503316480.npz +1 -1
.hydra/config.yaml CHANGED
@@ -81,7 +81,7 @@ train:
81
  max_tokens: 2097152000
82
  grad_acc_tokens: 32768
83
  max_grad_norm: 1.0
84
- gradient_checkpointing: true
85
  bias_weight_decay: false
86
  normalization_weight_decay: false
87
  conv_weight_decay: true
 
81
  max_tokens: 2097152000
82
  grad_acc_tokens: 32768
83
  max_grad_norm: 1.0
84
+ gradient_checkpointing: false
85
  bias_weight_decay: false
86
  normalization_weight_decay: false
87
  conv_weight_decay: true
checkpoints/step-000000209715200.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3596ad684b7b5c3533050c96dda223ffb053d49e1f227e367f93a911c9451a95
3
- size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9e45d1cf8fcb47d3de97c6d87e5f89f3999fa51cf1153d98e06ddd01738884a
3
+ size 329409794
checkpoints/step-000000419430400.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd7db428d4fe8e0788e2e1127e026b822b6f87bec105e98d76bc1f4e112ef145
3
- size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76200303f1e014e549314ee020fa0e5d22b5df5ab722b78939721493230cd0e9
3
+ size 329409794
checkpoints/step-000000629145600.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1048a72d60bdefe0fd931d6c2028486a3ec81f5512b21e2990f1f082539eaf01
3
- size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0edce8777efa82db6db58947a86af1e70965bcd8111d157ab482d93509e950ae
3
+ size 329409794
checkpoints/step-000000838860800.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cac0b75d4411b2c94b60cda43b32d38f62140e8533b9d760cdb4b11b7c450573
3
- size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5669d373118b51004bf8aea4fe7a13ee19dbdc68f9e312defe7b150448fe71b1
3
+ size 329409794
checkpoints/step-000001048576000.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:759bd71c4f858504bff61e87b38aa62710d888dc6842a39d5999c2740fae9d57
3
- size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee4dbd5ba0786230f8dc3da5b8d9004fc0a397ba0c66cb84281ac680baecca2d
3
+ size 329409794
checkpoints/step-000001258291200.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:305b86ef3a38ba06a63f4734f0bebc4e55cab923fbf9b2adef54d8b0e164f548
3
- size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f97eccdcb9b8e784360b41031ee52a305bbfa2f5f86aded5ce80cd7ba2f8fa26
3
+ size 329409794
checkpoints/step-000001468006400.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88d668ea1e8c49c30d9f8988770c78382d9fff490ee8ef7f1bd9c8cee51efe74
3
- size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9966465586a4d501839d48e7386d69869fbc6a1c9d5fb1b6f332d41f5b76b2b
3
+ size 329409794
checkpoints/step-000001677721600.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:85718ef1df320a10c05a05c1c8e367a7bdfd147dba12228dc68dfdd51d63c461
3
- size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:812207ef44485fecc552a69460100dd3edeb92b60f525c24ec4731075d854566
3
+ size 329409794
checkpoints/step-000001887436800.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8e34b44d8066dd7312fdada84cbb524946011c4ee286a987d3596b7b983fe0d
3
- size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12204d8faa3d1b317437b6a40475416449528599d5a3e0eabd4f077a5f4c8544
3
+ size 329409794
config.yaml CHANGED
@@ -81,7 +81,7 @@ train:
81
  max_tokens: 2097152000
82
  grad_acc_tokens: 32768
83
  max_grad_norm: 1.0
84
- gradient_checkpointing: true
85
  bias_weight_decay: false
86
  normalization_weight_decay: false
87
  conv_weight_decay: true
 
81
  max_tokens: 2097152000
82
  grad_acc_tokens: 32768
83
  max_grad_norm: 1.0
84
+ gradient_checkpointing: false
85
  bias_weight_decay: false
86
  normalization_weight_decay: false
87
  conv_weight_decay: true
decay_params.txt CHANGED
@@ -1,14 +1,14 @@
1
- _forward_module._fsdp_wrapped_module.model.embeddings.weight
2
- _forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight
3
- _forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight
4
- _forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight
5
- _forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight
6
- _forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight
7
- _forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight
8
- _forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight
9
- _forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight
10
- _forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight
11
- _forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight
12
- _forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight
13
- _forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight
14
  _forward_module._fsdp_wrapped_module.lm_head.weight
 
1
+ _forward_module._fsdp_wrapped_module.emb.weight
2
+ _forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight
3
+ _forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight
4
+ _forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight
5
+ _forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight
6
+ _forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight
7
+ _forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight
8
+ _forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight
9
+ _forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight
10
+ _forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight
11
+ _forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight
12
+ _forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight
13
+ _forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight
14
  _forward_module._fsdp_wrapped_module.lm_head.weight
logs/2025-10-26_21-16-14.log ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2025-10-26 21:16:14][train:372][INFO] All outputs will be saved to `/workspace/forgetting-transformer/alibi_2_4_256`
2
+ [2025-10-26 21:16:14][train:375][INFO] Configuration:
3
+ [2025-10-26 21:16:14][train:380][INFO] Configuration saved to /workspace/forgetting-transformer/alibi_2_4_256/config.yaml.
4
+ [2025-10-26 21:16:14][train:387][INFO] creating datamodule
5
+ [2025-10-26 21:16:14][train:419][INFO] creating model
6
+ [2025-10-26 21:16:15][train:440][INFO] creating optimizer
7
+ [2025-10-26 21:16:15][checkpoint:39][INFO] Not resuming. Deleting existing checkpoints...
8
+ [2025-10-26 21:16:15][logger:256][INFO] Setting up wandb logger...
9
+ [2025-10-26 21:16:15][logger:272][INFO] Not resuming. Creating a new wandb run.
10
+ [2025-10-26 21:16:16][logger:288][INFO] wandb initialized. Run id: pun8f82u
11
+ [2025-10-26 21:16:16][logger:186][INFO] Setting up jsonlines logger...
12
+ [2025-10-26 21:16:16][logger:113][INFO] Setting up npz logger...
13
+ [2025-10-26 21:16:16][logger:171][INFO] [step: 0] [train_data_info/vocab_size: 50277] [train_data_info/global_tokens_per_batch: 2097152] [train_data_info/local_tokens_per_batch: 2097152] [train_data_info/batch_len: 2048] [train_data_info/seq_len: 2048] [train_data_info/total_tokens: 2055208960] [train_data_info/global_batch_size: 1024] [train_data_info/local_batch_size: 1024]
14
+ [2025-10-26 21:16:16][logger:171][INFO] [step: 0] [val_data_info/vocab_size: 50277] [val_data_info/global_tokens_per_batch: 2048] [val_data_info/local_tokens_per_batch: 2048] [val_data_info/batch_len: 2048] [val_data_info/seq_len: 2048] [val_data_info/total_tokens: 2147483648] [val_data_info/global_batch_size: 1] [val_data_info/local_batch_size: 1]
15
+ [2025-10-26 21:16:16][logger:171][INFO] [step: 0] [model_info/total_params: 27447040] [model_info/trainable_params: 27447040] [model_info/embedding_params: 12870912] [model_info/flops_per_token: 0] [model_info/non_embedding_params: 14576128]
16
+ [2025-10-26 21:17:13][utils:57][INFO] [P: 1.00%] [S: 20971520/2097152000] [T: 0:00:57] [ETA: 1:34:15] [loss: 10.077] [tokens/s: 392645.003] [batches/s: 0.187] [MFU: 0.000] [TFLOPS: 0.000]
17
+ [2025-10-26 21:18:06][utils:57][INFO] [P: 2.00%] [S: 41943040/2097152000] [T: 0:01:50] [ETA: 1:30:15] [loss: 8.170] [tokens/s: 392713.958] [batches/s: 0.187] [MFU: 0.000] [TFLOPS: 0.000]
18
+ [2025-10-26 21:18:06][train:194][INFO] Running validation...
19
+ [2025-10-26 21:19:46][logger:171][INFO] [step: 41943040] [val/train_token_count: 41943040] [val/train_batch_count: 20] [val/train_flop_count: 0] [val/train_total_time: 110.518] [val/train_update_time: 110.195] [val/loss: 8.073] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.972] [val/val_tokens_per_second: 409716.276] [val/loss_avg_len_2048: 8.073] [val/perplexity_len_2048: 3205.650] [val/loss_avg_len_1024: 8.071] [val/perplexity_len_1024: 3201.383] [val/loss_avg_len_512: 8.072] [val/perplexity_len_512: 3203.464]
20
+ [2025-10-26 21:20:40][utils:57][INFO] [P: 3.00%] [S: 62914560/2097152000] [T: 0:04:23] [ETA: 2:22:11] [loss: 7.760] [tokens/s: 238672.825] [batches/s: 0.114] [MFU: 0.000] [TFLOPS: 0.000]
21
+ [2025-10-26 21:21:33][utils:57][INFO] [P: 4.00%] [S: 83886080/2097152000] [T: 0:05:17] [ETA: 2:06:53] [loss: 7.535] [tokens/s: 265381.208] [batches/s: 0.127] [MFU: 0.000] [TFLOPS: 0.000]
22
+ [2025-10-26 21:21:33][train:194][INFO] Running validation...
23
+ [2025-10-26 21:23:13][logger:171][INFO] [step: 83886080] [val/train_token_count: 83886080] [val/train_batch_count: 40] [val/train_flop_count: 0] [val/train_total_time: 317.249] [val/train_update_time: 216.731] [val/loss: 7.520] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.601] [val/val_tokens_per_second: 411240.277] [val/loss_avg_len_2048: 7.520] [val/perplexity_len_2048: 1844.219] [val/loss_avg_len_1024: 7.521] [val/perplexity_len_1024: 1846.058] [val/loss_avg_len_512: 7.526] [val/perplexity_len_512: 1855.284]
24
+ [2025-10-26 21:24:06][utils:57][INFO] [P: 5.00%] [S: 104857600/2097152000] [T: 0:07:50] [ETA: 2:28:54] [loss: 7.356] [tokens/s: 222818.512] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
25
+ [2025-10-26 21:24:06][logger:171][INFO] [step: 104857600] [train_eval/train_token_count: 104857600] [train_eval/train_batch_count: 50] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 470.240] [train_eval/train_update_time: 270.009] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 8.521] [train_eval/perplexity_len_2048: 5019.001] [train_eval/loss_avg_len_1024: 8.522] [train_eval/perplexity_len_1024: 5026.594] [train_eval/loss_avg_len_512: 8.524] [train_eval/perplexity_len_512: 5034.671]
26
+ [2025-10-26 21:24:59][utils:57][INFO] [P: 6.00%] [S: 125829120/2097152000] [T: 0:08:43] [ETA: 2:16:43] [loss: 7.169] [tokens/s: 240455.609] [batches/s: 0.115] [MFU: 0.000] [TFLOPS: 0.000]
27
+ [2025-10-26 21:24:59][train:194][INFO] Running validation...
28
+ [2025-10-26 21:26:39][logger:171][INFO] [step: 125829120] [val/train_token_count: 125829120] [val/train_batch_count: 60] [val/train_flop_count: 0] [val/train_total_time: 523.628] [val/train_update_time: 323.282] [val/loss: 7.165] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.442] [val/val_tokens_per_second: 411897.959] [val/loss_avg_len_2048: 7.165] [val/perplexity_len_2048: 1292.821] [val/loss_avg_len_1024: 7.167] [val/perplexity_len_1024: 1295.904] [val/loss_avg_len_512: 7.175] [val/perplexity_len_512: 1306.548]
29
+ [2025-10-26 21:27:32][utils:57][INFO] [P: 7.00%] [S: 146800640/2097152000] [T: 0:11:16] [ETA: 2:29:47] [loss: 7.043] [tokens/s: 216814.165] [batches/s: 0.103] [MFU: 0.000] [TFLOPS: 0.000]
30
+ [2025-10-26 21:28:26][utils:57][INFO] [P: 8.00%] [S: 167772160/2097152000] [T: 0:12:09] [ETA: 2:19:53] [loss: 6.880] [tokens/s: 229852.701] [batches/s: 0.110] [MFU: 0.000] [TFLOPS: 0.000]
31
+ [2025-10-26 21:28:26][train:194][INFO] Running validation...
32
+ [2025-10-26 21:30:06][logger:171][INFO] [step: 167772160] [val/train_token_count: 167772160] [val/train_batch_count: 80] [val/train_flop_count: 0] [val/train_total_time: 729.843] [val/train_update_time: 429.837] [val/loss: 6.866] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.848] [val/val_tokens_per_second: 406155.409] [val/loss_avg_len_2048: 6.866] [val/perplexity_len_2048: 959.019] [val/loss_avg_len_1024: 6.870] [val/perplexity_len_1024: 963.405] [val/loss_avg_len_512: 6.883] [val/perplexity_len_512: 975.280]
33
+ [2025-10-26 21:31:00][utils:57][INFO] [P: 9.00%] [S: 188743680/2097152000] [T: 0:14:44] [ETA: 2:28:59] [loss: 6.733] [tokens/s: 213304.064] [batches/s: 0.102] [MFU: 0.000] [TFLOPS: 0.000]
34
+ [2025-10-26 21:31:53][utils:57][INFO] [P: 10.00%] [S: 209715200/2097152000] [T: 0:15:37] [ETA: 2:20:37] [loss: 6.633] [tokens/s: 223627.540] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
35
+ [2025-10-26 21:31:53][logger:171][INFO] [step: 209715200] [train_eval/train_token_count: 209715200] [train_eval/train_batch_count: 100] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 937.465] [train_eval/train_update_time: 536.388] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.966] [train_eval/perplexity_len_2048: 1060.422] [train_eval/loss_avg_len_1024: 6.972] [train_eval/perplexity_len_1024: 1066.603] [train_eval/loss_avg_len_512: 6.982] [train_eval/perplexity_len_512: 1077.249]
36
+ [2025-10-26 21:31:53][train:194][INFO] Running validation...
37
+ [2025-10-26 21:33:33][logger:171][INFO] [step: 209715200] [val/train_token_count: 209715200] [val/train_batch_count: 100] [val/train_flop_count: 0] [val/train_total_time: 937.465] [val/train_update_time: 536.388] [val/loss: 6.622] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.013] [val/val_tokens_per_second: 409544.725] [val/loss_avg_len_2048: 6.622] [val/perplexity_len_2048: 751.358] [val/loss_avg_len_1024: 6.628] [val/perplexity_len_1024: 756.021] [val/loss_avg_len_512: 6.644] [val/perplexity_len_512: 767.964]
38
+ [2025-10-26 21:33:33][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000209715200.pt...
39
+ [2025-10-26 21:33:34][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000209715200.pt.
40
+ [2025-10-26 21:33:34][logger:171][INFO] [step: 209715200] [checkpoint/checkpoint_time: 0.443]
41
+ [2025-10-26 21:34:27][utils:57][INFO] [P: 11.00%] [S: 230686720/2097152000] [T: 0:18:11] [ETA: 2:27:09] [loss: 6.560] [tokens/s: 201797.452] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
42
+ [2025-10-26 21:35:20][utils:57][INFO] [P: 12.00%] [S: 251658240/2097152000] [T: 0:19:04] [ETA: 2:19:54] [loss: 6.428] [tokens/s: 223522.183] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
43
+ [2025-10-26 21:35:20][train:194][INFO] Running validation...
44
+ [2025-10-26 21:37:01][logger:171][INFO] [step: 251658240] [val/train_token_count: 251658240] [val/train_batch_count: 120] [val/train_flop_count: 0] [val/train_total_time: 1144.691] [val/train_update_time: 642.942] [val/loss: 6.437] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.360] [val/val_tokens_per_second: 408129.969] [val/loss_avg_len_2048: 6.437] [val/perplexity_len_2048: 624.344] [val/loss_avg_len_1024: 6.444] [val/perplexity_len_1024: 628.922] [val/loss_avg_len_512: 6.462] [val/perplexity_len_512: 640.135]
45
+ [2025-10-26 21:37:54][utils:57][INFO] [P: 13.00%] [S: 272629760/2097152000] [T: 0:21:38] [ETA: 2:24:49] [loss: 6.382] [tokens/s: 201724.230] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
46
+ [2025-10-26 21:38:48][utils:57][INFO] [P: 14.00%] [S: 293601280/2097152000] [T: 0:22:31] [ETA: 2:18:23] [loss: 6.308] [tokens/s: 223338.996] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
47
+ [2025-10-26 21:38:48][train:194][INFO] Running validation...
48
+ [2025-10-26 21:40:27][logger:171][INFO] [step: 293601280] [val/train_token_count: 293601280] [val/train_batch_count: 140] [val/train_flop_count: 0] [val/train_total_time: 1351.811] [val/train_update_time: 749.497] [val/loss: 6.287] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.596] [val/val_tokens_per_second: 411261.156] [val/loss_avg_len_2048: 6.287] [val/perplexity_len_2048: 537.394] [val/loss_avg_len_1024: 6.295] [val/perplexity_len_1024: 541.773] [val/loss_avg_len_512: 6.314] [val/perplexity_len_512: 552.194]
49
+ [2025-10-26 21:41:21][utils:57][INFO] [P: 15.00%] [S: 314572800/2097152000] [T: 0:25:04] [ETA: 2:22:07] [loss: 6.193] [tokens/s: 201726.821] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
50
+ [2025-10-26 21:41:21][logger:171][INFO] [step: 314572800] [train_eval/train_token_count: 314572800] [train_eval/train_batch_count: 150] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1504.793] [train_eval/train_update_time: 802.770] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.409] [train_eval/perplexity_len_2048: 607.101] [train_eval/loss_avg_len_1024: 6.417] [train_eval/perplexity_len_1024: 612.351] [train_eval/loss_avg_len_512: 6.435] [train_eval/perplexity_len_512: 623.146]
51
+ [2025-10-26 21:42:14][utils:57][INFO] [P: 16.00%] [S: 335544320/2097152000] [T: 0:25:58] [ETA: 2:16:20] [loss: 6.161] [tokens/s: 223306.638] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
52
+ [2025-10-26 21:42:14][train:194][INFO] Running validation...
53
+ [2025-10-26 21:43:54][logger:171][INFO] [step: 335544320] [val/train_token_count: 335544320] [val/train_batch_count: 160] [val/train_flop_count: 0] [val/train_total_time: 1558.167] [val/train_update_time: 856.038] [val/loss: 6.162] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.557] [val/val_tokens_per_second: 411423.777] [val/loss_avg_len_2048: 6.162] [val/perplexity_len_2048: 474.439] [val/loss_avg_len_1024: 6.171] [val/perplexity_len_1024: 478.526] [val/loss_avg_len_512: 6.191] [val/perplexity_len_512: 488.116]
54
+ [2025-10-26 21:44:47][utils:57][INFO] [P: 17.00%] [S: 356515840/2097152000] [T: 0:28:31] [ETA: 2:19:14] [loss: 6.076] [tokens/s: 201707.332] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
55
+ [2025-10-26 21:45:40][utils:57][INFO] [P: 18.00%] [S: 377487360/2097152000] [T: 0:29:24] [ETA: 2:13:58] [loss: 6.050] [tokens/s: 223617.598] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
56
+ [2025-10-26 21:45:40][train:194][INFO] Running validation...
57
+ [2025-10-26 21:47:20][logger:171][INFO] [step: 377487360] [val/train_token_count: 377487360] [val/train_batch_count: 180] [val/train_flop_count: 0] [val/train_total_time: 1764.496] [val/train_update_time: 962.595] [val/loss: 6.044] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.502] [val/val_tokens_per_second: 411648.137] [val/loss_avg_len_2048: 6.044] [val/perplexity_len_2048: 421.379] [val/loss_avg_len_1024: 6.053] [val/perplexity_len_1024: 425.264] [val/loss_avg_len_512: 6.074] [val/perplexity_len_512: 434.297]
58
+ [2025-10-26 21:48:13][utils:57][INFO] [P: 19.00%] [S: 398458880/2097152000] [T: 0:31:57] [ETA: 2:16:14] [loss: 6.026] [tokens/s: 201973.076] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
59
+ [2025-10-26 21:49:07][utils:57][INFO] [P: 20.00%] [S: 419430400/2097152000] [T: 0:32:50] [ETA: 2:11:23] [loss: 5.940] [tokens/s: 223854.498] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
60
+ [2025-10-26 21:49:07][logger:171][INFO] [step: 419430400] [train_eval/train_token_count: 419430400] [train_eval/train_batch_count: 200] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1970.751] [train_eval/train_update_time: 1069.133] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.078] [train_eval/perplexity_len_2048: 436.312] [train_eval/loss_avg_len_1024: 6.089] [train_eval/perplexity_len_1024: 440.880] [train_eval/loss_avg_len_512: 6.108] [train_eval/perplexity_len_512: 449.289]
61
+ [2025-10-26 21:49:07][train:194][INFO] Running validation...
62
+ [2025-10-26 21:50:46][logger:171][INFO] [step: 419430400] [val/train_token_count: 419430400] [val/train_batch_count: 200] [val/train_flop_count: 0] [val/train_total_time: 1970.751] [val/train_update_time: 1069.133] [val/loss: 5.947] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.669] [val/val_tokens_per_second: 410960.915] [val/loss_avg_len_2048: 5.947] [val/perplexity_len_2048: 382.423] [val/loss_avg_len_1024: 5.956] [val/perplexity_len_1024: 385.960] [val/loss_avg_len_512: 5.977] [val/perplexity_len_512: 394.296]
63
+ [2025-10-26 21:50:46][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000419430400.pt...
64
+ [2025-10-26 21:50:47][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000419430400.pt.
65
+ [2025-10-26 21:50:47][logger:171][INFO] [step: 419430400] [checkpoint/checkpoint_time: 0.435]
66
+ [2025-10-26 21:51:40][utils:57][INFO] [P: 21.00%] [S: 440401920/2097152000] [T: 0:35:24] [ETA: 2:13:11] [loss: 5.902] [tokens/s: 202045.969] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
67
+ [2025-10-26 21:52:33][utils:57][INFO] [P: 22.00%] [S: 461373440/2097152000] [T: 0:36:17] [ETA: 2:08:40] [loss: 5.880] [tokens/s: 223919.256] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
68
+ [2025-10-26 21:52:33][train:194][INFO] Running validation...
69
+ [2025-10-26 21:54:13][logger:171][INFO] [step: 461373440] [val/train_token_count: 461373440] [val/train_batch_count: 220] [val/train_flop_count: 0] [val/train_total_time: 2177.606] [val/train_update_time: 1175.684] [val/loss: 5.860] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.702] [val/val_tokens_per_second: 410824.908] [val/loss_avg_len_2048: 5.860] [val/perplexity_len_2048: 350.594] [val/loss_avg_len_1024: 5.869] [val/perplexity_len_1024: 354.067] [val/loss_avg_len_512: 5.892] [val/perplexity_len_512: 362.086]
70
+ [2025-10-26 21:55:06][utils:57][INFO] [P: 23.00%] [S: 482344960/2097152000] [T: 0:38:50] [ETA: 2:10:02] [loss: 5.811] [tokens/s: 202177.051] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
71
+ [2025-10-26 21:56:00][utils:57][INFO] [P: 24.00%] [S: 503316480/2097152000] [T: 0:39:44] [ETA: 2:05:49] [loss: 5.756] [tokens/s: 223897.508] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
72
+ [2025-10-26 21:56:00][train:194][INFO] Running validation...
73
+ [2025-10-26 21:57:39][logger:171][INFO] [step: 503316480] [val/train_token_count: 503316480] [val/train_batch_count: 240] [val/train_flop_count: 0] [val/train_total_time: 2384.052] [val/train_update_time: 1282.224] [val/loss: 5.783] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.661] [val/val_tokens_per_second: 410995.065] [val/loss_avg_len_2048: 5.783] [val/perplexity_len_2048: 324.736] [val/loss_avg_len_1024: 5.794] [val/perplexity_len_1024: 328.183] [val/loss_avg_len_512: 5.817] [val/perplexity_len_512: 335.952]
74
+ [2025-10-26 21:58:33][utils:57][INFO] [P: 25.00%] [S: 524288000/2097152000] [T: 0:42:17] [ETA: 2:06:51] [loss: 5.763] [tokens/s: 202168.583] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
75
+ [2025-10-26 21:58:33][logger:171][INFO] [step: 524288000] [train_eval/train_token_count: 524288000] [train_eval/train_batch_count: 250] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2537.099] [train_eval/train_update_time: 1335.492] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.844] [train_eval/perplexity_len_2048: 345.304] [train_eval/loss_avg_len_1024: 5.854] [train_eval/perplexity_len_1024: 348.747] [train_eval/loss_avg_len_512: 5.874] [train_eval/perplexity_len_512: 355.783]
76
+ [2025-10-26 21:59:26][utils:57][INFO] [P: 26.00%] [S: 545259520/2097152000] [T: 0:43:10] [ETA: 2:02:52] [loss: 5.718] [tokens/s: 223871.889] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
77
+ [2025-10-26 21:59:26][train:194][INFO] Running validation...
78
+ [2025-10-26 22:01:06][logger:171][INFO] [step: 545259520] [val/train_token_count: 545259520] [val/train_batch_count: 260] [val/train_flop_count: 0] [val/train_total_time: 2590.474] [val/train_update_time: 1388.760] [val/loss: 5.716] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.564] [val/val_tokens_per_second: 411393.353] [val/loss_avg_len_2048: 5.716] [val/perplexity_len_2048: 303.822] [val/loss_avg_len_1024: 5.727] [val/perplexity_len_1024: 307.171] [val/loss_avg_len_512: 5.751] [val/perplexity_len_512: 314.628]
79
+ [2025-10-26 22:01:59][utils:57][INFO] [P: 27.00%] [S: 566231040/2097152000] [T: 0:45:43] [ETA: 2:03:37] [loss: 5.693] [tokens/s: 202168.506] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
80
+ [2025-10-26 22:02:53][utils:57][INFO] [P: 28.00%] [S: 587202560/2097152000] [T: 0:46:36] [ETA: 1:59:51] [loss: 5.643] [tokens/s: 223861.520] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
81
+ [2025-10-26 22:02:53][train:194][INFO] Running validation...
82
+ [2025-10-26 22:04:32][logger:171][INFO] [step: 587202560] [val/train_token_count: 587202560] [val/train_batch_count: 280] [val/train_flop_count: 0] [val/train_total_time: 2796.791] [val/train_update_time: 1495.303] [val/loss: 5.650] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.505] [val/val_tokens_per_second: 411637.736] [val/loss_avg_len_2048: 5.650] [val/perplexity_len_2048: 284.241] [val/loss_avg_len_1024: 5.661] [val/perplexity_len_1024: 287.545] [val/loss_avg_len_512: 5.686] [val/perplexity_len_512: 294.724]
83
+ [2025-10-26 22:05:25][utils:57][INFO] [P: 29.00%] [S: 608174080/2097152000] [T: 0:49:09] [ETA: 2:00:21] [loss: 5.611] [tokens/s: 202167.574] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
84
+ [2025-10-26 22:06:19][utils:57][INFO] [P: 30.00%] [S: 629145600/2097152000] [T: 0:50:03] [ETA: 1:56:47] [loss: 5.585] [tokens/s: 224004.908] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
85
+ [2025-10-26 22:06:19][logger:171][INFO] [step: 629145600] [train_eval/train_token_count: 629145600] [train_eval/train_batch_count: 300] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3003.058] [train_eval/train_update_time: 1601.853] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.678] [train_eval/perplexity_len_2048: 292.453] [train_eval/loss_avg_len_1024: 5.688] [train_eval/perplexity_len_1024: 295.384] [train_eval/loss_avg_len_512: 5.709] [train_eval/perplexity_len_512: 301.718]
86
+ [2025-10-26 22:06:19][train:194][INFO] Running validation...
87
+ [2025-10-26 22:07:58][logger:171][INFO] [step: 629145600] [val/train_token_count: 629145600] [val/train_batch_count: 300] [val/train_flop_count: 0] [val/train_total_time: 3003.058] [val/train_update_time: 1601.853] [val/loss: 5.596] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.440] [val/val_tokens_per_second: 411904.901] [val/loss_avg_len_2048: 5.596] [val/perplexity_len_2048: 269.218] [val/loss_avg_len_1024: 5.607] [val/perplexity_len_1024: 272.364] [val/loss_avg_len_512: 5.632] [val/perplexity_len_512: 279.226]
88
+ [2025-10-26 22:07:58][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000629145600.pt...
89
+ [2025-10-26 22:07:59][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000629145600.pt.
90
+ [2025-10-26 22:07:59][logger:171][INFO] [step: 629145600] [checkpoint/checkpoint_time: 0.449]
91
+ [2025-10-26 22:08:52][utils:57][INFO] [P: 31.00%] [S: 650117120/2097152000] [T: 0:52:36] [ETA: 1:57:05] [loss: 5.594] [tokens/s: 202209.727] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
92
+ [2025-10-26 22:09:45][utils:57][INFO] [P: 32.00%] [S: 671088640/2097152000] [T: 0:53:29] [ETA: 1:53:40] [loss: 5.528] [tokens/s: 223957.251] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
93
+ [2025-10-26 22:09:45][train:194][INFO] Running validation...
94
+ [2025-10-26 22:11:25][logger:171][INFO] [step: 671088640] [val/train_token_count: 671088640] [val/train_batch_count: 320] [val/train_flop_count: 0] [val/train_total_time: 3209.704] [val/train_update_time: 1708.403] [val/loss: 5.543] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.426] [val/val_tokens_per_second: 411964.367] [val/loss_avg_len_2048: 5.543] [val/perplexity_len_2048: 255.348] [val/loss_avg_len_1024: 5.554] [val/perplexity_len_1024: 258.396] [val/loss_avg_len_512: 5.580] [val/perplexity_len_512: 264.968]
95
+ [2025-10-26 22:12:18][utils:57][INFO] [P: 33.00%] [S: 692060160/2097152000] [T: 0:56:02] [ETA: 1:53:46] [loss: 5.541] [tokens/s: 202263.397] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
96
+ [2025-10-26 22:13:12][utils:57][INFO] [P: 34.00%] [S: 713031680/2097152000] [T: 0:56:55] [ETA: 1:50:30] [loss: 5.502] [tokens/s: 224011.145] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
97
+ [2025-10-26 22:13:12][train:194][INFO] Running validation...
98
+ [2025-10-26 22:14:53][logger:171][INFO] [step: 713031680] [val/train_token_count: 713031680] [val/train_batch_count: 340] [val/train_flop_count: 0] [val/train_total_time: 3415.885] [val/train_update_time: 1814.959] [val/loss: 5.496] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.346] [val/val_tokens_per_second: 404158.233] [val/loss_avg_len_2048: 5.496] [val/perplexity_len_2048: 243.690] [val/loss_avg_len_1024: 5.508] [val/perplexity_len_1024: 246.707] [val/loss_avg_len_512: 5.534] [val/perplexity_len_512: 253.158]
99
+ [2025-10-26 22:15:46][utils:57][INFO] [P: 35.00%] [S: 734003200/2097152000] [T: 0:59:30] [ETA: 1:50:31] [loss: 5.483] [tokens/s: 201930.682] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
100
+ [2025-10-26 22:15:46][logger:171][INFO] [step: 734003200] [train_eval/train_token_count: 734003200] [train_eval/train_batch_count: 350] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3570.609] [train_eval/train_update_time: 1868.230] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.536] [train_eval/perplexity_len_2048: 253.699] [train_eval/loss_avg_len_1024: 5.548] [train_eval/perplexity_len_1024: 256.749] [train_eval/loss_avg_len_512: 5.571] [train_eval/perplexity_len_512: 262.706]
101
+ [2025-10-26 22:16:40][utils:57][INFO] [P: 36.00%] [S: 754974720/2097152000] [T: 1:00:23] [ETA: 1:47:22] [loss: 5.410] [tokens/s: 223584.636] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
102
+ [2025-10-26 22:16:40][train:194][INFO] Running validation...
103
+ [2025-10-26 22:18:21][logger:171][INFO] [step: 754974720] [val/train_token_count: 754974720] [val/train_batch_count: 360] [val/train_flop_count: 0] [val/train_total_time: 3623.981] [val/train_update_time: 1921.499] [val/loss: 5.454] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.680] [val/val_tokens_per_second: 402830.694] [val/loss_avg_len_2048: 5.454] [val/perplexity_len_2048: 233.736] [val/loss_avg_len_1024: 5.467] [val/perplexity_len_1024: 236.697] [val/loss_avg_len_512: 5.493] [val/perplexity_len_512: 242.938]
104
+ [2025-10-26 22:19:15][utils:57][INFO] [P: 37.00%] [S: 775946240/2097152000] [T: 1:02:59] [ETA: 1:47:14] [loss: 5.437] [tokens/s: 201513.247] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
105
+ [2025-10-26 22:20:08][utils:57][INFO] [P: 38.00%] [S: 796917760/2097152000] [T: 1:03:52] [ETA: 1:44:12] [loss: 5.413] [tokens/s: 223056.690] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
106
+ [2025-10-26 22:20:08][train:194][INFO] Running validation...
107
+ [2025-10-26 22:21:49][logger:171][INFO] [step: 796917760] [val/train_token_count: 796917760] [val/train_batch_count: 380] [val/train_flop_count: 0] [val/train_total_time: 3832.435] [val/train_update_time: 2028.044] [val/loss: 5.414] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.824] [val/val_tokens_per_second: 406251.122] [val/loss_avg_len_2048: 5.414] [val/perplexity_len_2048: 224.640] [val/loss_avg_len_1024: 5.427] [val/perplexity_len_1024: 227.492] [val/loss_avg_len_512: 5.453] [val/perplexity_len_512: 233.492]
108
+ [2025-10-26 22:22:42][utils:57][INFO] [P: 39.00%] [S: 817889280/2097152000] [T: 1:06:26] [ETA: 1:43:55] [loss: 5.414] [tokens/s: 201250.400] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
109
+ [2025-10-26 22:23:36][utils:57][INFO] [P: 40.00%] [S: 838860800/2097152000] [T: 1:07:20] [ETA: 1:41:00] [loss: 5.335] [tokens/s: 222827.518] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
110
+ [2025-10-26 22:23:36][logger:171][INFO] [step: 838860800] [train_eval/train_token_count: 838860800] [train_eval/train_batch_count: 400] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4040.047] [train_eval/train_update_time: 2134.612] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.429] [train_eval/perplexity_len_2048: 227.924] [train_eval/loss_avg_len_1024: 5.441] [train_eval/perplexity_len_1024: 230.568] [train_eval/loss_avg_len_512: 5.464] [train_eval/perplexity_len_512: 235.990]
111
+ [2025-10-26 22:23:36][train:194][INFO] Running validation...
112
+ [2025-10-26 22:25:15][logger:171][INFO] [step: 838860800] [val/train_token_count: 838860800] [val/train_batch_count: 400] [val/train_flop_count: 0] [val/train_total_time: 4040.047] [val/train_update_time: 2134.612] [val/loss: 5.380] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.629] [val/val_tokens_per_second: 411125.218] [val/loss_avg_len_2048: 5.380] [val/perplexity_len_2048: 217.034] [val/loss_avg_len_1024: 5.393] [val/perplexity_len_1024: 219.868] [val/loss_avg_len_512: 5.419] [val/perplexity_len_512: 225.741]
113
+ [2025-10-26 22:25:15][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000838860800.pt...
114
+ [2025-10-26 22:25:16][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000838860800.pt.
115
+ [2025-10-26 22:25:16][logger:171][INFO] [step: 838860800] [checkpoint/checkpoint_time: 0.453]
116
+ [2025-10-26 22:26:09][utils:57][INFO] [P: 41.00%] [S: 859832320/2097152000] [T: 1:09:53] [ETA: 1:40:34] [loss: 5.345] [tokens/s: 201209.907] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
117
+ [2025-10-26 22:27:03][utils:57][INFO] [P: 42.00%] [S: 880803840/2097152000] [T: 1:10:46] [ETA: 1:37:44] [loss: 5.346] [tokens/s: 222662.081] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
118
+ [2025-10-26 22:27:03][train:194][INFO] Running validation...
119
+ [2025-10-26 22:28:43][logger:171][INFO] [step: 880803840] [val/train_token_count: 880803840] [val/train_batch_count: 420] [val/train_flop_count: 0] [val/train_total_time: 4246.918] [val/train_update_time: 2241.157] [val/loss: 5.350] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.115] [val/val_tokens_per_second: 409127.671] [val/loss_avg_len_2048: 5.350] [val/perplexity_len_2048: 210.547] [val/loss_avg_len_1024: 5.363] [val/perplexity_len_1024: 213.278] [val/loss_avg_len_512: 5.389] [val/perplexity_len_512: 218.952]
120
+ [2025-10-26 22:29:36][utils:57][INFO] [P: 43.00%] [S: 901775360/2097152000] [T: 1:13:20] [ETA: 1:37:13] [loss: 5.333] [tokens/s: 201068.550] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
121
+ [2025-10-26 22:30:30][utils:57][INFO] [P: 44.00%] [S: 922746880/2097152000] [T: 1:14:13] [ETA: 1:34:28] [loss: 5.353] [tokens/s: 222950.672] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
122
+ [2025-10-26 22:30:30][train:194][INFO] Running validation...
123
+ [2025-10-26 22:32:11][logger:171][INFO] [step: 922746880] [val/train_token_count: 922746880] [val/train_batch_count: 440] [val/train_flop_count: 0] [val/train_total_time: 4453.812] [val/train_update_time: 2347.708] [val/loss: 5.318] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.091] [val/val_tokens_per_second: 405178.307] [val/loss_avg_len_2048: 5.318] [val/perplexity_len_2048: 203.957] [val/loss_avg_len_1024: 5.331] [val/perplexity_len_1024: 206.669] [val/loss_avg_len_512: 5.358] [val/perplexity_len_512: 212.257]
124
+ [2025-10-26 22:33:04][utils:57][INFO] [P: 45.00%] [S: 943718400/2097152000] [T: 1:16:48] [ETA: 1:33:52] [loss: 5.297] [tokens/s: 201114.718] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
125
+ [2025-10-26 22:33:04][logger:171][INFO] [step: 943718400] [train_eval/train_token_count: 943718400] [train_eval/train_batch_count: 450] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4608.292] [train_eval/train_update_time: 2400.978] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.341] [train_eval/perplexity_len_2048: 208.800] [train_eval/loss_avg_len_1024: 5.355] [train_eval/perplexity_len_1024: 211.619] [train_eval/loss_avg_len_512: 5.380] [train_eval/perplexity_len_512: 217.063]
126
+ [2025-10-26 22:33:57][utils:57][INFO] [P: 46.00%] [S: 964689920/2097152000] [T: 1:17:41] [ETA: 1:31:12] [loss: 5.275] [tokens/s: 223089.747] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
127
+ [2025-10-26 22:33:57][train:194][INFO] Running validation...
128
+ [2025-10-26 22:35:39][logger:171][INFO] [step: 964689920] [val/train_token_count: 964689920] [val/train_batch_count: 460] [val/train_flop_count: 0] [val/train_total_time: 4661.669] [val/train_update_time: 2454.251] [val/loss: 5.292] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.327] [val/val_tokens_per_second: 404234.005] [val/loss_avg_len_2048: 5.292] [val/perplexity_len_2048: 198.670] [val/loss_avg_len_1024: 5.305] [val/perplexity_len_1024: 201.322] [val/loss_avg_len_512: 5.332] [val/perplexity_len_512: 206.791]
129
+ [2025-10-26 22:36:32][utils:57][INFO] [P: 47.00%] [S: 985661440/2097152000] [T: 1:20:16] [ETA: 1:30:31] [loss: 5.291] [tokens/s: 201181.011] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
130
+ [2025-10-26 22:37:26][utils:57][INFO] [P: 48.00%] [S: 1006632960/2097152000] [T: 1:21:09] [ETA: 1:27:55] [loss: 5.264] [tokens/s: 222967.653] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
131
+ [2025-10-26 22:37:26][train:194][INFO] Running validation...
132
+ [2025-10-26 22:39:07][logger:171][INFO] [step: 1006632960] [val/train_token_count: 1006632960] [val/train_batch_count: 480] [val/train_flop_count: 0] [val/train_total_time: 4869.772] [val/train_update_time: 2560.803] [val/loss: 5.267] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.225] [val/val_tokens_per_second: 404642.453] [val/loss_avg_len_2048: 5.267] [val/perplexity_len_2048: 193.870] [val/loss_avg_len_1024: 5.281] [val/perplexity_len_1024: 196.476] [val/loss_avg_len_512: 5.308] [val/perplexity_len_512: 201.870]
133
+ [2025-10-26 22:40:00][utils:57][INFO] [P: 49.00%] [S: 1027604480/2097152000] [T: 1:23:44] [ETA: 1:27:09] [loss: 5.269] [tokens/s: 201109.121] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
134
+ [2025-10-26 22:40:54][utils:57][INFO] [P: 50.00%] [S: 1048576000/2097152000] [T: 1:24:37] [ETA: 1:24:37] [loss: 5.245] [tokens/s: 222703.660] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
135
+ [2025-10-26 22:40:54][logger:171][INFO] [step: 1048576000] [train_eval/train_token_count: 1048576000] [train_eval/train_batch_count: 500] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5077.749] [train_eval/train_update_time: 2667.341] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.274] [train_eval/perplexity_len_2048: 195.220] [train_eval/loss_avg_len_1024: 5.287] [train_eval/perplexity_len_1024: 197.736] [train_eval/loss_avg_len_512: 5.313] [train_eval/perplexity_len_512: 203.031]
136
+ [2025-10-26 22:40:54][train:194][INFO] Running validation...
137
+ [2025-10-26 22:42:35][logger:171][INFO] [step: 1048576000] [val/train_token_count: 1048576000] [val/train_batch_count: 500] [val/train_flop_count: 0] [val/train_total_time: 5077.749] [val/train_update_time: 2667.341] [val/loss: 5.244] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.128] [val/val_tokens_per_second: 405031.633] [val/loss_avg_len_2048: 5.244] [val/perplexity_len_2048: 189.484] [val/loss_avg_len_1024: 5.258] [val/perplexity_len_1024: 192.096] [val/loss_avg_len_512: 5.286] [val/perplexity_len_512: 197.462]
138
+ [2025-10-26 22:42:35][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001048576000.pt...
139
+ [2025-10-26 22:42:35][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001048576000.pt.
140
+ [2025-10-26 22:42:35][logger:171][INFO] [step: 1048576000] [checkpoint/checkpoint_time: 0.443]
141
+ [2025-10-26 22:43:28][utils:57][INFO] [P: 51.00%] [S: 1069547520/2097152000] [T: 1:27:12] [ETA: 1:23:47] [loss: 5.242] [tokens/s: 200822.919] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
142
+ [2025-10-26 22:44:22][utils:57][INFO] [P: 52.00%] [S: 1090519040/2097152000] [T: 1:28:06] [ETA: 1:21:19] [loss: 5.235] [tokens/s: 222358.851] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
143
+ [2025-10-26 22:44:22][train:194][INFO] Running validation...
144
+ [2025-10-26 22:46:04][logger:171][INFO] [step: 1090519040] [val/train_token_count: 1090519040] [val/train_batch_count: 520] [val/train_flop_count: 0] [val/train_total_time: 5286.093] [val/train_update_time: 2773.886] [val/loss: 5.224] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.726] [val/val_tokens_per_second: 402649.191] [val/loss_avg_len_2048: 5.224] [val/perplexity_len_2048: 185.696] [val/loss_avg_len_1024: 5.238] [val/perplexity_len_1024: 188.252] [val/loss_avg_len_512: 5.265] [val/perplexity_len_512: 193.484]
145
+ [2025-10-26 22:46:57][utils:57][INFO] [P: 53.00%] [S: 1111490560/2097152000] [T: 1:30:41] [ETA: 1:20:25] [loss: 5.193] [tokens/s: 200512.953] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
146
+ [2025-10-26 22:47:50][utils:57][INFO] [P: 54.00%] [S: 1132462080/2097152000] [T: 1:31:34] [ETA: 1:18:00] [loss: 5.209] [tokens/s: 222204.047] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
147
+ [2025-10-26 22:47:50][train:194][INFO] Running validation...
148
+ [2025-10-26 22:49:32][logger:171][INFO] [step: 1132462080] [val/train_token_count: 1132462080] [val/train_batch_count: 540] [val/train_flop_count: 0] [val/train_total_time: 5494.619] [val/train_update_time: 2880.433] [val/loss: 5.204] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.620] [val/val_tokens_per_second: 403068.537] [val/loss_avg_len_2048: 5.204] [val/perplexity_len_2048: 182.063] [val/loss_avg_len_1024: 5.218] [val/perplexity_len_1024: 184.597] [val/loss_avg_len_512: 5.246] [val/perplexity_len_512: 189.768]
149
+ [2025-10-26 22:50:25][utils:57][INFO] [P: 55.00%] [S: 1153433600/2097152000] [T: 1:34:09] [ETA: 1:17:02] [loss: 5.156] [tokens/s: 200408.069] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
150
+ [2025-10-26 22:50:25][logger:171][INFO] [step: 1153433600] [train_eval/train_token_count: 1153433600] [train_eval/train_batch_count: 550] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5649.622] [train_eval/train_update_time: 2933.706] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.217] [train_eval/perplexity_len_2048: 184.366] [train_eval/loss_avg_len_1024: 5.227] [train_eval/perplexity_len_1024: 186.300] [train_eval/loss_avg_len_512: 5.251] [train_eval/perplexity_len_512: 190.837]
151
+ [2025-10-26 22:51:19][utils:57][INFO] [P: 56.00%] [S: 1174405120/2097152000] [T: 1:35:03] [ETA: 1:14:40] [loss: 5.195] [tokens/s: 222130.660] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
152
+ [2025-10-26 22:51:19][train:194][INFO] Running validation...
153
+ [2025-10-26 22:52:59][logger:171][INFO] [step: 1174405120] [val/train_token_count: 1174405120] [val/train_batch_count: 560] [val/train_flop_count: 0] [val/train_total_time: 5703.017] [val/train_update_time: 2986.992] [val/loss: 5.188] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.531] [val/val_tokens_per_second: 407436.220] [val/loss_avg_len_2048: 5.188] [val/perplexity_len_2048: 179.085] [val/loss_avg_len_1024: 5.202] [val/perplexity_len_1024: 181.607] [val/loss_avg_len_512: 5.230] [val/perplexity_len_512: 186.735]
154
+ [2025-10-26 22:53:53][utils:57][INFO] [P: 57.00%] [S: 1195376640/2097152000] [T: 1:37:36] [ETA: 1:13:38] [loss: 5.166] [tokens/s: 200559.622] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
155
+ [2025-10-26 22:54:46][utils:57][INFO] [P: 58.00%] [S: 1216348160/2097152000] [T: 1:38:30] [ETA: 1:11:19] [loss: 5.197] [tokens/s: 222297.302] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
156
+ [2025-10-26 22:54:46][train:194][INFO] Running validation...
157
+ [2025-10-26 22:56:26][logger:171][INFO] [step: 1216348160] [val/train_token_count: 1216348160] [val/train_batch_count: 580] [val/train_flop_count: 0] [val/train_total_time: 5910.319] [val/train_update_time: 3093.534] [val/loss: 5.172] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.040] [val/val_tokens_per_second: 409434.268] [val/loss_avg_len_2048: 5.172] [val/perplexity_len_2048: 176.189] [val/loss_avg_len_1024: 5.186] [val/perplexity_len_1024: 178.683] [val/loss_avg_len_512: 5.214] [val/perplexity_len_512: 183.758]
158
+ [2025-10-26 22:57:20][utils:57][INFO] [P: 59.00%] [S: 1237319680/2097152000] [T: 1:41:03] [ETA: 1:10:13] [loss: 5.171] [tokens/s: 200785.912] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
159
+ [2025-10-26 22:58:13][utils:57][INFO] [P: 60.00%] [S: 1258291200/2097152000] [T: 1:41:57] [ETA: 1:07:58] [loss: 5.175] [tokens/s: 222658.190] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
160
+ [2025-10-26 22:58:13][logger:171][INFO] [step: 1258291200] [train_eval/train_token_count: 1258291200] [train_eval/train_batch_count: 600] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6117.131] [train_eval/train_update_time: 3200.081] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.172] [train_eval/perplexity_len_2048: 176.260] [train_eval/loss_avg_len_1024: 5.182] [train_eval/perplexity_len_1024: 178.088] [train_eval/loss_avg_len_512: 5.207] [train_eval/perplexity_len_512: 182.607]
161
+ [2025-10-26 22:58:13][train:194][INFO] Running validation...
162
+ [2025-10-26 22:59:53][logger:171][INFO] [step: 1258291200] [val/train_token_count: 1258291200] [val/train_batch_count: 600] [val/train_flop_count: 0] [val/train_total_time: 6117.131] [val/train_update_time: 3200.081] [val/loss: 5.157] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.893] [val/val_tokens_per_second: 410039.290] [val/loss_avg_len_2048: 5.157] [val/perplexity_len_2048: 173.723] [val/loss_avg_len_1024: 5.172] [val/perplexity_len_1024: 176.211] [val/loss_avg_len_512: 5.200] [val/perplexity_len_512: 181.238]
163
+ [2025-10-26 22:59:53][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001258291200.pt...
164
+ [2025-10-26 22:59:53][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001258291200.pt.
165
+ [2025-10-26 22:59:53][logger:171][INFO] [step: 1258291200] [checkpoint/checkpoint_time: 0.449]
166
+ [2025-10-26 23:00:47][utils:57][INFO] [P: 61.00%] [S: 1279262720/2097152000] [T: 1:44:30] [ETA: 1:06:49] [loss: 5.161] [tokens/s: 201017.630] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
167
+ [2025-10-26 23:01:40][utils:57][INFO] [P: 62.00%] [S: 1300234240/2097152000] [T: 1:45:24] [ETA: 1:04:36] [loss: 5.136] [tokens/s: 222982.480] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
168
+ [2025-10-26 23:01:40][train:194][INFO] Running validation...
169
+ [2025-10-26 23:03:20][logger:171][INFO] [step: 1300234240] [val/train_token_count: 1300234240] [val/train_batch_count: 620] [val/train_flop_count: 0] [val/train_total_time: 6324.271] [val/train_update_time: 3306.647] [val/loss: 5.144] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.145] [val/val_tokens_per_second: 409008.036] [val/loss_avg_len_2048: 5.144] [val/perplexity_len_2048: 171.381] [val/loss_avg_len_1024: 5.158] [val/perplexity_len_1024: 173.834] [val/loss_avg_len_512: 5.186] [val/perplexity_len_512: 178.815]
170
+ [2025-10-26 23:04:14][utils:57][INFO] [P: 63.00%] [S: 1321205760/2097152000] [T: 1:47:57] [ETA: 1:03:24] [loss: 5.146] [tokens/s: 201329.619] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
171
+ [2025-10-26 23:05:07][utils:57][INFO] [P: 64.00%] [S: 1342177280/2097152000] [T: 1:48:51] [ETA: 1:01:13] [loss: 5.153] [tokens/s: 223342.819] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
172
+ [2025-10-26 23:05:07][train:194][INFO] Running validation...
173
+ [2025-10-26 23:06:47][logger:171][INFO] [step: 1342177280] [val/train_token_count: 1342177280] [val/train_batch_count: 640] [val/train_flop_count: 0] [val/train_total_time: 6531.186] [val/train_update_time: 3413.191] [val/loss: 5.131] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.948] [val/val_tokens_per_second: 409812.777] [val/loss_avg_len_2048: 5.131] [val/perplexity_len_2048: 169.188] [val/loss_avg_len_1024: 5.145] [val/perplexity_len_1024: 171.645] [val/loss_avg_len_512: 5.174] [val/perplexity_len_512: 176.611]
174
+ [2025-10-26 23:07:40][utils:57][INFO] [P: 65.00%] [S: 1363148800/2097152000] [T: 1:51:24] [ETA: 0:59:59] [loss: 5.122] [tokens/s: 201658.399] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
175
+ [2025-10-26 23:07:40][logger:171][INFO] [step: 1363148800] [train_eval/train_token_count: 1363148800] [train_eval/train_batch_count: 650] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6684.524] [train_eval/train_update_time: 3466.462] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.145] [train_eval/perplexity_len_2048: 171.577] [train_eval/loss_avg_len_1024: 5.160] [train_eval/perplexity_len_1024: 174.117] [train_eval/loss_avg_len_512: 5.185] [train_eval/perplexity_len_512: 178.545]
176
+ [2025-10-26 23:08:34][utils:57][INFO] [P: 66.00%] [S: 1384120320/2097152000] [T: 1:52:17] [ETA: 0:57:51] [loss: 5.144] [tokens/s: 223482.439] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
177
+ [2025-10-26 23:08:34][train:194][INFO] Running validation...
178
+ [2025-10-26 23:10:13][logger:171][INFO] [step: 1384120320] [val/train_token_count: 1384120320] [val/train_batch_count: 660] [val/train_flop_count: 0] [val/train_total_time: 6737.915] [val/train_update_time: 3519.732] [val/loss: 5.120] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.735] [val/val_tokens_per_second: 410687.485] [val/loss_avg_len_2048: 5.120] [val/perplexity_len_2048: 167.401] [val/loss_avg_len_1024: 5.135] [val/perplexity_len_1024: 169.836] [val/loss_avg_len_512: 5.163] [val/perplexity_len_512: 174.755]
179
+ [2025-10-26 23:11:07][utils:57][INFO] [P: 67.00%] [S: 1405091840/2097152000] [T: 1:54:51] [ETA: 0:56:34] [loss: 5.096] [tokens/s: 201810.947] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
180
+ [2025-10-26 23:12:00][utils:57][INFO] [P: 68.00%] [S: 1426063360/2097152000] [T: 1:55:44] [ETA: 0:54:27] [loss: 5.104] [tokens/s: 223547.884] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
181
+ [2025-10-26 23:12:00][train:194][INFO] Running validation...
182
+ [2025-10-26 23:13:40][logger:171][INFO] [step: 1426063360] [val/train_token_count: 1426063360] [val/train_batch_count: 680] [val/train_flop_count: 0] [val/train_total_time: 6944.454] [val/train_update_time: 3626.288] [val/loss: 5.111] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.924] [val/val_tokens_per_second: 409912.887] [val/loss_avg_len_2048: 5.111] [val/perplexity_len_2048: 165.787] [val/loss_avg_len_1024: 5.125] [val/perplexity_len_1024: 168.201] [val/loss_avg_len_512: 5.154] [val/perplexity_len_512: 173.115]
183
+ [2025-10-26 23:14:34][utils:57][INFO] [P: 69.00%] [S: 1447034880/2097152000] [T: 1:58:17] [ETA: 0:53:08] [loss: 5.114] [tokens/s: 201831.092] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
184
+ [2025-10-26 23:15:27][utils:57][INFO] [P: 70.00%] [S: 1468006400/2097152000] [T: 1:59:11] [ETA: 0:51:04] [loss: 5.101] [tokens/s: 223651.229] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
185
+ [2025-10-26 23:15:27][logger:171][INFO] [step: 1468006400] [train_eval/train_token_count: 1468006400] [train_eval/train_batch_count: 700] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7151.144] [train_eval/train_update_time: 3732.833] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.112] [train_eval/perplexity_len_2048: 166.072] [train_eval/loss_avg_len_1024: 5.126] [train_eval/perplexity_len_1024: 168.424] [train_eval/loss_avg_len_512: 5.155] [train_eval/perplexity_len_512: 173.216]
186
+ [2025-10-26 23:15:27][train:194][INFO] Running validation...
187
+ [2025-10-26 23:17:07][logger:171][INFO] [step: 1468006400] [val/train_token_count: 1468006400] [val/train_batch_count: 700] [val/train_flop_count: 0] [val/train_total_time: 7151.144] [val/train_update_time: 3732.833] [val/loss: 5.101] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.963] [val/val_tokens_per_second: 409749.591] [val/loss_avg_len_2048: 5.101] [val/perplexity_len_2048: 164.254] [val/loss_avg_len_1024: 5.116] [val/perplexity_len_1024: 166.669] [val/loss_avg_len_512: 5.145] [val/perplexity_len_512: 171.560]
188
+ [2025-10-26 23:17:07][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001468006400.pt...
189
+ [2025-10-26 23:17:07][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001468006400.pt.
190
+ [2025-10-26 23:17:07][logger:171][INFO] [step: 1468006400] [checkpoint/checkpoint_time: 0.452]
191
+ [2025-10-26 23:18:01][utils:57][INFO] [P: 71.00%] [S: 1488977920/2097152000] [T: 2:01:44] [ETA: 0:49:43] [loss: 5.092] [tokens/s: 201822.272] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
192
+ [2025-10-26 23:18:54][utils:57][INFO] [P: 72.00%] [S: 1509949440/2097152000] [T: 2:02:38] [ETA: 0:47:41] [loss: 5.101] [tokens/s: 223589.767] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
193
+ [2025-10-26 23:18:54][train:194][INFO] Running validation...
194
+ [2025-10-26 23:20:34][logger:171][INFO] [step: 1509949440] [val/train_token_count: 1509949440] [val/train_batch_count: 720] [val/train_flop_count: 0] [val/train_total_time: 7358.337] [val/train_update_time: 3839.386] [val/loss: 5.093] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.891] [val/val_tokens_per_second: 410045.618] [val/loss_avg_len_2048: 5.093] [val/perplexity_len_2048: 162.938] [val/loss_avg_len_1024: 5.108] [val/perplexity_len_1024: 165.350] [val/loss_avg_len_512: 5.137] [val/perplexity_len_512: 170.232]
195
+ [2025-10-26 23:21:27][utils:57][INFO] [P: 73.00%] [S: 1530920960/2097152000] [T: 2:05:11] [ETA: 0:46:18] [loss: 5.109] [tokens/s: 201870.189] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
196
+ [2025-10-26 23:22:21][utils:57][INFO] [P: 74.00%] [S: 1551892480/2097152000] [T: 2:06:05] [ETA: 0:44:17] [loss: 5.105] [tokens/s: 223595.012] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
197
+ [2025-10-26 23:22:21][train:194][INFO] Running validation...
198
+ [2025-10-26 23:24:01][logger:171][INFO] [step: 1551892480] [val/train_token_count: 1551892480] [val/train_batch_count: 740] [val/train_flop_count: 0] [val/train_total_time: 7565.031] [val/train_update_time: 3945.941] [val/loss: 5.086] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.777] [val/val_tokens_per_second: 410516.517] [val/loss_avg_len_2048: 5.086] [val/perplexity_len_2048: 161.804] [val/loss_avg_len_1024: 5.101] [val/perplexity_len_1024: 164.202] [val/loss_avg_len_512: 5.130] [val/perplexity_len_512: 169.052]
199
+ [2025-10-26 23:24:54][utils:57][INFO] [P: 75.00%] [S: 1572864000/2097152000] [T: 2:08:38] [ETA: 0:42:52] [loss: 5.097] [tokens/s: 201891.999] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
200
+ [2025-10-26 23:24:54][logger:171][INFO] [step: 1572864000] [train_eval/train_token_count: 1572864000] [train_eval/train_batch_count: 750] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7718.234] [train_eval/train_update_time: 3999.225] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.092] [train_eval/perplexity_len_2048: 162.718] [train_eval/loss_avg_len_1024: 5.107] [train_eval/perplexity_len_1024: 165.165] [train_eval/loss_avg_len_512: 5.134] [train_eval/perplexity_len_512: 169.733]
201
+ [2025-10-26 23:25:47][utils:57][INFO] [P: 76.00%] [S: 1593835520/2097152000] [T: 2:09:31] [ETA: 0:40:54] [loss: 5.053] [tokens/s: 223573.837] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
202
+ [2025-10-26 23:25:47][train:194][INFO] Running validation...
203
+ [2025-10-26 23:27:27][logger:171][INFO] [step: 1593835520] [val/train_token_count: 1593835520] [val/train_batch_count: 760] [val/train_flop_count: 0] [val/train_total_time: 7771.637] [val/train_update_time: 4052.504] [val/loss: 5.080] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.857] [val/val_tokens_per_second: 410185.320] [val/loss_avg_len_2048: 5.080] [val/perplexity_len_2048: 160.844] [val/loss_avg_len_1024: 5.095] [val/perplexity_len_1024: 163.238] [val/loss_avg_len_512: 5.124] [val/perplexity_len_512: 168.074]
204
+ [2025-10-26 23:28:21][utils:57][INFO] [P: 77.00%] [S: 1614807040/2097152000] [T: 2:12:04] [ETA: 0:39:27] [loss: 5.113] [tokens/s: 201867.208] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
205
+ [2025-10-26 23:29:14][utils:57][INFO] [P: 78.00%] [S: 1635778560/2097152000] [T: 2:12:58] [ETA: 0:37:30] [loss: 5.037] [tokens/s: 223591.734] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
206
+ [2025-10-26 23:29:14][train:194][INFO] Running validation...
207
+ [2025-10-26 23:30:56][logger:171][INFO] [step: 1635778560] [val/train_token_count: 1635778560] [val/train_batch_count: 780] [val/train_flop_count: 0] [val/train_total_time: 7978.290] [val/train_update_time: 4159.058] [val/loss: 5.075] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.504] [val/val_tokens_per_second: 403532.450] [val/loss_avg_len_2048: 5.075] [val/perplexity_len_2048: 159.990] [val/loss_avg_len_1024: 5.090] [val/perplexity_len_1024: 162.369] [val/loss_avg_len_512: 5.119] [val/perplexity_len_512: 167.195]
208
+ [2025-10-26 23:31:49][utils:57][INFO] [P: 79.00%] [S: 1656750080/2097152000] [T: 2:15:33] [ETA: 0:36:01] [loss: 5.071] [tokens/s: 201551.794] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
209
+ [2025-10-26 23:32:42][utils:57][INFO] [P: 80.00%] [S: 1677721600/2097152000] [T: 2:16:26] [ETA: 0:34:06] [loss: 5.039] [tokens/s: 223322.014] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
210
+ [2025-10-26 23:32:42][logger:171][INFO] [step: 1677721600] [train_eval/train_token_count: 1677721600] [train_eval/train_batch_count: 800] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 8186.597] [train_eval/train_update_time: 4265.605] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.077] [train_eval/perplexity_len_2048: 160.213] [train_eval/loss_avg_len_1024: 5.092] [train_eval/perplexity_len_1024: 162.774] [train_eval/loss_avg_len_512: 5.120] [train_eval/perplexity_len_512: 167.277]
211
+ [2025-10-26 23:32:42][train:194][INFO] Running validation...
212
+ [2025-10-26 23:34:24][logger:171][INFO] [step: 1677721600] [val/train_token_count: 1677721600] [val/train_batch_count: 800] [val/train_flop_count: 0] [val/train_total_time: 8186.597] [val/train_update_time: 4265.605] [val/loss: 5.070] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.590] [val/val_tokens_per_second: 403191.092] [val/loss_avg_len_2048: 5.070] [val/perplexity_len_2048: 159.243] [val/loss_avg_len_1024: 5.085] [val/perplexity_len_1024: 161.622] [val/loss_avg_len_512: 5.115] [val/perplexity_len_512: 166.439]
213
+ [2025-10-26 23:34:24][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001677721600.pt...
214
+ [2025-10-26 23:34:24][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001677721600.pt.
215
+ [2025-10-26 23:34:24][logger:171][INFO] [step: 1677721600] [checkpoint/checkpoint_time: 0.434]
216
+ [2025-10-26 23:35:18][utils:57][INFO] [P: 81.00%] [S: 1698693120/2097152000] [T: 2:19:02] [ETA: 0:32:36] [loss: 5.020] [tokens/s: 201236.804] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
217
+ [2025-10-26 23:36:11][utils:57][INFO] [P: 82.00%] [S: 1719664640/2097152000] [T: 2:19:55] [ETA: 0:30:42] [loss: 5.034] [tokens/s: 222808.847] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
218
+ [2025-10-26 23:36:11][train:194][INFO] Running validation...
219
+ [2025-10-26 23:37:53][logger:171][INFO] [step: 1719664640] [val/train_token_count: 1719664640] [val/train_batch_count: 820] [val/train_flop_count: 0] [val/train_total_time: 8395.407] [val/train_update_time: 4372.160] [val/loss: 5.066] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.431] [val/val_tokens_per_second: 403819.955] [val/loss_avg_len_2048: 5.066] [val/perplexity_len_2048: 158.613] [val/loss_avg_len_1024: 5.081] [val/perplexity_len_1024: 160.994] [val/loss_avg_len_512: 5.111] [val/perplexity_len_512: 165.808]
220
+ [2025-10-26 23:38:46][utils:57][INFO] [P: 83.00%] [S: 1740636160/2097152000] [T: 2:22:30] [ETA: 0:29:11] [loss: 5.062] [tokens/s: 200933.563] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
221
+ [2025-10-26 23:39:39][utils:57][INFO] [P: 84.00%] [S: 1761607680/2097152000] [T: 2:23:23] [ETA: 0:27:18] [loss: 5.039] [tokens/s: 222414.750] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
222
+ [2025-10-26 23:39:39][train:194][INFO] Running validation...
223
+ [2025-10-26 23:41:19][logger:171][INFO] [step: 1761607680] [val/train_token_count: 1761607680] [val/train_batch_count: 840] [val/train_flop_count: 0] [val/train_total_time: 8603.637] [val/train_update_time: 4478.716] [val/loss: 5.063] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.801] [val/val_tokens_per_second: 410416.185] [val/loss_avg_len_2048: 5.063] [val/perplexity_len_2048: 158.094] [val/loss_avg_len_1024: 5.078] [val/perplexity_len_1024: 160.470] [val/loss_avg_len_512: 5.108] [val/perplexity_len_512: 165.272]
224
+ [2025-10-26 23:42:13][utils:57][INFO] [P: 85.00%] [S: 1782579200/2097152000] [T: 2:25:56] [ETA: 0:25:45] [loss: 5.099] [tokens/s: 200938.684] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
225
+ [2025-10-26 23:42:13][logger:171][INFO] [step: 1782579200] [train_eval/train_token_count: 1782579200] [train_eval/train_batch_count: 850] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 8756.834] [train_eval/train_update_time: 4531.993] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.061] [train_eval/perplexity_len_2048: 157.686] [train_eval/loss_avg_len_1024: 5.071] [train_eval/perplexity_len_1024: 159.400] [train_eval/loss_avg_len_512: 5.099] [train_eval/perplexity_len_512: 163.821]
226
+ [2025-10-26 23:43:06][utils:57][INFO] [P: 86.00%] [S: 1803550720/2097152000] [T: 2:26:50] [ETA: 0:23:54] [loss: 5.075] [tokens/s: 222436.658] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
227
+ [2025-10-26 23:43:06][train:194][INFO] Running validation...
228
+ [2025-10-26 23:44:46][logger:171][INFO] [step: 1803550720] [val/train_token_count: 1803550720] [val/train_batch_count: 860] [val/train_flop_count: 0] [val/train_total_time: 8810.230] [val/train_update_time: 4585.267] [val/loss: 5.061] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.596] [val/val_tokens_per_second: 411261.912] [val/loss_avg_len_2048: 5.061] [val/perplexity_len_2048: 157.677] [val/loss_avg_len_1024: 5.075] [val/perplexity_len_1024: 160.050] [val/loss_avg_len_512: 5.105] [val/perplexity_len_512: 164.844]
229
+ [2025-10-26 23:45:39][utils:57][INFO] [P: 87.00%] [S: 1824522240/2097152000] [T: 2:29:23] [ETA: 0:22:19] [loss: 5.019] [tokens/s: 200990.678] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
230
+ [2025-10-26 23:46:32][utils:57][INFO] [P: 88.00%] [S: 1845493760/2097152000] [T: 2:30:16] [ETA: 0:20:29] [loss: 5.056] [tokens/s: 222896.705] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
231
+ [2025-10-26 23:46:32][train:194][INFO] Running validation...
232
+ [2025-10-26 23:48:12][logger:171][INFO] [step: 1845493760] [val/train_token_count: 1845493760] [val/train_batch_count: 880] [val/train_flop_count: 0] [val/train_total_time: 9016.607] [val/train_update_time: 4691.817] [val/loss: 5.058] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.554] [val/val_tokens_per_second: 411433.791] [val/loss_avg_len_2048: 5.058] [val/perplexity_len_2048: 157.352] [val/loss_avg_len_1024: 5.073] [val/perplexity_len_1024: 159.725] [val/loss_avg_len_512: 5.103] [val/perplexity_len_512: 164.520]
233
+ [2025-10-26 23:49:05][utils:57][INFO] [P: 89.00%] [S: 1866465280/2097152000] [T: 2:32:49] [ETA: 0:18:53] [loss: 5.094] [tokens/s: 201373.463] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
234
+ [2025-10-26 23:49:59][utils:57][INFO] [P: 90.00%] [S: 1887436800/2097152000] [T: 2:33:42] [ETA: 0:17:04] [loss: 5.027] [tokens/s: 223490.653] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
235
+ [2025-10-26 23:49:59][logger:171][INFO] [step: 1887436800] [train_eval/train_token_count: 1887436800] [train_eval/train_batch_count: 900] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 9222.956] [train_eval/train_update_time: 4798.368] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.061] [train_eval/perplexity_len_2048: 157.725] [train_eval/loss_avg_len_1024: 5.075] [train_eval/perplexity_len_1024: 160.002] [train_eval/loss_avg_len_512: 5.102] [train_eval/perplexity_len_512: 164.418]
236
+ [2025-10-26 23:49:59][train:194][INFO] Running validation...
237
+ [2025-10-26 23:51:39][logger:171][INFO] [step: 1887436800] [val/train_token_count: 1887436800] [val/train_batch_count: 900] [val/train_flop_count: 0] [val/train_total_time: 9222.956] [val/train_update_time: 4798.368] [val/loss: 5.057] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.031] [val/val_tokens_per_second: 409473.595] [val/loss_avg_len_2048: 5.057] [val/perplexity_len_2048: 157.117] [val/loss_avg_len_1024: 5.072] [val/perplexity_len_1024: 159.489] [val/loss_avg_len_512: 5.102] [val/perplexity_len_512: 164.282]
238
+ [2025-10-26 23:51:39][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001887436800.pt...
239
+ [2025-10-26 23:51:39][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001887436800.pt.
240
+ [2025-10-26 23:51:39][logger:171][INFO] [step: 1887436800] [checkpoint/checkpoint_time: 0.442]
241
+ [2025-10-26 23:52:33][utils:57][INFO] [P: 91.00%] [S: 1908408320/2097152000] [T: 2:36:16] [ETA: 0:15:27] [loss: 5.049] [tokens/s: 201672.909] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
242
+ [2025-10-26 23:53:26][utils:57][INFO] [P: 92.00%] [S: 1929379840/2097152000] [T: 2:37:10] [ETA: 0:13:40] [loss: 5.089] [tokens/s: 223717.151] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
243
+ [2025-10-26 23:53:26][train:194][INFO] Running validation...
244
+ [2025-10-26 23:55:06][logger:171][INFO] [step: 1929379840] [val/train_token_count: 1929379840] [val/train_batch_count: 920] [val/train_flop_count: 0] [val/train_total_time: 9430.235] [val/train_update_time: 4904.913] [val/loss: 5.056] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.070] [val/val_tokens_per_second: 409313.027] [val/loss_avg_len_2048: 5.056] [val/perplexity_len_2048: 156.924] [val/loss_avg_len_1024: 5.071] [val/perplexity_len_1024: 159.296] [val/loss_avg_len_512: 5.100] [val/perplexity_len_512: 164.083]
245
+ [2025-10-26 23:55:59][utils:57][INFO] [P: 93.00%] [S: 1950351360/2097152000] [T: 2:39:43] [ETA: 0:12:01] [loss: 5.060] [tokens/s: 201942.074] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
246
+ [2025-10-26 23:56:53][utils:57][INFO] [P: 94.00%] [S: 1971322880/2097152000] [T: 2:40:37] [ETA: 0:10:15] [loss: 5.021] [tokens/s: 223658.926] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
247
+ [2025-10-26 23:56:53][train:194][INFO] Running validation...
248
+ [2025-10-26 23:58:33][logger:171][INFO] [step: 1971322880] [val/train_token_count: 1971322880] [val/train_batch_count: 940] [val/train_flop_count: 0] [val/train_total_time: 9637.073] [val/train_update_time: 5011.441] [val/loss: 5.055] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.583] [val/val_tokens_per_second: 407224.794] [val/loss_avg_len_2048: 5.055] [val/perplexity_len_2048: 156.817] [val/loss_avg_len_1024: 5.070] [val/perplexity_len_1024: 159.188] [val/loss_avg_len_512: 5.100] [val/perplexity_len_512: 163.978]
249
+ [2025-10-26 23:59:27][utils:57][INFO] [P: 95.00%] [S: 1992294400/2097152000] [T: 2:43:11] [ETA: 0:08:35] [loss: 5.055] [tokens/s: 201792.021] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
250
+ [2025-10-26 23:59:27][logger:171][INFO] [step: 1992294400] [train_eval/train_token_count: 1992294400] [train_eval/train_batch_count: 950] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 9791.052] [train_eval/train_update_time: 5064.716] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.052] [train_eval/perplexity_len_2048: 156.309] [train_eval/loss_avg_len_1024: 5.068] [train_eval/perplexity_len_1024: 158.884] [train_eval/loss_avg_len_512: 5.095] [train_eval/perplexity_len_512: 163.199]
251
+ [2025-10-27 00:00:20][utils:57][INFO] [P: 96.00%] [S: 2013265920/2097152000] [T: 2:44:04] [ETA: 0:06:50] [loss: 5.061] [tokens/s: 223422.133] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
252
+ [2025-10-27 00:00:20][train:194][INFO] Running validation...
253
+ [2025-10-27 00:02:00][logger:171][INFO] [step: 2013265920] [val/train_token_count: 2013265920] [val/train_batch_count: 960] [val/train_flop_count: 0] [val/train_total_time: 9844.444] [val/train_update_time: 5117.988] [val/loss: 5.055] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.617] [val/val_tokens_per_second: 411173.096] [val/loss_avg_len_2048: 5.055] [val/perplexity_len_2048: 156.755] [val/loss_avg_len_1024: 5.070] [val/perplexity_len_1024: 159.125] [val/loss_avg_len_512: 5.099] [val/perplexity_len_512: 163.911]
254
+ [2025-10-27 00:02:53][utils:57][INFO] [P: 97.00%] [S: 2034237440/2097152000] [T: 2:46:37] [ETA: 0:05:09] [loss: 5.073] [tokens/s: 201787.348] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
255
+ [2025-10-27 00:03:47][utils:57][INFO] [P: 98.00%] [S: 2055208960/2097152000] [T: 2:47:30] [ETA: 0:03:25] [loss: 5.058] [tokens/s: 223406.851] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
256
+ [2025-10-27 00:03:47][train:194][INFO] Running validation...
257
+ [2025-10-27 00:05:26][logger:171][INFO] [step: 2055208960] [val/train_token_count: 2055208960] [val/train_batch_count: 980] [val/train_flop_count: 0] [val/train_total_time: 10050.842] [val/train_update_time: 5224.531] [val/loss: 5.055] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.766] [val/val_tokens_per_second: 410561.508] [val/loss_avg_len_2048: 5.055] [val/perplexity_len_2048: 156.726] [val/loss_avg_len_1024: 5.070] [val/perplexity_len_1024: 159.097] [val/loss_avg_len_512: 5.099] [val/perplexity_len_512: 163.883]
258
+ [2025-10-27 00:05:26][train:854][INFO] Training finished with 2055208960 tokens!
metrics/jsonlines/checkpoint.jsonl CHANGED
@@ -1,9 +1,9 @@
1
- {"step": 209715200, "checkpoint/checkpoint_time": 0.41898034600308165}
2
- {"step": 419430400, "checkpoint/checkpoint_time": 0.4345024300273508}
3
- {"step": 629145600, "checkpoint/checkpoint_time": 0.4318052630405873}
4
- {"step": 838860800, "checkpoint/checkpoint_time": 0.41630766697926447}
5
- {"step": 1048576000, "checkpoint/checkpoint_time": 0.42021259502507746}
6
- {"step": 1258291200, "checkpoint/checkpoint_time": 0.4185596199822612}
7
- {"step": 1468006400, "checkpoint/checkpoint_time": 0.42701432603644207}
8
- {"step": 1677721600, "checkpoint/checkpoint_time": 0.41897460201289505}
9
- {"step": 1887436800, "checkpoint/checkpoint_time": 0.42052310600411147}
 
1
+ {"step": 209715200, "checkpoint/checkpoint_time": 0.44336505798855796}
2
+ {"step": 419430400, "checkpoint/checkpoint_time": 0.43483636603923514}
3
+ {"step": 629145600, "checkpoint/checkpoint_time": 0.44907815201440826}
4
+ {"step": 838860800, "checkpoint/checkpoint_time": 0.45288487296784297}
5
+ {"step": 1048576000, "checkpoint/checkpoint_time": 0.442782363970764}
6
+ {"step": 1258291200, "checkpoint/checkpoint_time": 0.4494084370089695}
7
+ {"step": 1468006400, "checkpoint/checkpoint_time": 0.4516124309739098}
8
+ {"step": 1677721600, "checkpoint/checkpoint_time": 0.43384581699501723}
9
+ {"step": 1887436800, "checkpoint/checkpoint_time": 0.4421905800118111}
metrics/jsonlines/norm.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
metrics/jsonlines/throughput.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
metrics/jsonlines/train.jsonl CHANGED
@@ -1,98 +1,98 @@
1
- {"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 72.8780845789588, "train/update_time": 72.70853926707059, "train/lr": 0.0009000000000000001, "train/loss": 9.761818885803223, "train/global_grad_norm": 1.2346543073654175}
2
- {"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 141.98572698398493, "train/update_time": 141.70471469813492, "train/lr": 0.0009997960964140947, "train/loss": 8.126626014709473, "train/global_grad_norm": 0.962840735912323}
3
- {"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 301.07908088195836, "train/update_time": 210.67360014707083, "train/lr": 0.0009990914580222257, "train/loss": 7.519838333129883, "train/global_grad_norm": 0.5704449415206909}
4
- {"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 370.1664184979745, "train/update_time": 279.6480940769543, "train/lr": 0.0009978842768382998, "train/loss": 7.193228244781494, "train/global_grad_norm": 0.4210163950920105}
5
- {"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 529.2283039629692, "train/update_time": 348.5810220290441, "train/lr": 0.0009961757683914405, "train/loss": 6.9471588134765625, "train/global_grad_norm": 0.26851552724838257}
6
- {"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 598.2814449759899, "train/update_time": 417.52457091695396, "train/lr": 0.00099396765300483, "train/loss": 6.682523727416992, "train/global_grad_norm": 0.37517017126083374}
7
- {"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 757.2756010189769, "train/update_time": 486.45154978276696, "train/lr": 0.0009912621540634887, "train/loss": 6.482426166534424, "train/global_grad_norm": 0.303166002035141}
8
- {"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 826.3311064429581, "train/update_time": 555.3910710238852, "train/lr": 0.000988061995775515, "train/loss": 6.281425952911377, "train/global_grad_norm": 0.3328990936279297}
9
- {"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 985.3252943049883, "train/update_time": 624.3266040377785, "train/lr": 0.0009843704004290394, "train/loss": 6.091310977935791, "train/global_grad_norm": 0.3429378867149353}
10
- {"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 1054.376383124967, "train/update_time": 693.2531012066174, "train/lr": 0.0009801910851476522, "train/loss": 5.976389408111572, "train/global_grad_norm": 0.5708628296852112}
11
- {"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 1213.9692550019827, "train/update_time": 762.1767162576434, "train/lr": 0.0009755282581475768, "train/loss": 5.850888252258301, "train/global_grad_norm": 0.38383719325065613}
12
- {"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 1283.0319452389958, "train/update_time": 831.1140817146515, "train/lr": 0.0009703866145003512, "train/loss": 5.717982769012451, "train/global_grad_norm": 0.5857133865356445}
13
- {"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 1442.136592313007, "train/update_time": 900.0497305926983, "train/lr": 0.0009647713314052896, "train/loss": 5.650483131408691, "train/global_grad_norm": 0.3780403137207031}
14
- {"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 1511.2121625279542, "train/update_time": 969.0007961746887, "train/lr": 0.0009586880629764817, "train/loss": 5.568375587463379, "train/global_grad_norm": 0.4291097819805145}
15
- {"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 1670.3140526569914, "train/update_time": 1037.9416665838216, "train/lr": 0.0009521429345495787, "train/loss": 5.447340965270996, "train/global_grad_norm": 0.3828825354576111}
16
- {"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 1739.393865599006, "train/update_time": 1106.8950568859, "train/lr": 0.0009451425365140996, "train/loss": 5.406825065612793, "train/global_grad_norm": 0.570035994052887}
17
- {"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 1898.5316135869944, "train/update_time": 1175.8391638009343, "train/lr": 0.000937693917677468, "train/loss": 5.300189971923828, "train/global_grad_norm": 0.39234593510627747}
18
- {"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 1967.6012063919916, "train/update_time": 1244.7901415458764, "train/lr": 0.0009298045781674596, "train/loss": 5.269626617431641, "train/global_grad_norm": 0.5113462209701538}
19
- {"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 2127.1002123509534, "train/update_time": 1313.7616025957977, "train/lr": 0.0009214824618802108, "train/loss": 5.244931221008301, "train/global_grad_norm": 0.5057875514030457}
20
- {"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 2196.1893687059637, "train/update_time": 1382.727176492801, "train/lr": 0.000912735948481387, "train/loss": 5.148478984832764, "train/global_grad_norm": 0.4193888008594513}
21
- {"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 2356.148066883965, "train/update_time": 1451.6794419176877, "train/lr": 0.0009035738449685707, "train/loss": 5.105681896209717, "train/global_grad_norm": 0.4414325952529907}
22
- {"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 2425.217230288952, "train/update_time": 1520.627368493646, "train/lr": 0.0008940053768033609, "train/loss": 5.069815635681152, "train/global_grad_norm": 0.45171600580215454}
23
- {"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 2584.4522528109956, "train/update_time": 1589.588658401568, "train/lr": 0.0008840401786221159, "train/loss": 5.012455940246582, "train/global_grad_norm": 0.4408389925956726}
24
- {"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 2653.526103938988, "train/update_time": 1658.537395758729, "train/lr": 0.0008736882845346905, "train/loss": 4.963330268859863, "train/global_grad_norm": 0.5382868647575378}
25
- {"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 2812.5629005369847, "train/update_time": 1727.488953433698, "train/lr": 0.0008629601180209381, "train/loss": 4.960586071014404, "train/global_grad_norm": 0.4774056375026703}
26
- {"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 2881.6501679039793, "train/update_time": 1796.4439407095779, "train/lr": 0.0008518664814351503, "train/loss": 4.907783031463623, "train/global_grad_norm": 0.42411527037620544}
27
- {"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 3040.8488886230043, "train/update_time": 1865.3935032716836, "train/lr": 0.0008404185451290017, "train/loss": 4.902004718780518, "train/global_grad_norm": 0.6527204513549805}
28
- {"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 3109.932256244996, "train/update_time": 1934.345054808713, "train/lr": 0.0008286278362039527, "train/loss": 4.846382141113281, "train/global_grad_norm": 0.5287019610404968}
29
- {"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 3268.990683123993, "train/update_time": 2003.3044974627555, "train/lr": 0.0008165062269044352, "train/loss": 4.817776203155518, "train/global_grad_norm": 0.5458475351333618}
30
- {"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 3338.075166533992, "train/update_time": 2072.2588375147316, "train/lr": 0.0008040659226635089, "train/loss": 4.793833255767822, "train/global_grad_norm": 0.47956281900405884}
31
- {"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 3497.58566847397, "train/update_time": 2141.20914719766, "train/lr": 0.0007913194498130252, "train/loss": 4.808987140655518, "train/global_grad_norm": 0.4503716826438904}
32
- {"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 3566.6879564279807, "train/update_time": 2210.160061070579, "train/lr": 0.000778279642970672, "train/loss": 4.740894317626953, "train/global_grad_norm": 0.4691788852214813}
33
- {"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 3725.7928462160053, "train/update_time": 2279.117056379677, "train/lr": 0.0007649596321166025, "train/loss": 4.758164882659912, "train/global_grad_norm": 0.48899734020233154}
34
- {"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 3794.8868216549745, "train/update_time": 2348.075175291684, "train/lr": 0.0007513728293726579, "train/loss": 4.721774578094482, "train/global_grad_norm": 0.46350735425949097}
35
- {"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 3954.2480487469584, "train/update_time": 2417.024196324637, "train/lr": 0.0007375329154974975, "train/loss": 4.702852725982666, "train/global_grad_norm": 0.45745885372161865}
36
- {"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 4023.3350287670037, "train/update_time": 2485.9809508775943, "train/lr": 0.0007234538261112341, "train/loss": 4.632237434387207, "train/global_grad_norm": 0.6149536967277527}
37
- {"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 4182.707546444959, "train/update_time": 2554.9301924086176, "train/lr": 0.0007091497376634464, "train/loss": 4.653685092926025, "train/global_grad_norm": 0.45167258381843567}
38
- {"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 4251.801609379996, "train/update_time": 2623.8855139956577, "train/lr": 0.0006946350531586958, "train/loss": 4.6325860023498535, "train/global_grad_norm": 0.473895788192749}
39
- {"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 4410.926594229008, "train/update_time": 2692.833996849775, "train/lr": 0.0006799243876539214, "train/loss": 4.6405487060546875, "train/global_grad_norm": 0.6041337847709656}
40
- {"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 4480.02484513697, "train/update_time": 2761.798597707704, "train/lr": 0.0006650325535423166, "train/loss": 4.547246932983398, "train/global_grad_norm": 0.5259911417961121}
41
- {"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 4639.57663258299, "train/update_time": 2830.754078882688, "train/lr": 0.0006499745456385053, "train/loss": 4.569416522979736, "train/global_grad_norm": 0.5999050140380859}
42
- {"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 4708.671424973989, "train/update_time": 2899.7122086867457, "train/lr": 0.0006347655260800339, "train/loss": 4.56511926651001, "train/global_grad_norm": 0.47612109780311584}
43
- {"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 4867.7282030819915, "train/update_time": 2968.665082120744, "train/lr": 0.0006194208090603844, "train/loss": 4.56137228012085, "train/global_grad_norm": 0.555321216583252}
44
- {"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 4936.824467270984, "train/update_time": 3037.629162015859, "train/lr": 0.0006039558454088796, "train/loss": 4.581612586975098, "train/global_grad_norm": 0.4930824935436249}
45
- {"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 5096.086593801971, "train/update_time": 3106.58031973982, "train/lr": 0.0005883862070330078, "train/loss": 4.533069610595703, "train/global_grad_norm": 0.6734046339988708}
46
- {"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 5165.18791128695, "train/update_time": 3175.5471031158813, "train/lr": 0.0005727275712388317, "train/loss": 4.493007183074951, "train/global_grad_norm": 0.4193324148654938}
47
- {"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 5324.26284656598, "train/update_time": 3244.498165418743, "train/lr": 0.0005569957049452703, "train/loss": 4.517164707183838, "train/global_grad_norm": 0.4578356444835663}
48
- {"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 5393.351182107988, "train/update_time": 3313.452428144694, "train/lr": 0.0005412064488081482, "train/loss": 4.49570369720459, "train/global_grad_norm": 0.5219614505767822}
49
- {"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 5552.3678305439535, "train/update_time": 3382.4195148196886, "train/lr": 0.0005253757012699972, "train/loss": 4.4889116287231445, "train/global_grad_norm": 0.3808702528476715}
50
- {"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 5621.47650762595, "train/update_time": 3451.396044731722, "train/lr": 0.0005095194025516734, "train/loss": 4.463628768920898, "train/global_grad_norm": 0.52295982837677}
51
- {"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 5780.910811659007, "train/update_time": 3520.3645316287293, "train/lr": 0.0004936535186019053, "train/loss": 4.463111877441406, "train/global_grad_norm": 0.46243464946746826}
52
- {"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 5850.004313221958, "train/update_time": 3589.3208626466803, "train/lr": 0.00047779402502093696, "train/loss": 4.4545392990112305, "train/global_grad_norm": 0.5457447171211243}
53
- {"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 6009.048461712955, "train/update_time": 3658.2837809736375, "train/lr": 0.0004619568909744525, "train/loss": 4.413477420806885, "train/global_grad_norm": 0.4806564152240753}
54
- {"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 6078.168775815982, "train/update_time": 3727.2589657856734, "train/lr": 0.00044615806311398067, "train/loss": 4.425002098083496, "train/global_grad_norm": 0.5740962028503418}
55
- {"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 6237.341601410008, "train/update_time": 3796.2288036436657, "train/lr": 0.0004304134495199673, "train/loss": 4.370032787322998, "train/global_grad_norm": 0.46423137187957764}
56
- {"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 6306.462939933001, "train/update_time": 3865.200992291735, "train/lr": 0.0004147389036836882, "train/loss": 4.4128217697143555, "train/global_grad_norm": 0.5103574991226196}
57
- {"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 6465.5906408529845, "train/update_time": 3934.16611681378, "train/lr": 0.0003991502085441259, "train/loss": 4.360372066497803, "train/global_grad_norm": 0.38778555393218994}
58
- {"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 6534.709841756965, "train/update_time": 4003.142038117745, "train/lr": 0.0003836630605958888, "train/loss": 4.410104751586914, "train/global_grad_norm": 0.5044620633125305}
59
- {"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 6694.0332092359895, "train/update_time": 4072.1044038116815, "train/lr": 0.00036829305408417155, "train/loss": 4.391136646270752, "train/global_grad_norm": 0.47766736149787903}
60
- {"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 6763.143532235001, "train/update_time": 4141.075749416603, "train/lr": 0.000353055665302672, "train/loss": 4.387944221496582, "train/global_grad_norm": 0.4765089154243469}
61
- {"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 6922.65050104697, "train/update_time": 4210.039972436498, "train/lr": 0.0003379662370102746, "train/loss": 4.3538899421691895, "train/global_grad_norm": 0.41657164692878723}
62
- {"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 6991.763516802981, "train/update_time": 4279.012207661464, "train/lr": 0.00032303996298219405, "train/loss": 4.328833103179932, "train/global_grad_norm": 0.44749119877815247}
63
- {"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 7150.874404011003, "train/update_time": 4347.981613874435, "train/lr": 0.00030829187271113034, "train/loss": 4.339343070983887, "train/global_grad_norm": 0.4195193946361542}
64
- {"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 7219.974368761992, "train/update_time": 4416.951972602401, "train/lr": 0.0002937368162738445, "train/loss": 4.329409599304199, "train/global_grad_norm": 0.43621429800987244}
65
- {"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 7379.584015795961, "train/update_time": 4485.922716939414, "train/lr": 0.0002793894493783894, "train/loss": 4.303433418273926, "train/global_grad_norm": 0.4920794367790222}
66
- {"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 7448.697525598982, "train/update_time": 4554.898137903423, "train/lr": 0.00026526421860705474, "train/loss": 4.324357986450195, "train/global_grad_norm": 0.39209991693496704}
67
- {"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 7607.930423379992, "train/update_time": 4623.874382758397, "train/lr": 0.0002513753468698824, "train/loss": 4.26851749420166, "train/global_grad_norm": 0.4350663721561432}
68
- {"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 7677.034741098003, "train/update_time": 4692.8429151014425, "train/lr": 0.00023773681908340283, "train/loss": 4.282830238342285, "train/global_grad_norm": 0.39358457922935486}
69
- {"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 7836.185687895981, "train/update_time": 4761.811451301328, "train/lr": 0.00022436236808900823, "train/loss": 4.284267902374268, "train/global_grad_norm": 0.4044873118400574}
70
- {"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 7905.308589838969, "train/update_time": 4830.788480303425, "train/lr": 0.00021126546082514682, "train/loss": 4.279461860656738, "train/global_grad_norm": 0.41764646768569946}
71
- {"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 8064.825394029962, "train/update_time": 4899.746858645405, "train/lr": 0.00019845928476725522, "train/loss": 4.275580406188965, "train/global_grad_norm": 0.34193727374076843}
72
- {"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 8133.938365876966, "train/update_time": 4968.716519999434, "train/lr": 0.0001859567346490913, "train/loss": 4.250948429107666, "train/global_grad_norm": 0.37154102325439453}
73
- {"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 8293.024653506, "train/update_time": 5037.670779642533, "train/lr": 0.00017377039947882782, "train/loss": 4.268786907196045, "train/global_grad_norm": 0.41168051958084106}
74
- {"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 8362.121594669996, "train/update_time": 5106.633266707533, "train/lr": 0.00016191254986299043, "train/loss": 4.253113746643066, "train/global_grad_norm": 0.358982115983963}
75
- {"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 8521.302722692955, "train/update_time": 5175.6192292046035, "train/lr": 0.00015039512565099468, "train/loss": 4.235835075378418, "train/global_grad_norm": 0.3571152091026306}
76
- {"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 8590.419437660952, "train/update_time": 5244.595247996622, "train/lr": 0.00013922972391273224, "train/loss": 4.197190761566162, "train/global_grad_norm": 0.34816187620162964}
77
- {"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 8749.521732740977, "train/update_time": 5313.566678232513, "train/lr": 0.00012842758726130281, "train/loss": 4.2619099617004395, "train/global_grad_norm": 0.31704217195510864}
78
- {"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 8818.629101625993, "train/update_time": 5382.53907933255, "train/lr": 0.00011799959253265679, "train/loss": 4.183486461639404, "train/global_grad_norm": 0.3519703447818756}
79
- {"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 8977.726054205967, "train/update_time": 5451.502941412618, "train/lr": 0.00010795623983354214, "train/loss": 4.21256685256958, "train/global_grad_norm": 0.3151834309101105}
80
- {"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 9046.829455698957, "train/update_time": 5520.4718770905165, "train/lr": 9.830764196878872e-05, "train/loss": 4.190575122833252, "train/global_grad_norm": 0.3258683383464813}
81
- {"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 9206.457426826004, "train/update_time": 5589.4371389435255, "train/lr": 8.906351425856951e-05, "train/loss": 4.166137218475342, "train/global_grad_norm": 0.29673388600349426}
82
- {"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 9275.580202061974, "train/update_time": 5658.411268384545, "train/lr": 8.02331647558977e-05, "train/loss": 4.178226470947266, "train/global_grad_norm": 0.2848501205444336}
83
- {"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 9434.65409148595, "train/update_time": 5727.392325014458, "train/lr": 7.182548487420554e-05, "train/loss": 4.2109150886535645, "train/global_grad_norm": 0.29233965277671814}
84
- {"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 9503.765275373997, "train/update_time": 5796.35883763741, "train/lr": 6.384894043444556e-05, "train/loss": 4.159761428833008, "train/global_grad_norm": 0.30078691244125366}
85
- {"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 9662.829677159956, "train/update_time": 5865.338317954331, "train/lr": 5.6311563140726166e-05, "train/loss": 4.228906631469727, "train/global_grad_norm": 0.2724829316139221}
86
- {"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 9731.945241109002, "train/update_time": 5934.31139906036, "train/lr": 4.922094249306547e-05, "train/loss": 4.2079997062683105, "train/global_grad_norm": 0.25332000851631165}
87
- {"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 9891.181177204999, "train/update_time": 6003.2874812923255, "train/lr": 4.2584218145409916e-05, "train/loss": 4.153715133666992, "train/global_grad_norm": 0.2573024034500122}
88
- {"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 9960.290040618973, "train/update_time": 6072.254921466229, "train/lr": 3.6408072716606236e-05, "train/loss": 4.171263694763184, "train/global_grad_norm": 0.27709466218948364}
89
- {"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 10119.394410843961, "train/update_time": 6141.227180292248, "train/lr": 3.069872506157217e-05, "train/loss": 4.226706504821777, "train/global_grad_norm": 0.25968244671821594}
90
- {"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 10188.515749696002, "train/update_time": 6210.206845312321, "train/lr": 2.5461924009435368e-05, "train/loss": 4.141597270965576, "train/global_grad_norm": 0.2525114119052887}
91
- {"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 10348.059376804973, "train/update_time": 6279.188010795449, "train/lr": 2.0702942574950812e-05, "train/loss": 4.176564693450928, "train/global_grad_norm": 0.2523636221885681}
92
- {"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 10417.185683044954, "train/update_time": 6348.16824121651, "train/lr": 1.642657264902142e-05, "train/loss": 4.20511531829834, "train/global_grad_norm": 0.23348693549633026}
93
- {"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 10576.369776681007, "train/update_time": 6417.1375977324205, "train/lr": 1.2637120173670358e-05, "train/loss": 4.189664363861084, "train/global_grad_norm": 0.22364211082458496}
94
- {"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 10645.485156638955, "train/update_time": 6486.112916803453, "train/lr": 9.338400806321978e-06, "train/loss": 4.146772384643555, "train/global_grad_norm": 0.22672487795352936}
95
- {"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 10804.645974584972, "train/update_time": 6555.09181263647, "train/lr": 6.533736077758867e-06, "train/loss": 4.168900966644287, "train/global_grad_norm": 0.22601623833179474}
96
- {"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 10873.77756452799, "train/update_time": 6624.0805201563635, "train/lr": 4.2259500476214406e-06, "train/loss": 4.167483329772949, "train/global_grad_norm": 0.21457543969154358}
97
- {"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 11032.958788217977, "train/update_time": 6693.060108309321, "train/lr": 2.417366460819359e-06, "train/loss": 4.191482067108154, "train/global_grad_norm": 0.21356524527072906}
98
- {"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 11102.099602014001, "train/update_time": 6762.03791546938, "train/lr": 1.1098064077174619e-06, "train/loss": 4.166873455047607, "train/global_grad_norm": 0.20960550010204315}
 
1
+ {"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 57.12486812804127, "train/update_time": 56.932655182085, "train/lr": 0.0009000000000000001, "train/loss": 10.077424049377441, "train/global_grad_norm": 1.0569149255752563}
2
+ {"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 110.51794190204237, "train/update_time": 110.19485108501976, "train/lr": 0.0009997960964140947, "train/loss": 8.169595718383789, "train/global_grad_norm": 0.6573789119720459}
3
+ {"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 263.8700318510528, "train/update_time": 163.45965232816525, "train/lr": 0.0009990914580222257, "train/loss": 7.759955406188965, "train/global_grad_norm": 0.28603488206863403}
4
+ {"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 317.24918574804906, "train/update_time": 216.73109497816768, "train/lr": 0.0009978842768382998, "train/loss": 7.5351409912109375, "train/global_grad_norm": 0.2373751848936081}
5
+ {"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 470.2395834400086, "train/update_time": 270.0089025082416, "train/lr": 0.0009961757683914405, "train/loss": 7.356375694274902, "train/global_grad_norm": 0.25599583983421326}
6
+ {"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 523.6280801940011, "train/update_time": 323.28231063415296, "train/lr": 0.00099396765300483, "train/loss": 7.169342041015625, "train/global_grad_norm": 0.21762162446975708}
7
+ {"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 676.4629730410525, "train/update_time": 376.5575643811608, "train/lr": 0.0009912621540634887, "train/loss": 7.04250955581665, "train/global_grad_norm": 0.17491649091243744}
8
+ {"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 729.8428020050051, "train/update_time": 429.83724463917315, "train/lr": 0.000988061995775515, "train/loss": 6.879690647125244, "train/global_grad_norm": 0.17261892557144165}
9
+ {"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 884.0807184120058, "train/update_time": 483.11551754607353, "train/lr": 0.0009843704004290394, "train/loss": 6.732751369476318, "train/global_grad_norm": 0.32638832926750183}
10
+ {"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 937.4650811910396, "train/update_time": 536.3882422860479, "train/lr": 0.0009801910851476522, "train/loss": 6.633055210113525, "train/global_grad_norm": 0.18298597633838654}
11
+ {"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 1091.3179251340334, "train/update_time": 589.6708546730806, "train/lr": 0.0009755282581475768, "train/loss": 6.5601725578308105, "train/global_grad_norm": 0.7863500714302063}
12
+ {"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 1144.6911778100184, "train/update_time": 642.9422557381331, "train/lr": 0.0009703866145003512, "train/loss": 6.427718162536621, "train/global_grad_norm": 0.2532098889350891}
13
+ {"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 1298.4373718530405, "train/update_time": 696.2163101581973, "train/lr": 0.0009647713314052896, "train/loss": 6.381680488586426, "train/global_grad_norm": 0.1815725564956665}
14
+ {"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 1351.811079526029, "train/update_time": 749.4968144011218, "train/lr": 0.0009586880629764817, "train/loss": 6.308362007141113, "train/global_grad_norm": 0.25452741980552673}
15
+ {"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 1504.7933711430524, "train/update_time": 802.7697926640394, "train/lr": 0.0009521429345495787, "train/loss": 6.192831039428711, "train/global_grad_norm": 0.2731724679470062}
16
+ {"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 1558.167400816048, "train/update_time": 856.0381595880608, "train/lr": 0.0009451425365140996, "train/loss": 6.160712242126465, "train/global_grad_norm": 0.25031647086143494}
17
+ {"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 1711.111571622023, "train/update_time": 909.3183224739623, "train/lr": 0.000937693917677468, "train/loss": 6.076303005218506, "train/global_grad_norm": 0.22126874327659607}
18
+ {"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 1764.4956908360473, "train/update_time": 962.5948072728934, "train/lr": 0.0009298045781674596, "train/loss": 6.05035400390625, "train/global_grad_norm": 0.20042574405670166}
19
+ {"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 1917.3773277360015, "train/update_time": 1015.8676893726224, "train/lr": 0.0009214824618802108, "train/loss": 6.025995254516602, "train/global_grad_norm": 0.4825673997402191}
20
+ {"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 1970.7506705410196, "train/update_time": 1069.132912081608, "train/lr": 0.000912735948481387, "train/loss": 5.939733505249023, "train/global_grad_norm": 0.201382115483284}
21
+ {"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 2124.242168805038, "train/update_time": 1122.4119238386047, "train/lr": 0.0009035738449685707, "train/loss": 5.90223503112793, "train/global_grad_norm": 0.44680795073509216}
22
+ {"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 2177.6055200890405, "train/update_time": 1175.6836349036312, "train/lr": 0.0008940053768033609, "train/loss": 5.879762172698975, "train/global_grad_norm": 0.24954979121685028}
23
+ {"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 2330.6941529700416, "train/update_time": 1228.951100654609, "train/lr": 0.0008840401786221159, "train/loss": 5.81107234954834, "train/global_grad_norm": 0.2971765398979187}
24
+ {"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 2384.0519924180117, "train/update_time": 1282.2238651026273, "train/lr": 0.0008736882845346905, "train/loss": 5.7556939125061035, "train/global_grad_norm": 0.2694259583950043}
25
+ {"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 2537.098761650035, "train/update_time": 1335.4923671315191, "train/lr": 0.0008629601180209381, "train/loss": 5.76292610168457, "train/global_grad_norm": 0.3520471751689911}
26
+ {"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 2590.4739345350536, "train/update_time": 1388.759745098534, "train/lr": 0.0008518664814351503, "train/loss": 5.717782974243164, "train/global_grad_norm": 0.4163813591003418}
27
+ {"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 2743.4162192750373, "train/update_time": 1442.0279306704178, "train/lr": 0.0008404185451290017, "train/loss": 5.692546844482422, "train/global_grad_norm": 0.21434997022151947}
28
+ {"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 2796.791148387012, "train/update_time": 1495.3029013883206, "train/lr": 0.0008286278362039527, "train/loss": 5.643004894256592, "train/global_grad_norm": 0.2754496932029724}
29
+ {"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 2949.6854955510353, "train/update_time": 1548.5792963503627, "train/lr": 0.0008165062269044352, "train/loss": 5.610556125640869, "train/global_grad_norm": 0.2890426218509674}
30
+ {"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 3003.058338998002, "train/update_time": 1601.8528411513544, "train/lr": 0.0008040659226635089, "train/loss": 5.58476448059082, "train/global_grad_norm": 0.380667507648468}
31
+ {"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 3156.3314061540295, "train/update_time": 1655.1285377033055, "train/lr": 0.0007913194498130252, "train/loss": 5.5936055183410645, "train/global_grad_norm": 0.2591659426689148}
32
+ {"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 3209.7035325120087, "train/update_time": 1708.4027871834696, "train/lr": 0.000778279642970672, "train/loss": 5.527544021606445, "train/global_grad_norm": 0.22509922087192535}
33
+ {"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 3362.512193232018, "train/update_time": 1761.6806279715383, "train/lr": 0.0007649596321166025, "train/loss": 5.541203022003174, "train/global_grad_norm": 0.4492305815219879}
34
+ {"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 3415.8845459170407, "train/update_time": 1814.9587236176012, "train/lr": 0.0007513728293726579, "train/loss": 5.501528263092041, "train/global_grad_norm": 0.3490087687969208}
35
+ {"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 3570.6092982320115, "train/update_time": 1868.2295263125561, "train/lr": 0.0007375329154974975, "train/loss": 5.4834885597229, "train/global_grad_norm": 0.3601242005825043}
36
+ {"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 3623.980893074011, "train/update_time": 1921.4990626386134, "train/lr": 0.0007234538261112341, "train/loss": 5.410107612609863, "train/global_grad_norm": 0.4656950533390045}
37
+ {"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 3779.0529326410033, "train/update_time": 1974.7656041345908, "train/lr": 0.0007091497376634464, "train/loss": 5.43698263168335, "train/global_grad_norm": 0.4004105031490326}
38
+ {"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 3832.4348147350247, "train/update_time": 2028.0436954226461, "train/lr": 0.0006946350531586958, "train/loss": 5.412634372711182, "train/global_grad_norm": 0.29114505648612976}
39
+ {"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 3986.6730023160344, "train/update_time": 2081.339985151775, "train/lr": 0.0006799243876539214, "train/loss": 5.414259910583496, "train/global_grad_norm": 0.3105607330799103}
40
+ {"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 4040.0474286440294, "train/update_time": 2134.611642122676, "train/lr": 0.0006650325535423166, "train/loss": 5.334737300872803, "train/global_grad_norm": 0.30427536368370056}
41
+ {"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 4193.527929550037, "train/update_time": 2187.884143058851, "train/lr": 0.0006499745456385053, "train/loss": 5.344532489776611, "train/global_grad_norm": 0.33702728152275085}
42
+ {"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 4246.9178847110015, "train/update_time": 2241.15703301772, "train/lr": 0.0006347655260800339, "train/loss": 5.3456807136535645, "train/global_grad_norm": 0.31774818897247314}
43
+ {"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 4400.437362540048, "train/update_time": 2294.435530113755, "train/lr": 0.0006194208090603844, "train/loss": 5.333301544189453, "train/global_grad_norm": 0.3481753468513489}
44
+ {"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 4453.811728182016, "train/update_time": 2347.708159423666, "train/lr": 0.0006039558454088796, "train/loss": 5.352600574493408, "train/global_grad_norm": 0.25959765911102295}
45
+ {"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 4608.291631601052, "train/update_time": 2400.977511668694, "train/lr": 0.0005883862070330078, "train/loss": 5.296506881713867, "train/global_grad_norm": 0.3832894265651703}
46
+ {"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 4661.668510478048, "train/update_time": 2454.2510100168292, "train/lr": 0.0005727275712388317, "train/loss": 5.275446891784668, "train/global_grad_norm": 0.42716965079307556}
47
+ {"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 4816.396098136029, "train/update_time": 2507.5231073708273, "train/lr": 0.0005569957049452703, "train/loss": 5.291362285614014, "train/global_grad_norm": 0.382432758808136}
48
+ {"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 4869.772359199007, "train/update_time": 2560.802637038869, "train/lr": 0.0005412064488081482, "train/loss": 5.263927936553955, "train/global_grad_norm": 0.2995292842388153}
49
+ {"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 5024.381550224032, "train/update_time": 2614.0767399497563, "train/lr": 0.0005253757012699972, "train/loss": 5.269484043121338, "train/global_grad_norm": 0.32839062809944153}
50
+ {"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 5077.749329403043, "train/update_time": 2667.341171991662, "train/lr": 0.0005095194025516734, "train/loss": 5.244964122772217, "train/global_grad_norm": 0.28380733728408813}
51
+ {"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 5232.712933773, "train/update_time": 2720.6185927585466, "train/lr": 0.0004936535186019053, "train/loss": 5.242177963256836, "train/global_grad_norm": 0.3184642493724823}
52
+ {"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 5286.093386778026, "train/update_time": 2773.886181908485, "train/lr": 0.00047779402502093696, "train/loss": 5.2353105545043945, "train/global_grad_norm": 0.39006081223487854}
53
+ {"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 5441.220508124039, "train/update_time": 2827.1616227625636, "train/lr": 0.0004619568909744525, "train/loss": 5.193413734436035, "train/global_grad_norm": 0.33428674936294556}
54
+ {"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 5494.618768353015, "train/update_time": 2880.4332658784697, "train/lr": 0.00044615806311398067, "train/loss": 5.208868503570557, "train/global_grad_norm": 0.331386536359787}
55
+ {"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 5649.622147237009, "train/update_time": 2933.705731303431, "train/lr": 0.0004304134495199673, "train/loss": 5.155785083770752, "train/global_grad_norm": 0.31903383135795593}
56
+ {"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 5703.016606429999, "train/update_time": 2986.9915024373913, "train/lr": 0.0004147389036836882, "train/loss": 5.194952011108398, "train/global_grad_norm": 0.2875197231769562}
57
+ {"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 5856.936125319044, "train/update_time": 3040.2605143213877, "train/lr": 0.0003991502085441259, "train/loss": 5.166281223297119, "train/global_grad_norm": 0.2693222165107727}
58
+ {"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 5910.318671693036, "train/update_time": 3093.5336661074543, "train/lr": 0.0003836630605958888, "train/loss": 5.196907043457031, "train/global_grad_norm": 0.28884732723236084}
59
+ {"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 6063.751631618012, "train/update_time": 3146.8101018704474, "train/lr": 0.00036829305408417155, "train/loss": 5.1710686683654785, "train/global_grad_norm": 0.2542065382003784}
60
+ {"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 6117.130997166038, "train/update_time": 3200.0810677845147, "train/lr": 0.000353055665302672, "train/loss": 5.1745829582214355, "train/global_grad_norm": 0.2485460489988327}
61
+ {"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 6270.89233304502, "train/update_time": 3253.376109398436, "train/lr": 0.0003379662370102746, "train/loss": 5.1606316566467285, "train/global_grad_norm": 0.24228136241436005}
62
+ {"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 6324.271360302053, "train/update_time": 3306.647373120475, "train/lr": 0.00032303996298219405, "train/loss": 5.135626316070557, "train/global_grad_norm": 0.297547847032547}
63
+ {"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 6477.804279661039, "train/update_time": 3359.92066662648, "train/lr": 0.00030829187271113034, "train/loss": 5.145682334899902, "train/global_grad_norm": 0.2230217158794403}
64
+ {"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 6531.185974933032, "train/update_time": 3413.19108713849, "train/lr": 0.0002937368162738445, "train/loss": 5.152654647827148, "train/global_grad_norm": 0.2998616695404053}
65
+ {"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 6684.524030425004, "train/update_time": 3466.4617998044705, "train/lr": 0.0002793894493783894, "train/loss": 5.121702194213867, "train/global_grad_norm": 0.24779871106147766}
66
+ {"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 6737.915407613036, "train/update_time": 3519.7321525084553, "train/lr": 0.00026526421860705474, "train/loss": 5.143693447113037, "train/global_grad_norm": 0.25348708033561707}
67
+ {"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 6891.055202779011, "train/update_time": 3573.0133047814597, "train/lr": 0.0002513753468698824, "train/loss": 5.096343040466309, "train/global_grad_norm": 0.23598778247833252}
68
+ {"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 6944.4543808570015, "train/update_time": 3626.2884129853337, "train/lr": 0.00023773681908340283, "train/loss": 5.104409217834473, "train/global_grad_norm": 0.25248244404792786}
69
+ {"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 7097.769050646049, "train/update_time": 3679.5599112784257, "train/lr": 0.00022436236808900823, "train/loss": 5.114173889160156, "train/global_grad_norm": 0.1796308010816574}
70
+ {"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 7151.144483595039, "train/update_time": 3732.833151328552, "train/lr": 0.00021126546082514682, "train/loss": 5.100642681121826, "train/global_grad_norm": 0.21071678400039673}
71
+ {"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 7304.954160230001, "train/update_time": 3786.1120017025387, "train/lr": 0.00019845928476725522, "train/loss": 5.092477321624756, "train/global_grad_norm": 0.2642405927181244}
72
+ {"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 7358.337214609026, "train/update_time": 3839.3859579174896, "train/lr": 0.0001859567346490913, "train/loss": 5.1006293296813965, "train/global_grad_norm": 0.2527276873588562}
73
+ {"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 7511.621099121054, "train/update_time": 3892.6592194504337, "train/lr": 0.00017377039947882782, "train/loss": 5.108747959136963, "train/global_grad_norm": 0.18992801010608673}
74
+ {"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 7565.03086849401, "train/update_time": 3945.9412919793394, "train/lr": 0.00016191254986299043, "train/loss": 5.105123043060303, "train/global_grad_norm": 0.19203054904937744}
75
+ {"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 7718.234012908011, "train/update_time": 3999.225145033328, "train/lr": 0.00015039512565099468, "train/loss": 5.096941947937012, "train/global_grad_norm": 0.18585249781608582}
76
+ {"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 7771.636799781001, "train/update_time": 4052.503600837372, "train/lr": 0.00013922972391273224, "train/loss": 5.052731990814209, "train/global_grad_norm": 0.20207689702510834}
77
+ {"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 7924.894128009037, "train/update_time": 4105.784444000397, "train/lr": 0.00012842758726130281, "train/loss": 5.112724304199219, "train/global_grad_norm": 0.20703168213367462}
78
+ {"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 7978.290488699044, "train/update_time": 4159.058140857378, "train/lr": 0.00011799959253265679, "train/loss": 5.0365166664123535, "train/global_grad_norm": 0.19214113056659698}
79
+ {"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 8133.211715042999, "train/update_time": 4212.332782215439, "train/lr": 0.00010795623983354214, "train/loss": 5.071290016174316, "train/global_grad_norm": 0.18038234114646912}
80
+ {"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 8186.5969784220215, "train/update_time": 4265.60545003548, "train/lr": 9.830764196878872e-05, "train/loss": 5.038881301879883, "train/global_grad_norm": 0.21081118285655975}
81
+ {"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 8342.015340365004, "train/update_time": 4318.878572056361, "train/lr": 8.906351425856951e-05, "train/loss": 5.020392894744873, "train/global_grad_norm": 0.14397121965885162}
82
+ {"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 8395.40680388402, "train/update_time": 4372.159924876352, "train/lr": 8.02331647558977e-05, "train/loss": 5.033623695373535, "train/global_grad_norm": 0.20169667899608612}
83
+ {"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 8550.23874416505, "train/update_time": 4425.43352887634, "train/lr": 7.182548487420554e-05, "train/loss": 5.0619354248046875, "train/global_grad_norm": 0.1709950715303421}
84
+ {"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 8603.637448858004, "train/update_time": 4478.715910169412, "train/lr": 6.384894043444556e-05, "train/loss": 5.039224147796631, "train/global_grad_norm": 0.14815856516361237}
85
+ {"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 8756.833522661007, "train/update_time": 4531.9930339534185, "train/lr": 5.6311563140726166e-05, "train/loss": 5.098855495452881, "train/global_grad_norm": 0.14111174643039703}
86
+ {"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 8810.230298971, "train/update_time": 4585.266904477379, "train/lr": 4.922094249306547e-05, "train/loss": 5.075422286987305, "train/global_grad_norm": 0.1590386927127838}
87
+ {"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 8963.212115761009, "train/update_time": 4638.541050948435, "train/lr": 4.2584218145409916e-05, "train/loss": 5.018501281738281, "train/global_grad_norm": 0.14633402228355408}
88
+ {"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 9016.607044072007, "train/update_time": 4691.8174923404, "train/lr": 3.6408072716606236e-05, "train/loss": 5.05587911605835, "train/global_grad_norm": 0.1401192992925644}
89
+ {"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 9169.565102360037, "train/update_time": 4745.092234898475, "train/lr": 3.069872506157217e-05, "train/loss": 5.094274044036865, "train/global_grad_norm": 0.14165186882019043}
90
+ {"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 9222.955616571999, "train/update_time": 4798.3678903255495, "train/lr": 2.5461924009435368e-05, "train/loss": 5.02672815322876, "train/global_grad_norm": 0.1492566168308258}
91
+ {"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 9376.841395194002, "train/update_time": 4851.6424762834795, "train/lr": 2.0702942574950812e-05, "train/loss": 5.048681735992432, "train/global_grad_norm": 0.1326991319656372}
92
+ {"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 9430.23481100105, "train/update_time": 4904.91280556639, "train/lr": 1.642657264902142e-05, "train/loss": 5.089313983917236, "train/global_grad_norm": 0.1482771933078766}
93
+ {"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 9583.694742296997, "train/update_time": 4958.171936342376, "train/lr": 1.2637120173670358e-05, "train/loss": 5.060367584228516, "train/global_grad_norm": 0.13109560310840607}
94
+ {"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 9637.072636537021, "train/update_time": 5011.441395019239, "train/lr": 9.338400806321978e-06, "train/loss": 5.0208845138549805, "train/global_grad_norm": 0.12541547417640686}
95
+ {"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 9791.05162328505, "train/update_time": 5064.716082916246, "train/lr": 6.533736077758867e-06, "train/loss": 5.054846286773682, "train/global_grad_norm": 0.12279438227415085}
96
+ {"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 9844.444105764036, "train/update_time": 5117.988080174255, "train/lr": 4.2259500476214406e-06, "train/loss": 5.061497211456299, "train/global_grad_norm": 0.11868078261613846}
97
+ {"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 9997.457464576, "train/update_time": 5171.263283661159, "train/lr": 2.417366460819359e-06, "train/loss": 5.073247909545898, "train/global_grad_norm": 0.12584054470062256}
98
+ {"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 10050.842236216005, "train/update_time": 5224.530718925176, "train/lr": 1.1098064077174619e-06, "train/loss": 5.0577826499938965, "train/global_grad_norm": 0.11671043187379837}
metrics/jsonlines/train_eval.jsonl CHANGED
@@ -1,19 +1,19 @@
1
- {"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 529.2283039629692, "train_eval/train_update_time": 348.5810220290441, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.262740562529434, "train_eval/perplexity_len_2048": 3876.703904883719, "train_eval/loss_avg_len_1024": 8.26358847254669, "train_eval/perplexity_len_1024": 3879.992394933413, "train_eval/loss_avg_len_512": 8.264395577695833, "train_eval/perplexity_len_512": 3883.125220863903}
2
- {"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1054.376383124967, "train_eval/train_update_time": 693.2531012066174, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.399058190044452, "train_eval/perplexity_len_2048": 601.2784810600184, "train_eval/loss_avg_len_1024": 6.403318745188226, "train_eval/perplexity_len_1024": 603.8457262467264, "train_eval/loss_avg_len_512": 6.409655180920672, "train_eval/perplexity_len_512": 607.6841038572269}
3
- {"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1670.3140526569914, "train_eval/train_update_time": 1037.9416665838216, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.696208518195235, "train_eval/perplexity_len_2048": 297.73639610132165, "train_eval/loss_avg_len_1024": 5.702004268196542, "train_eval/perplexity_len_1024": 299.46701208382314, "train_eval/loss_avg_len_512": 5.713629163519217, "train_eval/perplexity_len_512": 302.96859810428947}
4
- {"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2196.1893687059637, "train_eval/train_update_time": 1382.727176492801, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.298926785588973, "train_eval/perplexity_len_2048": 200.12192095488965, "train_eval/loss_avg_len_1024": 5.3072640442429835, "train_eval/perplexity_len_1024": 201.79736376729673, "train_eval/loss_avg_len_512": 5.32238381281153, "train_eval/perplexity_len_512": 204.8716760832407}
5
- {"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2812.5629005369847, "train_eval/train_update_time": 1727.488953433698, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.045949606849063, "train_eval/perplexity_len_2048": 155.3917902663376, "train_eval/loss_avg_len_1024": 5.054079064009093, "train_eval/perplexity_len_1024": 156.66018988300135, "train_eval/loss_avg_len_512": 5.071243590088998, "train_eval/perplexity_len_512": 159.3723980930882}
6
- {"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3338.075166533992, "train_eval/train_update_time": 2072.2588375147316, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.879746701723743, "train_eval/perplexity_len_2048": 131.59732628792167, "train_eval/loss_avg_len_1024": 4.887840953422929, "train_eval/perplexity_len_1024": 132.66683074977718, "train_eval/loss_avg_len_512": 4.906784731852021, "train_eval/perplexity_len_512": 135.2039976855768}
7
- {"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3954.2480487469584, "train_eval/train_update_time": 2417.024196324637, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.75083749865229, "train_eval/perplexity_len_2048": 115.68112675666295, "train_eval/loss_avg_len_1024": 4.762147275177813, "train_eval/perplexity_len_1024": 116.99688086604426, "train_eval/loss_avg_len_512": 4.784011943052464, "train_eval/perplexity_len_512": 119.58314973081396}
8
- {"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4480.02484513697, "train_eval/train_update_time": 2761.798597707704, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.651610771730466, "train_eval/perplexity_len_2048": 104.75358386553785, "train_eval/loss_avg_len_1024": 4.662627696003729, "train_eval/perplexity_len_1024": 105.91402668452575, "train_eval/loss_avg_len_512": 4.68646468473402, "train_eval/perplexity_len_512": 108.46902894640158}
9
- {"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5096.086593801971, "train_eval/train_update_time": 3106.58031973982, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.568712306133184, "train_eval/perplexity_len_2048": 96.4198705230403, "train_eval/loss_avg_len_1024": 4.583387268669976, "train_eval/perplexity_len_1024": 97.84526171163627, "train_eval/loss_avg_len_512": 4.611602480377551, "train_eval/perplexity_len_512": 100.64530260218329}
10
- {"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5621.47650762595, "train_eval/train_update_time": 3451.396044731722, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.499271789527883, "train_eval/perplexity_len_2048": 89.95160374466079, "train_eval/loss_avg_len_1024": 4.5137719763946365, "train_eval/perplexity_len_1024": 91.26542108671948, "train_eval/loss_avg_len_512": 4.544290530496583, "train_eval/perplexity_len_512": 94.0936469499529}
11
- {"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6237.341601410008, "train_eval/train_update_time": 3796.2288036436657, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.433857150532786, "train_eval/perplexity_len_2048": 84.25577817268656, "train_eval/loss_avg_len_1024": 4.448194169824673, "train_eval/perplexity_len_1024": 85.4724558145983, "train_eval/loss_avg_len_512": 4.479190395291953, "train_eval/perplexity_len_512": 88.16326637857776}
12
- {"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6763.143532235001, "train_eval/train_update_time": 4141.075749416603, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.377449188591417, "train_eval/perplexity_len_2048": 79.63464115655327, "train_eval/loss_avg_len_1024": 4.392922887768108, "train_eval/perplexity_len_1024": 80.87646667714118, "train_eval/loss_avg_len_512": 4.428195524361727, "train_eval/perplexity_len_512": 83.7801012769142}
13
- {"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7379.584015795961, "train_eval/train_update_time": 4485.922716939414, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.331950723144054, "train_eval/perplexity_len_2048": 76.09257743684577, "train_eval/loss_avg_len_1024": 4.353910537918127, "train_eval/perplexity_len_1024": 77.7820385775537, "train_eval/loss_avg_len_512": 4.393551132318316, "train_eval/perplexity_len_512": 80.92729284052949}
14
- {"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7905.308589838969, "train_eval/train_update_time": 4830.788480303425, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.285456626634223, "train_eval/perplexity_len_2048": 72.63570654655291, "train_eval/loss_avg_len_1024": 4.308664352084252, "train_eval/perplexity_len_1024": 74.3411290261473, "train_eval/loss_avg_len_512": 4.353479764840158, "train_eval/perplexity_len_512": 77.74853938517812}
15
- {"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8521.302722692955, "train_eval/train_update_time": 5175.6192292046035, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.248288494520384, "train_eval/perplexity_len_2048": 69.9855291689276, "train_eval/loss_avg_len_1024": 4.272928099082565, "train_eval/perplexity_len_1024": 71.73136495915584, "train_eval/loss_avg_len_512": 4.320970644394619, "train_eval/perplexity_len_512": 75.26164514357997}
16
- {"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9046.829455698957, "train_eval/train_update_time": 5520.4718770905165, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.221004894153129, "train_eval/perplexity_len_2048": 68.10188510203145, "train_eval/loss_avg_len_1024": 4.248565303894974, "train_eval/perplexity_len_1024": 70.00490450100025, "train_eval/loss_avg_len_512": 4.299022679186128, "train_eval/perplexity_len_512": 73.6278005431706}
17
- {"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9662.829677159956, "train_eval/train_update_time": 5865.338317954331, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.193981613898331, "train_eval/perplexity_len_2048": 66.28619225565306, "train_eval/loss_avg_len_1024": 4.21717632021635, "train_eval/perplexity_len_1024": 67.84165048130349, "train_eval/loss_avg_len_512": 4.269808194869183, "train_eval/perplexity_len_512": 71.50791871807111}
18
- {"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 10188.515749696002, "train_eval/train_update_time": 6210.206845312321, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.1873091592156015, "train_eval/perplexity_len_2048": 65.84537295065353, "train_eval/loss_avg_len_1024": 4.213890584899655, "train_eval/perplexity_len_1024": 67.61910658476074, "train_eval/loss_avg_len_512": 4.267942122658933, "train_eval/perplexity_len_512": 71.37460420407933}
19
- {"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 10804.645974584972, "train_eval/train_update_time": 6555.09181263647, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.174864357211755, "train_eval/perplexity_len_2048": 65.03101807412418, "train_eval/loss_avg_len_1024": 4.204799810601998, "train_eval/perplexity_len_1024": 67.00718219606644, "train_eval/loss_avg_len_512": 4.25834744757838, "train_eval/perplexity_len_512": 70.69306287994196}
 
1
+ {"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 470.2395834400086, "train_eval/train_update_time": 270.0089025082416, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.520986258850897, "train_eval/perplexity_len_2048": 5019.001351094721, "train_eval/loss_avg_len_1024": 8.522497836221008, "train_eval/perplexity_len_1024": 5026.593696720888, "train_eval/loss_avg_len_512": 8.524103445280343, "train_eval/perplexity_len_512": 5034.6709237971745}
2
+ {"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 937.4650811910396, "train_eval/train_update_time": 536.3882422860479, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.966422270688636, "train_eval/perplexity_len_2048": 1060.4220525980056, "train_eval/loss_avg_len_1024": 6.9722344631998565, "train_eval/perplexity_len_1024": 1066.6033758299184, "train_eval/loss_avg_len_512": 6.982165958659024, "train_eval/perplexity_len_512": 1077.249118985535}
3
+ {"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1504.7933711430524, "train_eval/train_update_time": 802.7697926640394, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.408694662143243, "train_eval/perplexity_len_2048": 607.100692098556, "train_eval/loss_avg_len_1024": 6.417305716021801, "train_eval/perplexity_len_1024": 612.3510419477873, "train_eval/loss_avg_len_512": 6.434781106839073, "train_eval/perplexity_len_512": 623.1461654998286}
4
+ {"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1970.7506705410196, "train_eval/train_update_time": 1069.132912081608, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.078357431389886, "train_eval/perplexity_len_2048": 436.31193351807616, "train_eval/loss_avg_len_1024": 6.088771626625675, "train_eval/perplexity_len_1024": 440.87951373197876, "train_eval/loss_avg_len_512": 6.107667324735084, "train_eval/perplexity_len_512": 449.28944546444586}
5
+ {"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2537.098761650035, "train_eval/train_update_time": 1335.4923671315191, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.844426090481538, "train_eval/perplexity_len_2048": 345.3043114722714, "train_eval/loss_avg_len_1024": 5.8543462534093855, "train_eval/perplexity_len_1024": 348.746833459867, "train_eval/loss_avg_len_512": 5.874322384678671, "train_eval/perplexity_len_512": 355.78349462406976}
6
+ {"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3003.058338998002, "train_eval/train_update_time": 1601.8528411513544, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.678305028288451, "train_eval/perplexity_len_2048": 292.4533094997349, "train_eval/loss_avg_len_1024": 5.688276190591132, "train_eval/perplexity_len_1024": 295.38399580743766, "train_eval/loss_avg_len_512": 5.709491532435932, "train_eval/perplexity_len_512": 301.71761565218657}
7
+ {"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3570.6092982320115, "train_eval/train_update_time": 1868.2295263125561, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.536149771192977, "train_eval/perplexity_len_2048": 253.69931617483144, "train_eval/loss_avg_len_1024": 5.548100212041172, "train_eval/perplexity_len_1024": 256.74932301083805, "train_eval/loss_avg_len_512": 5.5710370129648075, "train_eval/perplexity_len_512": 262.7063879626648}
8
+ {"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4040.0474286440294, "train_eval/train_update_time": 2134.611642122676, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.429012276458179, "train_eval/perplexity_len_2048": 227.92400829756792, "train_eval/loss_avg_len_1024": 5.44054473506887, "train_eval/perplexity_len_1024": 230.56774759580898, "train_eval/loss_avg_len_512": 5.463790042017644, "train_eval/perplexity_len_512": 235.9901441359267}
9
+ {"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4608.291631601052, "train_eval/train_update_time": 2400.977511668694, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.341378729817461, "train_eval/perplexity_len_2048": 208.8003912516811, "train_eval/loss_avg_len_1024": 5.3547878493463585, "train_eval/perplexity_len_1024": 211.61907646489718, "train_eval/loss_avg_len_512": 5.3801865651574925, "train_eval/perplexity_len_512": 217.0627679970756}
10
+ {"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5077.749329403043, "train_eval/train_update_time": 2667.341171991662, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.274126975246473, "train_eval/perplexity_len_2048": 195.21997022888468, "train_eval/loss_avg_len_1024": 5.286933790515396, "train_eval/perplexity_len_1024": 197.73619434175876, "train_eval/loss_avg_len_512": 5.313358115418087, "train_eval/perplexity_len_512": 203.0308860337738}
11
+ {"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5649.622147237009, "train_eval/train_update_time": 2933.705731303431, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.216920477526092, "train_eval/perplexity_len_2048": 184.36555110302936, "train_eval/loss_avg_len_1024": 5.227359136996128, "train_eval/perplexity_len_1024": 186.30016010416225, "train_eval/loss_avg_len_512": 5.251418407165692, "train_eval/perplexity_len_512": 190.83676080814456}
12
+ {"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6117.130997166038, "train_eval/train_update_time": 3200.0810677845147, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.171960423406071, "train_eval/perplexity_len_2048": 176.26004331317833, "train_eval/loss_avg_len_1024": 5.182277223829406, "train_eval/perplexity_len_1024": 178.0878955837198, "train_eval/loss_avg_len_512": 5.207336967919982, "train_eval/perplexity_len_512": 182.60712149488162}
13
+ {"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6684.524030425004, "train_eval/train_update_time": 3466.4617998044705, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.14503268874938, "train_eval/perplexity_len_2048": 171.57709321999303, "train_eval/loss_avg_len_1024": 5.159728246359883, "train_eval/perplexity_len_1024": 174.11713221076468, "train_eval/loss_avg_len_512": 5.184838373988023, "train_eval/perplexity_len_512": 178.54459000759974}
14
+ {"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7151.144483595039, "train_eval/train_update_time": 3732.833151328552, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.112421451262962, "train_eval/perplexity_len_2048": 166.07200365399407, "train_eval/loss_avg_len_1024": 5.126486331900814, "train_eval/perplexity_len_1024": 168.4242900992313, "train_eval/loss_avg_len_512": 5.1545387507501434, "train_eval/perplexity_len_512": 173.21589262959722}
15
+ {"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7718.234012908011, "train_eval/train_update_time": 3999.225145033328, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.092018521135159, "train_eval/perplexity_len_2048": 162.71798046774887, "train_eval/loss_avg_len_1024": 5.106942968778494, "train_eval/perplexity_len_1024": 165.16466876932896, "train_eval/loss_avg_len_512": 5.13422550390591, "train_eval/perplexity_len_512": 169.73281155532834}
16
+ {"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8186.5969784220215, "train_eval/train_update_time": 4265.60545003548, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.076504996041095, "train_eval/perplexity_len_2048": 160.21313075547536, "train_eval/loss_avg_len_1024": 5.0923651870700635, "train_eval/perplexity_len_1024": 162.77439902720477, "train_eval/loss_avg_len_512": 5.119650275185922, "train_eval/perplexity_len_512": 167.27685852206267}
17
+ {"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8756.833522661007, "train_eval/train_update_time": 4531.9930339534185, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.060602921242007, "train_eval/perplexity_len_2048": 157.68555964213598, "train_eval/loss_avg_len_1024": 5.0714183489211795, "train_eval/perplexity_len_1024": 159.40025226107093, "train_eval/loss_avg_len_512": 5.098776889980873, "train_eval/perplexity_len_512": 163.82141309995419}
18
+ {"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9222.955616571999, "train_eval/train_update_time": 4798.3678903255495, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.060850694276305, "train_eval/perplexity_len_2048": 157.7246347123879, "train_eval/loss_avg_len_1024": 5.075187498527157, "train_eval/perplexity_len_1024": 160.00218934191147, "train_eval/loss_avg_len_512": 5.102414626430763, "train_eval/perplexity_len_512": 164.4184374759526}
19
+ {"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9791.05162328505, "train_eval/train_update_time": 5064.716082916246, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.051834876534377, "train_eval/perplexity_len_2048": 156.3090092520558, "train_eval/loss_avg_len_1024": 5.0681735769458465, "train_eval/perplexity_len_1024": 158.88387300919587, "train_eval/loss_avg_len_512": 5.094969020260178, "train_eval/perplexity_len_512": 163.19878869005936}
metrics/jsonlines/val.jsonl CHANGED
@@ -1,49 +1,49 @@
1
- {"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 141.98572698398493, "val/train_update_time": 141.70471469813492, "val/loss": 8.017322055562005, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.0004004840157, "val/val_tokens_per_second": 455109.0859565075, "val/loss_avg_len_2048": 8.017322055562005, "val/perplexity_len_2048": 3033.044124021049, "val/loss_avg_len_1024": 8.01611577590569, "val/perplexity_len_1024": 3029.387630417841, "val/loss_avg_len_512": 8.016580909416453, "val/perplexity_len_512": 3030.7970278754246}
2
- {"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 370.1664184979745, "val/train_update_time": 279.6480940769543, "val/loss": 7.168812248389213, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.01553106895881, "val/val_tokens_per_second": 455032.587305645, "val/loss_avg_len_2048": 7.168812248389213, "val/perplexity_len_2048": 1298.301626800787, "val/loss_avg_len_1024": 7.169238402332319, "val/perplexity_len_1024": 1298.8550210655874, "val/loss_avg_len_512": 7.172547295653354, "val/perplexity_len_512": 1303.1599120545325}
3
- {"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 598.2814449759899, "val/train_update_time": 417.52457091695396, "val/loss": 6.6822778238516545, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.9535074569867, "val/val_tokens_per_second": 455346.3356566274, "val/loss_avg_len_2048": 6.6822778238516545, "val/perplexity_len_2048": 798.13505409571, "val/loss_avg_len_1024": 6.683723994776049, "val/perplexity_len_1024": 799.2901288211134, "val/loss_avg_len_512": 6.689726676506551, "val/perplexity_len_512": 804.102442017171}
4
- {"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 826.3311064429581, "val/train_update_time": 555.3910710238852, "val/loss": 6.252672681268258, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.92056709702592, "val/val_tokens_per_second": 455513.1414574312, "val/loss_avg_len_2048": 6.252672681268258, "val/perplexity_len_2048": 519.3991596320534, "val/loss_avg_len_1024": 6.255661408025445, "val/perplexity_len_1024": 520.9538238741642, "val/loss_avg_len_512": 6.264640129434224, "val/perplexity_len_512": 525.6523850962309}
5
- {"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 1054.376383124967, "val/train_update_time": 693.2531012066174, "val/loss": 5.959701583172218, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.12631553603569, "val/val_tokens_per_second": 454473.2551906301, "val/loss_avg_len_2048": 5.959701583172218, "val/perplexity_len_2048": 387.4944721112208, "val/loss_avg_len_1024": 5.963742249931395, "val/perplexity_len_1024": 389.06337571774895, "val/loss_avg_len_512": 5.974755134170968, "val/perplexity_len_512": 393.37176599339256}
6
- {"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 1283.0319452389958, "val/train_update_time": 831.1140817146515, "val/loss": 5.738160552069335, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.04604214400752, "val/val_tokens_per_second": 454878.40469983226, "val/loss_avg_len_2048": 5.738160552069335, "val/perplexity_len_2048": 310.4927501256864, "val/loss_avg_len_1024": 5.742955871155067, "val/perplexity_len_1024": 311.9852375530798, "val/loss_avg_len_512": 5.7549814947385345, "val/perplexity_len_512": 315.75970425425226}
7
- {"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 1511.2121625279542, "val/train_update_time": 969.0007961746887, "val/loss": 5.5474918597602985, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.03976762795355, "val/val_tokens_per_second": 454910.1033806272, "val/loss_avg_len_2048": 5.5474918597602985, "val/perplexity_len_2048": 256.5931764754379, "val/loss_avg_len_1024": 5.553597456831346, "val/perplexity_len_1024": 258.1646234514658, "val/loss_avg_len_512": 5.567558555799723, "val/perplexity_len_512": 261.79416253348427}
8
- {"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 1739.393865599006, "val/train_update_time": 1106.8950568859, "val/loss": 5.394616223489982, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.06370893697022, "val/val_tokens_per_second": 454789.17627815285, "val/loss_avg_len_2048": 5.394616223489982, "val/perplexity_len_2048": 220.21761652879357, "val/loss_avg_len_1024": 5.4015997172784065, "val/perplexity_len_1024": 221.7608873249335, "val/loss_avg_len_512": 5.416529589198996, "val/perplexity_len_512": 225.0965877994106}
9
- {"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 1967.6012063919916, "val/train_update_time": 1244.7901415458764, "val/loss": 5.258616862811218, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.39274938497692, "val/val_tokens_per_second": 453133.6891364371, "val/loss_avg_len_2048": 5.258616862811218, "val/perplexity_len_2048": 192.21544701876132, "val/loss_avg_len_1024": 5.26661902688928, "val/perplexity_len_1024": 193.7597572352443, "val/loss_avg_len_512": 5.283242479863857, "val/perplexity_len_512": 197.00763411971045}
10
- {"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 2196.1893687059637, "val/train_update_time": 1382.727176492801, "val/loss": 5.150326416279446, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.44550614495529, "val/val_tokens_per_second": 452869.37677538325, "val/loss_avg_len_2048": 5.150326416279446, "val/perplexity_len_2048": 172.4877839494621, "val/loss_avg_len_1024": 5.159069892106625, "val/perplexity_len_1024": 174.00253918175162, "val/loss_avg_len_512": 5.1770923202755865, "val/perplexity_len_512": 177.16691667765645}
11
- {"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 2425.217230288952, "val/train_update_time": 1520.627368493646, "val/loss": 5.060396130196401, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.14630079700146, "val/val_tokens_per_second": 454372.49934677803, "val/loss_avg_len_2048": 5.060396130196401, "val/perplexity_len_2048": 157.65295505166543, "val/loss_avg_len_1024": 5.069972552723344, "val/perplexity_len_1024": 159.16995850145472, "val/loss_avg_len_512": 5.089300132444315, "val/perplexity_len_512": 162.27625041885165}
12
- {"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 2653.526103938988, "val/train_update_time": 1658.537395758729, "val/loss": 4.9848958040447675, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.9463361579692, "val/val_tokens_per_second": 455382.63980050915, "val/loss_avg_len_2048": 4.9848958040447675, "val/perplexity_len_2048": 146.18834199862263, "val/loss_avg_len_1024": 4.9951241761227605, "val/perplexity_len_1024": 147.69128397585118, "val/loss_avg_len_512": 5.015488221790642, "val/perplexity_len_512": 150.72970833293965}
13
- {"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 2881.6501679039793, "val/train_update_time": 1796.4439407095779, "val/loss": 4.91457557297845, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.10853785695508, "val/val_tokens_per_second": 454562.9190546063, "val/loss_avg_len_2048": 4.91457557297845, "val/perplexity_len_2048": 136.26146448033256, "val/loss_avg_len_1024": 4.925280669786549, "val/perplexity_len_1024": 137.72799230477034, "val/loss_avg_len_512": 4.946453812780138, "val/perplexity_len_512": 140.67521768921586}
14
- {"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 3109.932256244996, "val/train_update_time": 1934.345054808713, "val/loss": 4.858456944823754, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.96858381299535, "val/val_tokens_per_second": 455270.03164946573, "val/loss_avg_len_2048": 4.858456944823754, "val/perplexity_len_2048": 128.8252641700353, "val/loss_avg_len_1024": 4.869785673670238, "val/perplexity_len_1024": 130.29298868563157, "val/loss_avg_len_512": 4.8919036218861125, "val/perplexity_len_512": 133.20690843461975}
15
- {"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 3338.075166533992, "val/train_update_time": 2072.2588375147316, "val/loss": 4.805626461642539, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.97877326997695, "val/val_tokens_per_second": 455218.47555202275, "val/loss_avg_len_2048": 4.805626461642539, "val/perplexity_len_2048": 122.1960181662709, "val/loss_avg_len_1024": 4.817329806529312, "val/perplexity_len_1024": 123.63452154215173, "val/loss_avg_len_512": 4.840296959608327, "val/perplexity_len_512": 126.50691359617247}
16
- {"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 3566.6879564279807, "val/train_update_time": 2210.160061070579, "val/loss": 4.758956886007242, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.99822524102638, "val/val_tokens_per_second": 455120.0858717386, "val/loss_avg_len_2048": 4.758956886007242, "val/perplexity_len_2048": 116.62421008285914, "val/loss_avg_len_1024": 4.771169604997524, "val/perplexity_len_1024": 118.05724158216387, "val/loss_avg_len_512": 4.794990886492469, "val/perplexity_len_512": 120.90327992508134}
17
- {"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 3794.8868216549745, "val/train_update_time": 2348.075175291684, "val/loss": 4.716963442835352, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.26495351700578, "val/val_tokens_per_second": 453775.22952231066, "val/loss_avg_len_2048": 4.716963442835352, "val/perplexity_len_2048": 111.82816396723561, "val/loss_avg_len_1024": 4.72980757133565, "val/perplexity_len_1024": 113.27376313238463, "val/loss_avg_len_512": 4.75441891277507, "val/perplexity_len_512": 116.0961715577927}
18
- {"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 4023.3350287670037, "val/train_update_time": 2485.9809508775943, "val/loss": 4.678447687354149, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.28331762104062, "val/val_tokens_per_second": 453682.9292420046, "val/loss_avg_len_2048": 4.678447687354149, "val/perplexity_len_2048": 107.60290950314338, "val/loss_avg_len_1024": 4.692047001610324, "val/perplexity_len_1024": 109.07623065660846, "val/loss_avg_len_512": 4.717900191090349, "val/perplexity_len_512": 111.93296788447095}
19
- {"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 4251.801609379996, "val/train_update_time": 2623.8855139956577, "val/loss": 4.638916808897607, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.0385509430198, "val/val_tokens_per_second": 454916.25055051385, "val/loss_avg_len_2048": 4.638916808897607, "val/perplexity_len_2048": 103.43224998979402, "val/loss_avg_len_1024": 4.652840973052616, "val/perplexity_len_1024": 104.88253116221422, "val/loss_avg_len_512": 4.679233000243456, "val/perplexity_len_512": 107.68744464382641}
20
- {"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 4480.02484513697, "val/train_update_time": 2761.798597707704, "val/loss": 4.607533364184201, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.03529246797552, "val/val_tokens_per_second": 454932.71446381963, "val/loss_avg_len_2048": 4.607533364184201, "val/perplexity_len_2048": 100.2365972702575, "val/loss_avg_len_1024": 4.622161411953904, "val/perplexity_len_1024": 101.71363978965609, "val/loss_avg_len_512": 4.649712271768321, "val/perplexity_len_512": 104.55489785290531}
21
- {"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 4708.671424973989, "val/train_update_time": 2899.7122086867457, "val/loss": 4.5756238960157845, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.96249285701197, "val/val_tokens_per_second": 455300.85593673546, "val/loss_avg_len_2048": 4.5756238960157845, "val/perplexity_len_2048": 97.08859343179769, "val/loss_avg_len_1024": 4.590941229291865, "val/perplexity_len_1024": 98.58717964711914, "val/loss_avg_len_512": 4.6193589118688365, "val/perplexity_len_512": 101.42898636242586}
22
- {"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 4936.824467270984, "val/train_update_time": 3037.629162015859, "val/loss": 4.546358786865138, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.17431224201573, "val/val_tokens_per_second": 454231.3546020608, "val/loss_avg_len_2048": 4.546358786865138, "val/perplexity_len_2048": 94.28845812477917, "val/loss_avg_len_1024": 4.562232828937704, "val/perplexity_len_1024": 95.79713983415002, "val/loss_avg_len_512": 4.591761249877047, "val/perplexity_len_512": 98.66805631960091}
23
- {"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 5165.18791128695, "val/train_update_time": 3175.5471031158813, "val/loss": 4.517468703701883, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.98698865802726, "val/val_tokens_per_second": 455176.91625017143, "val/loss_avg_len_2048": 4.517468703701883, "val/perplexity_len_2048": 91.60342883737046, "val/loss_avg_len_1024": 4.533819358161278, "val/perplexity_len_1024": 93.11351666917558, "val/loss_avg_len_512": 4.564258050953411, "val/perplexity_len_512": 95.99134690061845}
24
- {"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 5393.351182107988, "val/train_update_time": 3313.452428144694, "val/loss": 4.492371192065021, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.90288911398966, "val/val_tokens_per_second": 455602.71092140325, "val/loss_avg_len_2048": 4.492371192065021, "val/perplexity_len_2048": 89.33302068707025, "val/loss_avg_len_1024": 4.509291414985387, "val/perplexity_len_1024": 90.85741549263298, "val/loss_avg_len_512": 4.540741394605115, "val/perplexity_len_512": 93.76028772891796}
25
- {"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 5621.47650762595, "val/train_update_time": 3451.396044731722, "val/loss": 4.466965344157769, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.90857521002181, "val/val_tokens_per_second": 455573.8971985658, "val/loss_avg_len_2048": 4.466965344157769, "val/perplexity_len_2048": 87.0920272569869, "val/loss_avg_len_1024": 4.484603051171778, "val/perplexity_len_1024": 88.64175758666217, "val/loss_avg_len_512": 4.517253230262828, "val/perplexity_len_512": 91.58369285789568}
26
- {"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 5850.004313221958, "val/train_update_time": 3589.3208626466803, "val/loss": 4.443324041806534, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.94098925299477, "val/val_tokens_per_second": 455409.71185878024, "val/loss_avg_len_2048": 4.443324041806534, "val/perplexity_len_2048": 85.05720599339685, "val/loss_avg_len_1024": 4.461723275988643, "val/perplexity_len_1024": 86.63667943799094, "val/loss_avg_len_512": 4.495576438648999, "val/perplexity_len_512": 89.61981442325171}
27
- {"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 6078.168775815982, "val/train_update_time": 3727.2589657856734, "val/loss": 4.420480709320446, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.0520894320216, "val/val_tokens_per_second": 454847.8581490308, "val/loss_avg_len_2048": 4.420480709320446, "val/perplexity_len_2048": 83.1362401197537, "val/loss_avg_len_1024": 4.4397112005579284, "val/perplexity_len_1024": 84.75046225296263, "val/loss_avg_len_512": 4.474702652350906, "val/perplexity_len_512": 87.76849877254239}
28
- {"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 6306.462939933001, "val/train_update_time": 3865.200992291735, "val/loss": 4.397374363187957, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.00921994104283, "val/val_tokens_per_second": 455064.4925800859, "val/loss_avg_len_2048": 4.397374363187957, "val/perplexity_len_2048": 81.23728878013355, "val/loss_avg_len_1024": 4.4172204922693314, "val/perplexity_len_1024": 82.86563928056967, "val/loss_avg_len_512": 4.4534774074878545, "val/perplexity_len_512": 85.92522209509893}
29
- {"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 6534.709841756965, "val/train_update_time": 4003.142038117745, "val/loss": 4.375490610505734, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.2145028290106, "val/val_tokens_per_second": 454028.994402753, "val/loss_avg_len_2048": 4.375490610505734, "val/perplexity_len_2048": 79.47882313421499, "val/loss_avg_len_1024": 4.396335411125421, "val/perplexity_len_1024": 81.15293096085392, "val/loss_avg_len_512": 4.434476591604389, "val/perplexity_len_512": 84.30798583032089}
30
- {"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 6763.143532235001, "val/train_update_time": 4141.075749416603, "val/loss": 4.3550234247615105, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.9800370450248, "val/val_tokens_per_second": 455212.08198107505, "val/loss_avg_len_2048": 4.3550234247615105, "val/perplexity_len_2048": 77.86864937000485, "val/loss_avg_len_1024": 4.376727943733195, "val/perplexity_len_1024": 79.5772257888411, "val/loss_avg_len_512": 4.416302177713905, "val/perplexity_len_512": 82.78957748754466}
31
- {"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 6991.763516802981, "val/train_update_time": 4279.012207661464, "val/loss": 4.3350345812852265, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.01144512498286, "val/val_tokens_per_second": 455053.2428751271, "val/loss_avg_len_2048": 4.3350345812852265, "val/perplexity_len_2048": 76.32759835057868, "val/loss_avg_len_1024": 4.357916226750007, "val/perplexity_len_1024": 78.09423408238275, "val/loss_avg_len_512": 4.399376267791725, "val/perplexity_len_512": 81.40008097539523}
32
- {"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 7219.974368761992, "val/train_update_time": 4416.951972602401, "val/loss": 4.3158074309751395, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.51169959100662, "val/val_tokens_per_second": 452538.18219175114, "val/loss_avg_len_2048": 4.3158074309751395, "val/perplexity_len_2048": 74.87405467461461, "val/loss_avg_len_1024": 4.339516162476549, "val/perplexity_len_1024": 76.67043432955377, "val/loss_avg_len_512": 4.382426561977342, "val/perplexity_len_512": 80.03200058280369}
33
- {"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 7448.697525598982, "val/train_update_time": 4554.898137903423, "val/loss": 4.298591563395016, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.11004326399416, "val/val_tokens_per_second": 454555.3249819229, "val/loss_avg_len_2048": 4.298591563395016, "val/perplexity_len_2048": 73.59606527697034, "val/loss_avg_len_1024": 4.323124243415986, "val/perplexity_len_1024": 75.42390320559682, "val/loss_avg_len_512": 4.367459438038338, "val/perplexity_len_512": 78.8430713314537}
34
- {"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 7677.034741098003, "val/train_update_time": 4692.8429151014425, "val/loss": 4.281831383403647, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.02878875797614, "val/val_tokens_per_second": 454965.5789562217, "val/loss_avg_len_2048": 4.281831383403647, "val/perplexity_len_2048": 72.37286117017557, "val/loss_avg_len_1024": 4.307208411879139, "val/perplexity_len_1024": 74.23297154202513, "val/loss_avg_len_512": 4.353032884626277, "val/perplexity_len_512": 77.71380286337892}
35
- {"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 7905.308589838969, "val/train_update_time": 4830.788480303425, "val/loss": 4.26693708552008, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.98624766798457, "val/val_tokens_per_second": 455180.6643958197, "val/loss_avg_len_2048": 4.26693708552008, "val/perplexity_len_2048": 71.30290611174128, "val/loss_avg_len_1024": 4.293061920919362, "val/perplexity_len_1024": 73.19022844875921, "val/loss_avg_len_512": 4.340169534767326, "val/perplexity_len_512": 76.72054503555724}
36
- {"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 8133.938365876966, "val/train_update_time": 4968.716519999434, "val/loss": 4.252917671704688, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.9889751879964, "val/val_tokens_per_second": 455166.86810167873, "val/loss_avg_len_2048": 4.252917671704688, "val/perplexity_len_2048": 70.31025561202095, "val/loss_avg_len_1024": 4.2801831485737125, "val/perplexity_len_1024": 72.25367195254603, "val/loss_avg_len_512": 4.329094496766571, "val/perplexity_len_512": 75.87554989747439}
37
- {"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 8362.121594669996, "val/train_update_time": 5106.633266707533, "val/loss": 4.240213508293801, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.04761916899588, "val/val_tokens_per_second": 454870.4383080775, "val/loss_avg_len_2048": 4.240213508293801, "val/perplexity_len_2048": 69.42267257289618, "val/loss_avg_len_1024": 4.268120715982979, "val/perplexity_len_1024": 71.3873523702314, "val/loss_avg_len_512": 4.318212201548182, "val/perplexity_len_512": 75.05432626701605}
38
- {"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 8590.419437660952, "val/train_update_time": 5244.595247996622, "val/loss": 4.229185012385133, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.9895013620262, "val/val_tokens_per_second": 455164.2067136102, "val/loss_avg_len_2048": 4.229185012385133, "val/perplexity_len_2048": 68.66125129565661, "val/loss_avg_len_1024": 4.2580083735182885, "val/perplexity_len_1024": 70.66909675946545, "val/loss_avg_len_512": 4.309627850557212, "val/perplexity_len_512": 74.41279110805154}
39
- {"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 8818.629101625993, "val/train_update_time": 5382.53907933255, "val/loss": 4.219223060651217, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.98893610399682, "val/val_tokens_per_second": 455167.06578977744, "val/loss_avg_len_2048": 4.219223060651217, "val/perplexity_len_2048": 67.98064692681132, "val/loss_avg_len_1024": 4.248455268475786, "val/perplexity_len_1024": 69.9972019057752, "val/loss_avg_len_512": 4.300836350739189, "val/perplexity_len_512": 73.76145835958208}
40
- {"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 9046.829455698957, "val/train_update_time": 5520.4718770905165, "val/loss": 4.21085463935493, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.0832858220092, "val/val_tokens_per_second": 454690.3415682538, "val/loss_avg_len_2048": 4.21085463935493, "val/perplexity_len_2048": 67.4141299657516, "val/loss_avg_len_1024": 4.240732025450281, "val/perplexity_len_1024": 69.45867875377851, "val/loss_avg_len_512": 4.294237298525683, "val/perplexity_len_512": 73.27630518070133}
41
- {"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 9275.580202061974, "val/train_update_time": 5658.411268384545, "val/loss": 4.203232686660252, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.94337259500753, "val/val_tokens_per_second": 455397.644298181, "val/loss_avg_len_2048": 4.203232686660252, "val/perplexity_len_2048": 66.90225587432732, "val/loss_avg_len_1024": 4.233364781867806, "val/perplexity_len_1024": 68.94884010724084, "val/loss_avg_len_512": 4.287355207336601, "val/perplexity_len_512": 72.77374229184319}
42
- {"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 9503.765275373997, "val/train_update_time": 5796.35883763741, "val/loss": 4.197044236282748, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.93833985400852, "val/val_tokens_per_second": 455423.127294632, "val/loss_avg_len_2048": 4.197044236282748, "val/perplexity_len_2048": 66.48951302027224, "val/loss_avg_len_1024": 4.227541121233999, "val/perplexity_len_1024": 68.5484723957413, "val/loss_avg_len_512": 4.282135505998321, "val/perplexity_len_512": 72.3948747397409}
43
- {"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 9731.945241109002, "val/train_update_time": 5934.31139906036, "val/loss": 4.192076116132666, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.10962512000697, "val/val_tokens_per_second": 454557.434296835, "val/loss_avg_len_2048": 4.192076116132666, "val/perplexity_len_2048": 66.1600043280041, "val/loss_avg_len_1024": 4.22289962515519, "val/perplexity_len_1024": 68.23104217534896, "val/loss_avg_len_512": 4.278101034266222, "val/perplexity_len_512": 72.10338805682221}
44
- {"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 9960.290040618973, "val/train_update_time": 6072.254921466229, "val/loss": 4.188129460239923, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.98181012499845, "val/val_tokens_per_second": 455203.1120856573, "val/loss_avg_len_2048": 4.188129460239923, "val/perplexity_len_2048": 65.89940813706113, "val/loss_avg_len_1024": 4.219077997006941, "val/perplexity_len_1024": 67.97078612166737, "val/loss_avg_len_512": 4.274576161771455, "val/perplexity_len_512": 71.84968021406696}
45
- {"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 10188.515749696002, "val/train_update_time": 6210.206845312321, "val/loss": 4.185181051009335, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.98875552398385, "val/val_tokens_per_second": 455167.9791713901, "val/loss_avg_len_2048": 4.185181051009335, "val/perplexity_len_2048": 65.70539586814938, "val/loss_avg_len_1024": 4.216392289251415, "val/perplexity_len_1024": 67.78848137245132, "val/loss_avg_len_512": 4.2723089719016105, "val/perplexity_len_512": 71.68696786652427}
46
- {"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 10417.185683044954, "val/train_update_time": 6348.16824121651, "val/loss": 4.183063325667009, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.06988018500851, "val/val_tokens_per_second": 454758.01584132115, "val/loss_avg_len_2048": 4.183063325667009, "val/perplexity_len_2048": 65.56639711872695, "val/loss_avg_len_1024": 4.214440592087014, "val/perplexity_len_1024": 67.65630780891767, "val/loss_avg_len_512": 4.270656678032875, "val/perplexity_len_512": 71.56861773057756}
47
- {"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 10645.485156638955, "val/train_update_time": 6486.112916803453, "val/loss": 4.18173328043411, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.03489440103294, "val/val_tokens_per_second": 454934.7258359208, "val/loss_avg_len_2048": 4.18173328043411, "val/perplexity_len_2048": 65.47924881324224, "val/loss_avg_len_1024": 4.21317458131751, "val/perplexity_len_1024": 67.57070839093234, "val/loss_avg_len_512": 4.269501390692499, "val/perplexity_len_512": 71.48598315507563}
48
- {"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 10873.77756452799, "val/train_update_time": 6624.0805201563635, "val/loss": 4.1809452712302795, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.02757424901938, "val/val_tokens_per_second": 454971.7166287656, "val/loss_avg_len_2048": 4.1809452712302795, "val/perplexity_len_2048": 65.42767088712672, "val/loss_avg_len_1024": 4.212386983394856, "val/perplexity_len_1024": 67.51751079327998, "val/loss_avg_len_512": 4.268753702273779, "val/perplexity_len_512": 71.43255389008019}
49
- {"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 11102.099602014001, "val/train_update_time": 6762.03791546938, "val/loss": 4.18061518369345, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.24074817897053, "val/val_tokens_per_second": 453896.94596465246, "val/loss_avg_len_2048": 4.18061518369345, "val/perplexity_len_2048": 65.4060775924379, "val/loss_avg_len_1024": 4.212110854971316, "val/perplexity_len_1024": 67.49886986322706, "val/loss_avg_len_512": 4.2685156018145385, "val/perplexity_len_512": 71.4155477908546}
 
1
+ {"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 110.51794190204237, "val/train_update_time": 110.19485108501976, "val/loss": 8.072670181155205, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.9716204389697, "val/val_tokens_per_second": 409716.2756805078, "val/loss_avg_len_2048": 8.072670181155205, "val/perplexity_len_2048": 3205.6500777398833, "val/loss_avg_len_1024": 8.071338223218918, "val/perplexity_len_1024": 3201.383129006829, "val/loss_avg_len_512": 8.071987850761413, "val/perplexity_len_512": 3203.463511325176}
2
+ {"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 317.24918574804906, "val/train_update_time": 216.73109497816768, "val/loss": 7.519811360669136, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.60113897896372, "val/val_tokens_per_second": 411240.27716842643, "val/loss_avg_len_2048": 7.519811360669136, "val/perplexity_len_2048": 1844.219368930416, "val/loss_avg_len_1024": 7.520807962584495, "val/perplexity_len_1024": 1846.058237643676, "val/loss_avg_len_512": 7.525793135547638, "val/perplexity_len_512": 1855.2841344973567}
3
+ {"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 523.6280801940011, "val/train_update_time": 323.28231063415296, "val/loss": 7.164581921425462, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.44210483605275, "val/val_tokens_per_second": 411897.9587925008, "val/loss_avg_len_2048": 7.164581921425462, "val/perplexity_len_2048": 1292.8209870442263, "val/loss_avg_len_1024": 7.166963593649864, "val/perplexity_len_1024": 1295.9037324675646, "val/loss_avg_len_512": 7.175143612968922, "val/perplexity_len_512": 1306.5477247144242}
4
+ {"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 729.8428020050051, "val/train_update_time": 429.83724463917315, "val/loss": 6.865911299175769, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.848096723028, "val/val_tokens_per_second": 406155.409283466, "val/loss_avg_len_2048": 6.865911299175769, "val/perplexity_len_2048": 959.0193952924727, "val/loss_avg_len_1024": 6.87047398582399, "val/perplexity_len_1024": 963.4050979677779, "val/loss_avg_len_512": 6.882724252340198, "val/perplexity_len_512": 975.2796519063501}
5
+ {"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 937.4650811910396, "val/train_update_time": 536.3882422860479, "val/loss": 6.6218816578798005, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.0134965860052, "val/val_tokens_per_second": 409544.7254439007, "val/loss_avg_len_2048": 6.6218816578798005, "val/perplexity_len_2048": 751.3575656878406, "val/loss_avg_len_1024": 6.628069533909858, "val/perplexity_len_1024": 756.0212875438285, "val/loss_avg_len_512": 6.643743345025182, "val/perplexity_len_512": 767.9643747482285}
6
+ {"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 1144.6911778100184, "val/train_update_time": 642.9422557381331, "val/loss": 6.436701403411478, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.36018703901209, "val/val_tokens_per_second": 408129.96875023755, "val/loss_avg_len_2048": 6.436701403411478, "val/perplexity_len_2048": 624.3439406192032, "val/loss_avg_len_1024": 6.444007082933933, "val/perplexity_len_1024": 628.9218995499804, "val/loss_avg_len_512": 6.4616794860377915, "val/perplexity_len_512": 640.1352524873843}
7
+ {"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 1351.811079526029, "val/train_update_time": 749.4968144011218, "val/loss": 6.28673181735389, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.59608248295262, "val/val_tokens_per_second": 411261.15584928676, "val/loss_avg_len_2048": 6.28673181735389, "val/perplexity_len_2048": 537.3941537499013, "val/loss_avg_len_1024": 6.294847987662257, "val/perplexity_len_1024": 541.7734838793829, "val/loss_avg_len_512": 6.313898871052266, "val/perplexity_len_512": 552.1936892494012}
8
+ {"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 1558.167400816048, "val/train_update_time": 856.0381595880608, "val/loss": 6.1621321680786085, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.55671566096134, "val/val_tokens_per_second": 411423.7771712816, "val/loss_avg_len_2048": 6.1621321680786085, "val/perplexity_len_2048": 474.43857996407974, "val/loss_avg_len_1024": 6.1707096502751115, "val/perplexity_len_1024": 478.52557142759065, "val/loss_avg_len_512": 6.190553567818553, "val/perplexity_len_512": 488.11623688017033}
9
+ {"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 1764.4956908360473, "val/train_update_time": 962.5948072728934, "val/loss": 6.043532439414506, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.50245448201895, "val/val_tokens_per_second": 411648.13685477345, "val/loss_avg_len_2048": 6.043532439414506, "val/perplexity_len_2048": 421.37890441996467, "val/loss_avg_len_1024": 6.052711034156941, "val/perplexity_len_1024": 425.2643749180878, "val/loss_avg_len_512": 6.073728867790103, "val/perplexity_len_512": 434.2971024683484}
10
+ {"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 1970.7506705410196, "val/train_update_time": 1069.132912081608, "val/loss": 5.946526206344739, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.66884558799211, "val/val_tokens_per_second": 410960.9152023205, "val/loss_avg_len_2048": 5.946526206344739, "val/perplexity_len_2048": 382.4225718901975, "val/loss_avg_len_1024": 5.955733354584128, "val/perplexity_len_1024": 385.95985234406163, "val/loss_avg_len_512": 5.977101679090039, "val/perplexity_len_512": 394.2959143659733}
11
+ {"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 2177.6055200890405, "val/train_update_time": 1175.6836349036312, "val/loss": 5.859628180498444, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.70184189802967, "val/val_tokens_per_second": 410824.9077473609, "val/loss_avg_len_2048": 5.859628180498444, "val/perplexity_len_2048": 350.59376218425683, "val/loss_avg_len_1024": 5.869486711973604, "val/perplexity_len_1024": 354.06719516588015, "val/loss_avg_len_512": 5.89188057346195, "val/perplexity_len_512": 362.0855730249886}
12
+ {"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 2384.0519924180117, "val/train_update_time": 1282.2238651026273, "val/loss": 5.783013271738776, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.6605639460031, "val/val_tokens_per_second": 410995.0654322251, "val/loss_avg_len_2048": 5.783013271738776, "val/perplexity_len_2048": 324.7362361487591, "val/loss_avg_len_1024": 5.79357122720005, "val/perplexity_len_1024": 328.1829499750509, "val/loss_avg_len_512": 5.816968626810331, "val/perplexity_len_512": 335.95211227350995}
13
+ {"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 2590.4739345350536, "val/train_update_time": 1388.759745098534, "val/loss": 5.716440834625299, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.56407831504475, "val/val_tokens_per_second": 411393.3528354743, "val/loss_avg_len_2048": 5.716440834625299, "val/perplexity_len_2048": 303.8216448390781, "val/loss_avg_len_1024": 5.72740313000246, "val/perplexity_len_1024": 307.17054975726995, "val/loss_avg_len_512": 5.751391485624389, "val/perplexity_len_512": 314.62815638697555}
14
+ {"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 2796.791148387012, "val/train_update_time": 1495.3029013883206, "val/loss": 5.649823216465559, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.50496849097544, "val/val_tokens_per_second": 411637.73649870406, "val/loss_avg_len_2048": 5.649823216465559, "val/perplexity_len_2048": 284.24121221591713, "val/loss_avg_len_1024": 5.6613804133704875, "val/perplexity_len_1024": 287.5453000942534, "val/loss_avg_len_512": 5.686038017921988, "val/perplexity_len_512": 294.7236147202462}
15
+ {"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 3003.058338998002, "val/train_update_time": 1601.8528411513544, "val/loss": 5.595519945517543, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.44042880897177, "val/val_tokens_per_second": 411904.9011613321, "val/loss_avg_len_2048": 5.595519945517543, "val/perplexity_len_2048": 269.21759218828305, "val/loss_avg_len_1024": 5.607140162670961, "val/perplexity_len_1024": 272.36420583089864, "val/loss_avg_len_512": 5.632021551703382, "val/perplexity_len_512": 279.2260172957314}
16
+ {"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 3209.7035325120087, "val/train_update_time": 1708.4027871834696, "val/loss": 5.542625576345867, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.4260749219684, "val/val_tokens_per_second": 411964.3668136979, "val/loss_avg_len_2048": 5.542625576345867, "val/perplexity_len_2048": 255.34755458916246, "val/loss_avg_len_1024": 5.554492764933896, "val/perplexity_len_1024": 258.39586383088005, "val/loss_avg_len_512": 5.579607635878865, "val/perplexity_len_512": 264.9676215997508}
17
+ {"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 3415.8845459170407, "val/train_update_time": 1814.9587236176012, "val/loss": 5.495897741303175, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.34644473099615, "val/val_tokens_per_second": 404158.23276998143, "val/loss_avg_len_2048": 5.495897741303175, "val/perplexity_len_2048": 243.69019874687106, "val/loss_avg_len_1024": 5.50820064414118, "val/perplexity_len_1024": 246.7068140975722, "val/loss_avg_len_512": 5.5340119562351155, "val/perplexity_len_512": 253.1575333040166}
18
+ {"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 3623.980893074011, "val/train_update_time": 1921.4990626386134, "val/loss": 5.454193331815151, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.68043452501297, "val/val_tokens_per_second": 402830.6939416551, "val/loss_avg_len_2048": 5.454193331815151, "val/perplexity_len_2048": 233.73624740691568, "val/loss_avg_len_1024": 5.466780889385893, "val/perplexity_len_1024": 236.69701117624166, "val/loss_avg_len_512": 5.492807074442693, "val/perplexity_len_512": 242.9381962186342}
19
+ {"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 3832.4348147350247, "val/train_update_time": 2028.0436954226461, "val/loss": 5.414499440166541, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.82433698000386, "val/val_tokens_per_second": 406251.12177155656, "val/loss_avg_len_2048": 5.414499440166541, "val/perplexity_len_2048": 224.64007173409922, "val/loss_avg_len_1024": 5.427113480050698, "val/perplexity_len_1024": 227.4916376312745, "val/loss_avg_len_512": 5.453149790130276, "val/perplexity_len_512": 233.4924611121837}
20
+ {"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 4040.0474286440294, "val/train_update_time": 2134.611642122676, "val/loss": 5.380052010235644, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.62901382899145, "val/val_tokens_per_second": 411125.2177032077, "val/loss_avg_len_2048": 5.380052010235644, "val/perplexity_len_2048": 217.03356309816712, "val/loss_avg_len_1024": 5.393027071399452, "val/perplexity_len_1024": 219.8679351651408, "val/loss_avg_len_512": 5.4193882748374715, "val/perplexity_len_512": 225.74098881334308}
21
+ {"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 4246.9178847110015, "val/train_update_time": 2241.15703301772, "val/loss": 5.349708102982907, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.11544794699876, "val/val_tokens_per_second": 409127.6705037995, "val/loss_avg_len_2048": 5.349708102982907, "val/perplexity_len_2048": 210.5468309042001, "val/loss_avg_len_1024": 5.362598578461824, "val/perplexity_len_1024": 213.27844776051109, "val/loss_avg_len_512": 5.388854680254846, "val/perplexity_len_512": 218.95247130422538}
22
+ {"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 4453.811728182016, "val/train_update_time": 2347.708159423666, "val/loss": 5.3179078935331665, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.09129549999489, "val/val_tokens_per_second": 405178.3073647728, "val/loss_avg_len_2048": 5.3179078935331665, "val/perplexity_len_2048": 203.9567361248487, "val/loss_avg_len_1024": 5.331117789819115, "val/perplexity_len_1024": 206.6688574375496, "val/loss_avg_len_512": 5.357798411159706, "val/perplexity_len_512": 212.2571287414731}
23
+ {"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 4661.668510478048, "val/train_update_time": 2454.2510100168292, "val/loss": 5.291644628655117, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.32744767802069, "val/val_tokens_per_second": 404234.0050857196, "val/loss_avg_len_2048": 5.291644628655117, "val/perplexity_len_2048": 198.66989507746274, "val/loss_avg_len_1024": 5.304906864394906, "val/perplexity_len_1024": 201.32225127081918, "val/loss_avg_len_512": 5.331710340877599, "val/perplexity_len_512": 206.79135557739693}
24
+ {"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 4869.772359199007, "val/train_update_time": 2560.802637038869, "val/loss": 5.267185813411148, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.22516736399848, "val/val_tokens_per_second": 404642.45272829, "val/loss_avg_len_2048": 5.267185813411148, "val/perplexity_len_2048": 193.86960878237306, "val/loss_avg_len_1024": 5.280538284004224, "val/perplexity_len_1024": 196.47560656877008, "val/loss_avg_len_512": 5.307623009739281, "val/perplexity_len_512": 201.8698150611123}
25
+ {"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 5077.749329403043, "val/train_update_time": 2667.341171991662, "val/loss": 5.244305751117928, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.12790367001435, "val/val_tokens_per_second": 405031.6333428074, "val/loss_avg_len_2048": 5.244305751117928, "val/perplexity_len_2048": 189.48422034685896, "val/loss_avg_len_1024": 5.257994546739722, "val/perplexity_len_1024": 192.095865469511, "val/loss_avg_len_512": 5.285545470022318, "val/perplexity_len_512": 197.46186360448596}
26
+ {"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 5286.093386778026, "val/train_update_time": 2773.886181908485, "val/loss": 5.224113088942878, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.72626927896636, "val/val_tokens_per_second": 402649.1907186179, "val/loss_avg_len_2048": 5.224113088942878, "val/perplexity_len_2048": 185.69640127735994, "val/loss_avg_len_1024": 5.2377807637095275, "val/perplexity_len_1024": 188.25186312502768, "val/loss_avg_len_512": 5.265193086130079, "val/perplexity_len_512": 193.48366419285284}
27
+ {"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 5494.618768353015, "val/train_update_time": 2880.4332658784697, "val/loss": 5.204351608313579, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.6204348889878, "val/val_tokens_per_second": 403068.53680310975, "val/loss_avg_len_2048": 5.204351608313579, "val/perplexity_len_2048": 182.0627864926695, "val/loss_avg_len_1024": 5.218173131045303, "val/perplexity_len_1024": 184.59664196761443, "val/loss_avg_len_512": 5.245799912956764, "val/perplexity_len_512": 189.76755205696549}
28
+ {"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 5703.016606429999, "val/train_update_time": 2986.9915024373913, "val/loss": 5.187859910127178, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.53107214701595, "val/val_tokens_per_second": 407436.2197202113, "val/loss_avg_len_2048": 5.187859910127178, "val/perplexity_len_2048": 179.08488478780052, "val/loss_avg_len_1024": 5.201843161700579, "val/perplexity_len_1024": 181.6066640311323, "val/loss_avg_len_512": 5.229689651350794, "val/perplexity_len_512": 186.7348416205781}
29
+ {"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 5910.318671693036, "val/train_update_time": 3093.5336661074543, "val/loss": 5.1715570674947635, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.04047837899998, "val/val_tokens_per_second": 409434.2676453867, "val/loss_avg_len_2048": 5.1715570674947635, "val/perplexity_len_2048": 176.18896211925417, "val/loss_avg_len_1024": 5.185615149655531, "val/perplexity_len_1024": 178.68333297978904, "val/loss_avg_len_512": 5.213619250194659, "val/perplexity_len_512": 183.75792201945603}
30
+ {"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 6117.130997166038, "val/train_update_time": 3200.0810677845147, "val/loss": 5.157462475304946, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.89286627602996, "val/val_tokens_per_second": 410039.2903615046, "val/loss_avg_len_2048": 5.157462475304946, "val/perplexity_len_2048": 173.72306924924297, "val/loss_avg_len_1024": 5.171683135021537, "val/perplexity_len_1024": 176.2111752260996, "val/loss_avg_len_512": 5.199810537778202, "val/perplexity_len_512": 181.23790088671788}
31
+ {"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 6324.271360302053, "val/train_update_time": 3306.647373120475, "val/loss": 5.143889664166529, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.14473170100246, "val/val_tokens_per_second": 409008.0357126763, "val/loss_avg_len_2048": 5.143889664166529, "val/perplexity_len_2048": 171.38108842507802, "val/loss_avg_len_1024": 5.15810240709968, "val/perplexity_len_1024": 173.83427574322482, "val/loss_avg_len_512": 5.18635319549219, "val/perplexity_len_512": 178.81525814719697}
32
+ {"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 6531.185974933032, "val/train_update_time": 3413.19108713849, "val/loss": 5.131008138038125, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.94807949004462, "val/val_tokens_per_second": 409812.7768836203, "val/loss_avg_len_2048": 5.131008138038125, "val/perplexity_len_2048": 169.1875965491763, "val/loss_avg_len_1024": 5.145431026424136, "val/perplexity_len_1024": 171.6454524544731, "val/loss_avg_len_512": 5.173947728090605, "val/perplexity_len_512": 176.61067401256858}
33
+ {"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 6737.915407613036, "val/train_update_time": 3519.7321525084553, "val/loss": 5.120392005993269, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.73520365200238, "val/val_tokens_per_second": 410687.4854631898, "val/loss_avg_len_2048": 5.120392005993269, "val/perplexity_len_2048": 167.40097894766583, "val/loss_avg_len_1024": 5.13483073173333, "val/perplexity_len_1024": 169.83556966900616, "val/loss_avg_len_512": 5.163387195022637, "val/perplexity_len_512": 174.75538481351157}
34
+ {"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 6944.4543808570015, "val/train_update_time": 3626.2884129853337, "val/loss": 5.110702060040148, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.9236697970191, "val/val_tokens_per_second": 409912.8873389507, "val/loss_avg_len_2048": 5.110702060040148, "val/perplexity_len_2048": 165.78670624776979, "val/loss_avg_len_1024": 5.125157545317412, "val/perplexity_len_1024": 168.20063878756756, "val/loss_avg_len_512": 5.153955277375667, "val/perplexity_len_512": 173.11485524738933}
35
+ {"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 7151.144483595039, "val/train_update_time": 3732.833151328552, "val/loss": 5.1014151430890955, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.96349205297884, "val/val_tokens_per_second": 409749.59116366145, "val/loss_avg_len_2048": 5.1014151430890955, "val/perplexity_len_2048": 164.25418608357114, "val/loss_avg_len_1024": 5.116010741177417, "val/perplexity_len_1024": 166.6691552558016, "val/loss_avg_len_512": 5.1449303319406585, "val/perplexity_len_512": 171.55953203505013}
36
+ {"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 7358.337214609026, "val/train_update_time": 3839.3859579174896, "val/loss": 5.093367775315, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.89132478396641, "val/val_tokens_per_second": 410045.61796115554, "val/loss_avg_len_2048": 5.093367775315, "val/perplexity_len_2048": 162.93767656263796, "val/loss_avg_len_1024": 5.108063309450646, "val/perplexity_len_1024": 165.34981315838286, "val/loss_avg_len_512": 5.137162590507989, "val/perplexity_len_512": 170.23206433751022}
37
+ {"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 7565.03086849401, "val/train_update_time": 3945.9412919793394, "val/loss": 5.086384535997531, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.77674050303176, "val/val_tokens_per_second": 410516.5171110737, "val/loss_avg_len_2048": 5.086384535997531, "val/perplexity_len_2048": 161.80380742097068, "val/loss_avg_len_1024": 5.101098106556922, "val/perplexity_len_1024": 164.20211975991967, "val/loss_avg_len_512": 5.1302048265572875, "val/perplexity_len_512": 169.05174078500627}
38
+ {"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 7771.636799781001, "val/train_update_time": 4052.503600837372, "val/loss": 5.080432867527884, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.85730355099076, "val/val_tokens_per_second": 410185.31988583424, "val/loss_avg_len_2048": 5.080432867527884, "val/perplexity_len_2048": 160.84366486138146, "val/loss_avg_len_1024": 5.09520770714046, "val/perplexity_len_1024": 163.2377467490014, "val/loss_avg_len_512": 5.124402106901945, "val/perplexity_len_512": 168.07362154641356}
39
+ {"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 7978.290488699044, "val/train_update_time": 4159.058140857378, "val/loss": 5.0751120198699065, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.50360889302101, "val/val_tokens_per_second": 403532.4501926773, "val/loss_avg_len_2048": 5.0751120198699065, "val/perplexity_len_2048": 159.9901130472598, "val/loss_avg_len_1024": 5.08987260026416, "val/perplexity_len_1024": 162.36917494575445, "val/loss_avg_len_512": 5.11916280782083, "val/perplexity_len_512": 167.19533638390328}
40
+ {"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 8186.5969784220215, "val/train_update_time": 4265.60545003548, "val/loss": 5.070432775841683, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.58954598399578, "val/val_tokens_per_second": 403191.0921863236, "val/loss_avg_len_2048": 5.070432775841683, "val/perplexity_len_2048": 159.24322905521015, "val/loss_avg_len_1024": 5.085260728531238, "val/perplexity_len_1024": 161.6220732303632, "val/loss_avg_len_512": 5.1146308313941, "val/perplexity_len_512": 166.43932546654347}
41
+ {"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 8395.40680388402, "val/train_update_time": 4372.159924876352, "val/loss": 5.0664675774028645, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.43134209199343, "val/val_tokens_per_second": 403819.95500810014, "val/loss_avg_len_2048": 5.0664675774028645, "val/perplexity_len_2048": 158.6130482735774, "val/loss_avg_len_1024": 5.0813700341027355, "val/perplexity_len_1024": 160.99447282291885, "val/loss_avg_len_512": 5.110828997220903, "val/perplexity_len_512": 165.8077520805886}
42
+ {"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 8603.637448858004, "val/train_update_time": 4478.715910169412, "val/loss": 5.063192765613369, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.8011323119863, "val/val_tokens_per_second": 410416.1851786989, "val/loss_avg_len_2048": 5.063192765613369, "val/perplexity_len_2048": 158.09446997973149, "val/loss_avg_len_1024": 5.078104476989369, "val/perplexity_len_1024": 160.46959365486623, "val/loss_avg_len_512": 5.107591516341537, "val/perplexity_len_512": 165.27182065555164}
43
+ {"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 8810.230298971, "val/train_update_time": 4585.266904477379, "val/loss": 5.060545605122825, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.59589945495827, "val/val_tokens_per_second": 411261.91162643145, "val/loss_avg_len_2048": 5.060545605122825, "val/perplexity_len_2048": 157.67652197681062, "val/loss_avg_len_1024": 5.075487850701, "val/perplexity_len_1024": 160.0502535650356, "val/loss_avg_len_512": 5.104997573237831, "val/perplexity_len_512": 164.84367049477257}
44
+ {"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 9016.607044072007, "val/train_update_time": 4691.8174923404, "val/loss": 5.058485259354685, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.55429247097345, "val/val_tokens_per_second": 411433.79138516303, "val/loss_avg_len_2048": 5.058485259354685, "val/perplexity_len_2048": 157.35198826265747, "val/loss_avg_len_1024": 5.073454285918526, "val/perplexity_len_1024": 159.72511171651485, "val/loss_avg_len_512": 5.103029847818229, "val/perplexity_len_512": 164.51962233753557}
45
+ {"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 9222.955616571999, "val/train_update_time": 4798.3678903255495, "val/loss": 5.056988405886034, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.03087017301004, "val/val_tokens_per_second": 409473.594792857, "val/loss_avg_len_2048": 5.056988405886034, "val/perplexity_len_2048": 157.1166315844018, "val/loss_avg_len_1024": 5.071977700420563, "val/perplexity_len_1024": 159.48943797193047, "val/loss_avg_len_512": 5.1015830969281035, "val/perplexity_len_512": 164.2817755215082}
46
+ {"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 9430.23481100105, "val/train_update_time": 4904.91280556639, "val/loss": 5.055763588758994, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.07011089101434, "val/val_tokens_per_second": 409313.02698973974, "val/loss_avg_len_2048": 5.055763588758994, "val/perplexity_len_2048": 156.92431024637185, "val/loss_avg_len_1024": 5.0707623975899185, "val/perplexity_len_1024": 159.29572773864462, "val/loss_avg_len_512": 5.100373116077785, "val/perplexity_len_512": 164.08311792924914}
47
+ {"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 9637.072636537021, "val/train_update_time": 5011.441395019239, "val/loss": 5.055077503644489, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.58326662296895, "val/val_tokens_per_second": 407224.79369790596, "val/loss_avg_len_2048": 5.055077503644489, "val/perplexity_len_2048": 156.81668373770228, "val/loss_avg_len_1024": 5.070088747809251, "val/perplexity_len_1024": 159.1884543429987, "val/loss_avg_len_512": 5.099732340347279, "val/perplexity_len_512": 163.9780111280343}
48
+ {"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 9844.444105764036, "val/train_update_time": 5117.988080174255, "val/loss": 5.054681106794774, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.61741263099248, "val/val_tokens_per_second": 411173.09633132076, "val/loss_avg_len_2048": 5.054681106794774, "val/perplexity_len_2048": 156.75453441699727, "val/loss_avg_len_1024": 5.069690365717549, "val/perplexity_len_1024": 159.12504914416917, "val/loss_avg_len_512": 5.0993228254112655, "val/perplexity_len_512": 163.9108734311827}
49
+ {"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 10050.842236216005, "val/train_update_time": 5224.530718925176, "val/loss": 5.054502099541319, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.76580651698168, "val/val_tokens_per_second": 410561.5082962114, "val/loss_avg_len_2048": 5.054502099541319, "val/perplexity_len_2048": 156.72647672966426, "val/loss_avg_len_1024": 5.069511245464872, "val/perplexity_len_1024": 159.09654917769598, "val/loss_avg_len_512": 5.099155388283602, "val/perplexity_len_512": 163.88343096285075}
metrics/npz/train_eval/step-000000104857600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8b243beae0017a1de2241d891675cf674d1db79105c4fe483dabc70ebadf9b5c
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ca2b6fd9a44859a14d30a8b8b835115a070dce0ccab094a1711a916bcdee76b
3
  size 20540
metrics/npz/train_eval/step-000000209715200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eda57ce5fa7e74cdb9e7f0ecef0f337976b9c167dfd02431babe46d869215dfb
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efc94aaa1ae3287e6e0ecc7f4ae4f5d4c9f8a5fbf7ba1be95807a3ec73607dd6
3
  size 20540
metrics/npz/train_eval/step-000000314572800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:72a68d2aa605168cb796faa852fdaca040301e592ef9fac523d5a38bd6f9d9d5
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf8679ea18aa02fece3c2aee47599c098b3e7ec97a05ca9b1ca46dac8b712223
3
  size 20540
metrics/npz/train_eval/step-000000419430400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:08cefc36c5fb36c5fbb9e2d77118aa51da7c8c50ab277fe269e217ce5f068878
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d895609bd6b3c4f3a17d8069f4131872034adb440ed9810322ce88838c1c3990
3
  size 20540
metrics/npz/train_eval/step-000000524288000.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1d41ed5d12b8ce626320f509bbec6fb2b308fb4d50ffbff24e27cb58c85f5d34
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f8c1ca16cfdc3493df8af18fdfaaae4386ecf358be656b38d39920e6bd972e0
3
  size 20540
metrics/npz/train_eval/step-000000629145600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6cf5797d2799a6769f1e8af130bf4c75f3ea117a3c555001a4475205ae595401
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92cdfac9bbbe35185425c7e209695a6e67e906a583c716b17cd311222fd408d2
3
  size 20540
metrics/npz/train_eval/step-000000734003200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d6a82b3a7e23b06c60fc4d81b7e7b4d7ec0854601e9e233610c882cfa323d417
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb65bb2177a5d1c53c6726ab856a944b566fd208c9b92e1b80dd0e03564f2eab
3
  size 20540
metrics/npz/train_eval/step-000000838860800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:05967c04c01a405ee98a3c54095fc8221fdab1569c6d496e8ed034be6f065953
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9aa86ff00d18908412d7f09cd7617e7a41dbd33dddb13d3e9c91630b332d1496
3
  size 20540
metrics/npz/train_eval/step-000000943718400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dab29252c1feec76660212adf00e9dbb9ff7c15cfa8e43c21543d7c75dfa54f9
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c087c1312c95a2beb753576a44d0c77fcf698d9d9bb1643ae3bb43ca6d99c908
3
  size 20540
metrics/npz/train_eval/step-000001048576000.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a16448c38926d679a325916a3128a4bd64686744d129c1edd142f31bffc7809e
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94cee7bf23eb0b037070c3e131b17f61ff9256e5070bae1a58af07dccc4c1eb7
3
  size 20540
metrics/npz/train_eval/step-000001153433600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c85c15e520a1e15e450c9acfa31c3d48d6e1b3f1b7e0b618c4f1fa8b5e254f40
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0592dcf61d27ffb9891c62209eddcec508944613cde77346812c2cee129814e
3
  size 20540
metrics/npz/train_eval/step-000001258291200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a31d108a0abb8742d7cc9ffcb4ba4c2129ed63187aa68cf047784d028b81d6d2
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d7fd6cff024296d3d4e37a5568be8739860f647d958ef0cbee26959e9c6744c
3
  size 20540
metrics/npz/train_eval/step-000001363148800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1088e5d743ead044cc02a1242fce15999c2e09d326f1efebd609d36fffb18166
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7718a521eb87cfcd6dbb500a3c0b4ae5e36526ee28ab0ad519ce36d74065d1dc
3
  size 20540
metrics/npz/train_eval/step-000001468006400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:58fde86d08b3ca436924d6cbfa8f61546f434e37d15643c54967356b815ddbc2
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a377760799d38ceef201c02ea112f48d12c48bfd5c2ef9194a67281b3c4b598
3
  size 20540
metrics/npz/train_eval/step-000001572864000.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a6da8af2424cd1c71e2140e79f20004db093563341f12adcfc564cb465b8622e
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80a518d1389fea79c6b204dd78b21fe4643e3e61b65d2104807b653b5abca991
3
  size 20540
metrics/npz/train_eval/step-000001677721600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f17bb2808d3bcb8b6ec114641a5b1c6eba99a67f4f708ed380434da29b3d10b9
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42b19cc212c4ae90e7d5a9f69978c1bda5a50ad2c4f18d814663d8ea45b0f899
3
  size 20540
metrics/npz/train_eval/step-000001782579200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:202b2385782126d619e374d0af8aaeb4277d105811f9e49b333c27a2d84db835
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edf561f29428519e6f1e81f52f81e0cad663038ec83149bc48754f8b9defa9ea
3
  size 20540
metrics/npz/train_eval/step-000001887436800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6e276af19f20402d4fe0215c17746aacbab8e7206c92337e8ed7f021b57dfbb
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3939f70caf466c0a57c7e8ebbee8b34400feae0428d57911a40385f41221b52f
3
  size 20540
metrics/npz/train_eval/step-000001992294400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0da910aa3f615144aad0eeb01fa73b24c281aa4e232854be2576d0e403b5a2d8
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bd0a5bf73c23210012dde63862c968e6f053177e638cb68e029b32a1162cacc
3
  size 20540
metrics/npz/val/step-000000041943040.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:08a2e31e48c5238c88ef39c6b9636efd6125fda2376e0831ab537f65a10a4202
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f9ba5fd2709899a0902dc4d605b1b4b400b141e5c98b4d02e5a00bc815f29ec
3
  size 21142
metrics/npz/val/step-000000083886080.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c233fa19005aa299a07d624d2846ff23a12b36ec67c5de95d74f2e5316a9ef71
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e60868b7de6855b5a73d630b2f82011a16b50d3c9b1e6e4a78a7170a1e7405e
3
  size 21142
metrics/npz/val/step-000000125829120.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:246ae308bf0f7a198dbc1c5adf005707cf01d4768398e0fe6fb0ea5f6f1c6ca2
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2b10b365ab893feeadacb3cefb6946b4e7714b116522d65e756a83c378920f3
3
  size 21142
metrics/npz/val/step-000000167772160.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:30f2894ae58d5b0ce83ad73359c6c6d5b008ec6c51f980f1d00a02e4bc94ce47
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cdfdd1716cb177612d3a62862afba6f66eca558bd607f1a67affa956a315f00
3
  size 21142
metrics/npz/val/step-000000209715200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5ccb9ff8cbdb27229cebf69fb4ffdf211a7c42d0f783d93fdfe00360bc3136bd
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21905eca67d4de40c9080cb2c178d46bc017660a757e9f25c657a69d443ba3ec
3
  size 21142
metrics/npz/val/step-000000251658240.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:56317fa4c1c78b24b52c2eb45469aea42c16c0563abb898dedd74817a800e6a2
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:352b137c1b691c16a923255fa0b63b761bd51ac169370d8da45706bbb7a04c4a
3
  size 21142
metrics/npz/val/step-000000293601280.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d4b3bbed2a5736bbffb524e8f9fd5d4f87c8e766836a1e853c658ca673bf345d
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf3df61f67c3b510a24a76da42144e41631b703d6f3cf777d21b58e3d925f29b
3
  size 21142
metrics/npz/val/step-000000335544320.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:06ede7b5b0cff99af24483bc0b081bd9505ca27dc0f51f71be99e0057b895882
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ae4ee9a3e3af0b555a0c72a2183a19a98ad35f2da4d7e1e5efc7c24cc10e50e
3
  size 21142
metrics/npz/val/step-000000377487360.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:db98329768e137719a0009cf0aa6664cf79b6fcda5d0dc413f2525ae3b449ffe
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17d945dd8424f0cd27d4b6d0d068c74c6c5b58dd33fc856fc2bdd92e38e5c7cf
3
  size 21142
metrics/npz/val/step-000000419430400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9224b39de02f6414626fa2bfb6356d84188629c321a9dce602579b8cc8023c05
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da308e3e186476c46574c1dd01a3562d86f10052ab2ca4e917939c116c891953
3
  size 21142
metrics/npz/val/step-000000461373440.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3a2c843622abec59d425d5ac0aef10b366f24baad31dd808ba6357d3bb8968aa
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c7512604a0d29b4dd95fdd2c4df34b03a293d3f1bdb0ce93fcff6a68887799a
3
  size 21142
metrics/npz/val/step-000000503316480.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2ab8e82cb4c8744493b1e554d2d9cb821406417f8cb4022daa85d61b6ca11e6
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2b21b16830d02347033e6ef4c3d740739cca8b48c7f787db45dd0e882dc6431
3
  size 21142