Lanni-ni committed on
Commit
b9a1c58
·
verified ·
1 Parent(s): 7ccd2d6

add remote code + model files

Browse files
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the complete set of changes.
Files changed (50) hide show
  1. .hydra/config.yaml +1 -1
  2. checkpoints/step-000000209715200.pt +2 -2
  3. checkpoints/step-000000419430400.pt +2 -2
  4. checkpoints/step-000000629145600.pt +2 -2
  5. checkpoints/step-000000838860800.pt +2 -2
  6. checkpoints/step-000001048576000.pt +2 -2
  7. checkpoints/step-000001258291200.pt +2 -2
  8. checkpoints/step-000001468006400.pt +2 -2
  9. checkpoints/step-000001677721600.pt +2 -2
  10. checkpoints/step-000001887436800.pt +2 -2
  11. config.yaml +1 -1
  12. decay_params.txt +19 -19
  13. logs/2025-10-27_03-37-04.log +258 -0
  14. metrics/jsonlines/checkpoint.jsonl +9 -9
  15. metrics/jsonlines/norm.jsonl +0 -0
  16. metrics/jsonlines/throughput.jsonl +0 -0
  17. metrics/jsonlines/train.jsonl +98 -98
  18. metrics/jsonlines/train_eval.jsonl +19 -19
  19. metrics/jsonlines/val.jsonl +49 -49
  20. metrics/npz/train_eval/step-000000104857600.npz +1 -1
  21. metrics/npz/train_eval/step-000000209715200.npz +1 -1
  22. metrics/npz/train_eval/step-000000314572800.npz +1 -1
  23. metrics/npz/train_eval/step-000000419430400.npz +1 -1
  24. metrics/npz/train_eval/step-000000524288000.npz +1 -1
  25. metrics/npz/train_eval/step-000000629145600.npz +1 -1
  26. metrics/npz/train_eval/step-000000734003200.npz +1 -1
  27. metrics/npz/train_eval/step-000000838860800.npz +1 -1
  28. metrics/npz/train_eval/step-000000943718400.npz +1 -1
  29. metrics/npz/train_eval/step-000001048576000.npz +1 -1
  30. metrics/npz/train_eval/step-000001153433600.npz +1 -1
  31. metrics/npz/train_eval/step-000001258291200.npz +1 -1
  32. metrics/npz/train_eval/step-000001363148800.npz +1 -1
  33. metrics/npz/train_eval/step-000001468006400.npz +1 -1
  34. metrics/npz/train_eval/step-000001572864000.npz +1 -1
  35. metrics/npz/train_eval/step-000001677721600.npz +1 -1
  36. metrics/npz/train_eval/step-000001782579200.npz +1 -1
  37. metrics/npz/train_eval/step-000001887436800.npz +1 -1
  38. metrics/npz/train_eval/step-000001992294400.npz +1 -1
  39. metrics/npz/val/step-000000041943040.npz +1 -1
  40. metrics/npz/val/step-000000083886080.npz +1 -1
  41. metrics/npz/val/step-000000125829120.npz +1 -1
  42. metrics/npz/val/step-000000167772160.npz +1 -1
  43. metrics/npz/val/step-000000209715200.npz +1 -1
  44. metrics/npz/val/step-000000251658240.npz +1 -1
  45. metrics/npz/val/step-000000293601280.npz +1 -1
  46. metrics/npz/val/step-000000335544320.npz +1 -1
  47. metrics/npz/val/step-000000377487360.npz +1 -1
  48. metrics/npz/val/step-000000419430400.npz +1 -1
  49. metrics/npz/val/step-000000461373440.npz +1 -1
  50. metrics/npz/val/step-000000503316480.npz +1 -1
.hydra/config.yaml CHANGED
@@ -81,7 +81,7 @@ train:
81
  max_tokens: 2097152000
82
  grad_acc_tokens: 32768
83
  max_grad_norm: 1.0
84
- gradient_checkpointing: true
85
  bias_weight_decay: false
86
  normalization_weight_decay: false
87
  conv_weight_decay: true
 
81
  max_tokens: 2097152000
82
  grad_acc_tokens: 32768
83
  max_grad_norm: 1.0
84
+ gradient_checkpointing: false
85
  bias_weight_decay: false
86
  normalization_weight_decay: false
87
  conv_weight_decay: true
checkpoints/step-000000209715200.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:29fd8196399c4483c8d0dd36172dcd5357b344cb7d987290d3ae3738370af50c
3
- size 339651594
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c34944a1550c6f5600f84cc7882dca2919d13ce4c4e9fae926e63dfb344f3c70
3
+ size 339650826
checkpoints/step-000000419430400.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed03eeedd6740fe5e0b577e6d63b6c367bf73c80639b0bcfbb7333757d85769d
3
- size 339651594
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd341d9a4c51929b84dc8a73a22b61206c2d03a5214fa4ee13193ca5925b0e25
3
+ size 339650826
checkpoints/step-000000629145600.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1f5f874678886d8290c617479e7a473d4efdeac4fcaa26d0aed4acbae667faa9
3
- size 339651594
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8722648f1c59174f8099ce91704394fc8cd1823334d3d796f5c9819609f797ac
3
+ size 339650826
checkpoints/step-000000838860800.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d6b0eef4f33e5c387c29464f25fdd5e054da835fe1a0b100ce87d1bfaabc4af5
3
- size 339651594
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ecf7abc065199334f11b6c1bf1f08612efe39dad0be4bcd78568c180f3a1dd0c
3
+ size 339650826
checkpoints/step-000001048576000.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:97a0fd7c28f539bbc9bfea7c21eee2b0f6929083c5412d9402e58127d1922c7a
3
- size 339651594
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df6fdc4b678b92f48e22e1d1b9f5927eed7fa23928a20d7443734f87874c425f
3
+ size 339650826
checkpoints/step-000001258291200.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f06d1cbd49c822165da339c488d9cb517b80a6ecddd50208fb6132d298c3e2d9
3
- size 339651594
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:194ca13ea0e7718dc7526b0b6cadc94d274756747a7dc69c64e09c7cbbda7591
3
+ size 339650826
checkpoints/step-000001468006400.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f7ce6551ef5f5c2f4bea883a8775f2c1822f5a01871fcc647cb49cfb5aa30e5c
3
- size 339651594
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d21e896616380f8e2813f69492bd9555e73c0b071bc701ed7cfe191c3ac8a2ea
3
+ size 339650826
checkpoints/step-000001677721600.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:642ad5472d53df8af94ec16edf351aa79b6eeea3917af3deff0e28b441d7d26a
3
- size 339651594
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ece9f09ed797f107b247e982651935265c541ad1a99fa2fa9688b6f0b311020f
3
+ size 339650826
checkpoints/step-000001887436800.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0c4d01b6fd911272c98ca7d089362fada6cbd2f46789aa50d18feaf23be7ff94
3
- size 339651594
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:901dc8652d0257fdd73aa03f429d6c0e89b8f942bbd65181146e6d3acd3d574d
3
+ size 339650826
config.yaml CHANGED
@@ -81,7 +81,7 @@ train:
81
  max_tokens: 2097152000
82
  grad_acc_tokens: 32768
83
  max_grad_norm: 1.0
84
- gradient_checkpointing: true
85
  bias_weight_decay: false
86
  normalization_weight_decay: false
87
  conv_weight_decay: true
 
81
  max_tokens: 2097152000
82
  grad_acc_tokens: 32768
83
  max_grad_norm: 1.0
84
+ gradient_checkpointing: false
85
  bias_weight_decay: false
86
  normalization_weight_decay: false
87
  conv_weight_decay: true
decay_params.txt CHANGED
@@ -1,20 +1,20 @@
1
- _forward_module._fsdp_wrapped_module.model.embeddings.weight
2
- _forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight
3
- _forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight
4
- _forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight
5
- _forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight
6
- _forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight
7
- _forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight
8
- _forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight
9
- _forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight
10
- _forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight
11
- _forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight
12
- _forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight
13
- _forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight
14
- _forward_module._fsdp_wrapped_module.model.layers.2.attn.q_proj.weight
15
- _forward_module._fsdp_wrapped_module.model.layers.2.attn.k_proj.weight
16
- _forward_module._fsdp_wrapped_module.model.layers.2.attn.v_proj.weight
17
- _forward_module._fsdp_wrapped_module.model.layers.2.attn.o_proj.weight
18
- _forward_module._fsdp_wrapped_module.model.layers.2.mlp.gate_proj.weight
19
- _forward_module._fsdp_wrapped_module.model.layers.2.mlp.down_proj.weight
20
  _forward_module._fsdp_wrapped_module.lm_head.weight
 
1
+ _forward_module._fsdp_wrapped_module.emb.weight
2
+ _forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight
3
+ _forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight
4
+ _forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight
5
+ _forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight
6
+ _forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight
7
+ _forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight
8
+ _forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight
9
+ _forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight
10
+ _forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight
11
+ _forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight
12
+ _forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight
13
+ _forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight
14
+ _forward_module._fsdp_wrapped_module.layers.2.attn.q_proj.weight
15
+ _forward_module._fsdp_wrapped_module.layers.2.attn.k_proj.weight
16
+ _forward_module._fsdp_wrapped_module.layers.2.attn.v_proj.weight
17
+ _forward_module._fsdp_wrapped_module.layers.2.attn.o_proj.weight
18
+ _forward_module._fsdp_wrapped_module.layers.2.mlp.gate_proj.weight
19
+ _forward_module._fsdp_wrapped_module.layers.2.mlp.down_proj.weight
20
  _forward_module._fsdp_wrapped_module.lm_head.weight
logs/2025-10-27_03-37-04.log ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2025-10-27 03:37:04][train:372][INFO] All outputs will be saved to `/workspace/forgetting-transformer/alibi_3_4_256`
2
+ [2025-10-27 03:37:04][train:375][INFO] Configuration:
3
+ [2025-10-27 03:37:04][train:380][INFO] Configuration saved to /workspace/forgetting-transformer/alibi_3_4_256/config.yaml.
4
+ [2025-10-27 03:37:04][train:387][INFO] creating datamodule
5
+ [2025-10-27 03:37:04][train:419][INFO] creating model
6
+ [2025-10-27 03:37:05][train:440][INFO] creating optimizer
7
+ [2025-10-27 03:37:05][checkpoint:39][INFO] Not resuming. Deleting existing checkpoints...
8
+ [2025-10-27 03:37:05][logger:256][INFO] Setting up wandb logger...
9
+ [2025-10-27 03:37:05][logger:272][INFO] Not resuming. Creating a new wandb run.
10
+ [2025-10-27 03:37:05][logger:288][INFO] wandb initialized. Run id: m4i3hjlc
11
+ [2025-10-27 03:37:05][logger:186][INFO] Setting up jsonlines logger...
12
+ [2025-10-27 03:37:05][logger:113][INFO] Setting up npz logger...
13
+ [2025-10-27 03:37:05][logger:171][INFO] [step: 0] [train_data_info/vocab_size: 50277] [train_data_info/global_tokens_per_batch: 2097152] [train_data_info/local_tokens_per_batch: 2097152] [train_data_info/batch_len: 2048] [train_data_info/seq_len: 2048] [train_data_info/total_tokens: 2055208960] [train_data_info/global_batch_size: 1024] [train_data_info/local_batch_size: 1024]
14
+ [2025-10-27 03:37:05][logger:171][INFO] [step: 0] [val_data_info/vocab_size: 50277] [val_data_info/global_tokens_per_batch: 2048] [val_data_info/local_tokens_per_batch: 2048] [val_data_info/batch_len: 2048] [val_data_info/seq_len: 2048] [val_data_info/total_tokens: 2147483648] [val_data_info/global_batch_size: 1] [val_data_info/local_batch_size: 1]
15
+ [2025-10-27 03:37:05][logger:171][INFO] [step: 0] [model_info/total_params: 28299520] [model_info/trainable_params: 28299520] [model_info/embedding_params: 12870912] [model_info/flops_per_token: 0] [model_info/non_embedding_params: 15428608]
16
+ [2025-10-27 03:38:05][utils:57][INFO] [P: 1.00%] [S: 20971520/2097152000] [T: 0:00:59] [ETA: 1:38:34] [loss: 9.926] [tokens/s: 374085.927] [batches/s: 0.178] [MFU: 0.000] [TFLOPS: 0.000]
17
+ [2025-10-27 03:39:01][utils:57][INFO] [P: 2.00%] [S: 41943040/2097152000] [T: 0:01:55] [ETA: 1:34:32] [loss: 8.114] [tokens/s: 374185.677] [batches/s: 0.178] [MFU: 0.000] [TFLOPS: 0.000]
18
+ [2025-10-27 03:39:01][train:194][INFO] Running validation...
19
+ [2025-10-27 03:41:02][logger:171][INFO] [step: 41943040] [val/train_token_count: 41943040] [val/train_batch_count: 20] [val/train_flop_count: 0] [val/train_total_time: 115.773] [val/train_update_time: 115.427] [val/loss: 8.016] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.225] [val/val_tokens_per_second: 340694.832] [val/loss_avg_len_2048: 8.016] [val/perplexity_len_2048: 3028.049] [val/loss_avg_len_1024: 8.014] [val/perplexity_len_1024: 3024.275] [val/loss_avg_len_512: 8.015] [val/perplexity_len_512: 3026.822]
20
+ [2025-10-27 03:41:58][utils:57][INFO] [P: 3.00%] [S: 62914560/2097152000] [T: 0:04:52] [ETA: 2:37:22] [loss: 7.684] [tokens/s: 215098.637] [batches/s: 0.103] [MFU: 0.000] [TFLOPS: 0.000]
21
+ [2025-10-27 03:42:54][utils:57][INFO] [P: 4.00%] [S: 83886080/2097152000] [T: 0:05:48] [ETA: 2:19:12] [loss: 7.448] [tokens/s: 241454.319] [batches/s: 0.115] [MFU: 0.000] [TFLOPS: 0.000]
22
+ [2025-10-27 03:42:54][train:194][INFO] Running validation...
23
+ [2025-10-27 03:44:54][logger:171][INFO] [step: 83886080] [val/train_token_count: 83886080] [val/train_batch_count: 40] [val/train_flop_count: 0] [val/train_total_time: 348.020] [val/train_update_time: 227.194] [val/loss: 7.433] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.907] [val/val_tokens_per_second: 341597.369] [val/loss_avg_len_2048: 7.433] [val/perplexity_len_2048: 1690.691] [val/loss_avg_len_1024: 7.435] [val/perplexity_len_1024: 1693.835] [val/loss_avg_len_512: 7.441] [val/perplexity_len_512: 1705.115]
24
+ [2025-10-27 03:45:50][utils:57][INFO] [P: 5.00%] [S: 104857600/2097152000] [T: 0:08:43] [ETA: 2:45:54] [loss: 7.295] [tokens/s: 199668.298] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
25
+ [2025-10-27 03:45:50][logger:171][INFO] [step: 104857600] [train_eval/train_token_count: 104857600] [train_eval/train_batch_count: 50] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 523.942] [train_eval/train_update_time: 283.084] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 8.444] [train_eval/perplexity_len_2048: 4646.953] [train_eval/loss_avg_len_1024: 8.446] [train_eval/perplexity_len_1024: 4655.095] [train_eval/loss_avg_len_512: 8.448] [train_eval/perplexity_len_512: 4665.833]
26
+ [2025-10-27 03:46:46][utils:57][INFO] [P: 6.00%] [S: 125829120/2097152000] [T: 0:09:39] [ETA: 2:31:25] [loss: 7.112] [tokens/s: 216820.811] [batches/s: 0.103] [MFU: 0.000] [TFLOPS: 0.000]
27
+ [2025-10-27 03:46:46][train:194][INFO] Running validation...
28
+ [2025-10-27 03:48:46][logger:171][INFO] [step: 125829120] [val/train_token_count: 125829120] [val/train_batch_count: 60] [val/train_flop_count: 0] [val/train_total_time: 579.950] [val/train_update_time: 338.966] [val/loss: 7.108] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.571] [val/val_tokens_per_second: 339716.833] [val/loss_avg_len_2048: 7.108] [val/perplexity_len_2048: 1221.876] [val/loss_avg_len_1024: 7.111] [val/perplexity_len_1024: 1225.866] [val/loss_avg_len_512: 7.121] [val/perplexity_len_512: 1237.723]
29
+ [2025-10-27 03:49:42][utils:57][INFO] [P: 7.00%] [S: 146800640/2097152000] [T: 0:12:36] [ETA: 2:47:30] [loss: 6.983] [tokens/s: 193651.029] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
30
+ [2025-10-27 03:50:38][utils:57][INFO] [P: 8.00%] [S: 167772160/2097152000] [T: 0:13:32] [ETA: 2:35:44] [loss: 6.816] [tokens/s: 206255.080] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000]
31
+ [2025-10-27 03:50:38][train:194][INFO] Running validation...
32
+ [2025-10-27 03:52:38][logger:171][INFO] [step: 167772160] [val/train_token_count: 167772160] [val/train_batch_count: 80] [val/train_flop_count: 0] [val/train_total_time: 812.539] [val/train_update_time: 450.724] [val/loss: 6.803] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.847] [val/val_tokens_per_second: 341769.433] [val/loss_avg_len_2048: 6.803] [val/perplexity_len_2048: 900.727] [val/loss_avg_len_1024: 6.809] [val/perplexity_len_1024: 905.751] [val/loss_avg_len_512: 6.823] [val/perplexity_len_512: 918.477]
33
+ [2025-10-27 03:53:34][utils:57][INFO] [P: 9.00%] [S: 188743680/2097152000] [T: 0:16:28] [ETA: 2:46:33] [loss: 6.659] [tokens/s: 190629.413] [batches/s: 0.091] [MFU: 0.000] [TFLOPS: 0.000]
34
+ [2025-10-27 03:54:30][utils:57][INFO] [P: 10.00%] [S: 209715200/2097152000] [T: 0:17:24] [ETA: 2:36:39] [loss: 6.568] [tokens/s: 200575.613] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
35
+ [2025-10-27 03:54:30][logger:171][INFO] [step: 209715200] [train_eval/train_token_count: 209715200] [train_eval/train_batch_count: 100] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1044.397] [train_eval/train_update_time: 562.480] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.904] [train_eval/perplexity_len_2048: 995.979] [train_eval/loss_avg_len_1024: 6.910] [train_eval/perplexity_len_1024: 1002.480] [train_eval/loss_avg_len_512: 6.921] [train_eval/perplexity_len_512: 1013.785]
36
+ [2025-10-27 03:54:30][train:194][INFO] Running validation...
37
+ [2025-10-27 03:56:31][logger:171][INFO] [step: 209715200] [val/train_token_count: 209715200] [val/train_batch_count: 100] [val/train_flop_count: 0] [val/train_total_time: 1044.397] [val/train_update_time: 562.480] [val/loss: 6.553] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.570] [val/val_tokens_per_second: 339718.456] [val/loss_avg_len_2048: 6.553] [val/perplexity_len_2048: 701.475] [val/loss_avg_len_1024: 6.560] [val/perplexity_len_1024: 706.290] [val/loss_avg_len_512: 6.577] [val/perplexity_len_512: 718.225]
38
+ [2025-10-27 03:56:31][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000209715200.pt...
39
+ [2025-10-27 03:56:31][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000209715200.pt.
40
+ [2025-10-27 03:56:31][logger:171][INFO] [step: 209715200] [checkpoint/checkpoint_time: 0.455]
41
+ [2025-10-27 03:57:27][utils:57][INFO] [P: 11.00%] [S: 230686720/2097152000] [T: 0:20:21] [ETA: 2:44:42] [loss: 6.480] [tokens/s: 179587.760] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
42
+ [2025-10-27 03:58:23][utils:57][INFO] [P: 12.00%] [S: 251658240/2097152000] [T: 0:21:17] [ETA: 2:36:08] [loss: 6.361] [tokens/s: 200431.941] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
43
+ [2025-10-27 03:58:23][train:194][INFO] Running validation...
44
+ [2025-10-27 04:00:23][logger:171][INFO] [step: 251658240] [val/train_token_count: 251658240] [val/train_batch_count: 120] [val/train_flop_count: 0] [val/train_total_time: 1277.468] [val/train_update_time: 674.263] [val/loss: 6.369] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.470] [val/val_tokens_per_second: 342847.910] [val/loss_avg_len_2048: 6.369] [val/perplexity_len_2048: 583.192] [val/loss_avg_len_1024: 6.376] [val/perplexity_len_1024: 587.741] [val/loss_avg_len_512: 6.395] [val/perplexity_len_512: 598.664]
45
+ [2025-10-27 04:01:19][utils:57][INFO] [P: 13.00%] [S: 272629760/2097152000] [T: 0:24:12] [ETA: 2:42:03] [loss: 6.323] [tokens/s: 179702.779] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
46
+ [2025-10-27 04:02:15][utils:57][INFO] [P: 14.00%] [S: 293601280/2097152000] [T: 0:25:09] [ETA: 2:34:29] [loss: 6.252] [tokens/s: 200506.171] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
47
+ [2025-10-27 04:02:15][train:194][INFO] Running validation...
48
+ [2025-10-27 04:04:15][logger:171][INFO] [step: 293601280] [val/train_token_count: 293601280] [val/train_batch_count: 140] [val/train_flop_count: 0] [val/train_total_time: 1509.006] [val/train_update_time: 786.073] [val/loss: 6.228] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.570] [val/val_tokens_per_second: 339719.144] [val/loss_avg_len_2048: 6.228] [val/perplexity_len_2048: 506.910] [val/loss_avg_len_1024: 6.237] [val/perplexity_len_1024: 511.099] [val/loss_avg_len_512: 6.256] [val/perplexity_len_512: 521.062]
49
+ [2025-10-27 04:05:11][utils:57][INFO] [P: 15.00%] [S: 314572800/2097152000] [T: 0:28:05] [ETA: 2:39:11] [loss: 6.138] [tokens/s: 179593.825] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
50
+ [2025-10-27 04:05:11][logger:171][INFO] [step: 314572800] [train_eval/train_token_count: 314572800] [train_eval/train_batch_count: 150] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1685.597] [train_eval/train_update_time: 841.968] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.347] [train_eval/perplexity_len_2048: 570.689] [train_eval/loss_avg_len_1024: 6.356] [train_eval/perplexity_len_1024: 575.860] [train_eval/loss_avg_len_512: 6.374] [train_eval/perplexity_len_512: 586.376]
51
+ [2025-10-27 04:06:07][utils:57][INFO] [P: 16.00%] [S: 335544320/2097152000] [T: 0:29:01] [ETA: 2:32:23] [loss: 6.105] [tokens/s: 200502.712] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
52
+ [2025-10-27 04:06:07][train:194][INFO] Running validation...
53
+ [2025-10-27 04:08:08][logger:171][INFO] [step: 335544320] [val/train_token_count: 335544320] [val/train_batch_count: 160] [val/train_flop_count: 0] [val/train_total_time: 1741.618] [val/train_update_time: 897.865] [val/loss: 6.099] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.504] [val/val_tokens_per_second: 339904.656] [val/loss_avg_len_2048: 6.099] [val/perplexity_len_2048: 445.490] [val/loss_avg_len_1024: 6.108] [val/perplexity_len_1024: 449.482] [val/loss_avg_len_512: 6.129] [val/perplexity_len_512: 458.832]
54
+ [2025-10-27 04:09:04][utils:57][INFO] [P: 17.00%] [S: 356515840/2097152000] [T: 0:31:58] [ETA: 2:36:05] [loss: 6.023] [tokens/s: 179598.821] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
55
+ [2025-10-27 04:10:00][utils:57][INFO] [P: 18.00%] [S: 377487360/2097152000] [T: 0:32:54] [ETA: 2:29:53] [loss: 5.996] [tokens/s: 200371.906] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
56
+ [2025-10-27 04:10:00][train:194][INFO] Running validation...
57
+ [2025-10-27 04:11:59][logger:171][INFO] [step: 377487360] [val/train_token_count: 377487360] [val/train_batch_count: 180] [val/train_flop_count: 0] [val/train_total_time: 1974.161] [val/train_update_time: 1009.651] [val/loss: 5.986] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.612] [val/val_tokens_per_second: 342440.603] [val/loss_avg_len_2048: 5.986] [val/perplexity_len_2048: 397.834] [val/loss_avg_len_1024: 5.996] [val/perplexity_len_1024: 401.624] [val/loss_avg_len_512: 6.017] [val/perplexity_len_512: 410.374]
58
+ [2025-10-27 04:12:55][utils:57][INFO] [P: 19.00%] [S: 398458880/2097152000] [T: 0:35:49] [ETA: 2:32:44] [loss: 5.970] [tokens/s: 179632.307] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
59
+ [2025-10-27 04:13:51][utils:57][INFO] [P: 20.00%] [S: 419430400/2097152000] [T: 0:36:45] [ETA: 2:27:03] [loss: 5.881] [tokens/s: 200641.961] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
60
+ [2025-10-27 04:13:51][logger:171][INFO] [step: 419430400] [train_eval/train_token_count: 419430400] [train_eval/train_batch_count: 200] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2205.813] [train_eval/train_update_time: 1121.443] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.021] [train_eval/perplexity_len_2048: 411.906] [train_eval/loss_avg_len_1024: 6.031] [train_eval/perplexity_len_1024: 416.312] [train_eval/loss_avg_len_512: 6.051] [train_eval/perplexity_len_512: 424.479]
61
+ [2025-10-27 04:13:51][train:194][INFO] Running validation...
62
+ [2025-10-27 04:15:51][logger:171][INFO] [step: 419430400] [val/train_token_count: 419430400] [val/train_batch_count: 200] [val/train_flop_count: 0] [val/train_total_time: 2205.813] [val/train_update_time: 1121.443] [val/loss: 5.883] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.178] [val/val_tokens_per_second: 343687.850] [val/loss_avg_len_2048: 5.883] [val/perplexity_len_2048: 358.943] [val/loss_avg_len_1024: 5.893] [val/perplexity_len_1024: 362.558] [val/loss_avg_len_512: 5.916] [val/perplexity_len_512: 370.873]
63
+ [2025-10-27 04:15:51][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000419430400.pt...
64
+ [2025-10-27 04:15:51][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000419430400.pt.
65
+ [2025-10-27 04:15:51][logger:171][INFO] [step: 419430400] [checkpoint/checkpoint_time: 0.455]
66
+ [2025-10-27 04:16:47][utils:57][INFO] [P: 21.00%] [S: 440401920/2097152000] [T: 0:39:41] [ETA: 2:29:18] [loss: 5.837] [tokens/s: 179848.814] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
67
+ [2025-10-27 04:17:43][utils:57][INFO] [P: 22.00%] [S: 461373440/2097152000] [T: 0:40:37] [ETA: 2:24:01] [loss: 5.823] [tokens/s: 200615.467] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
68
+ [2025-10-27 04:17:43][train:194][INFO] Running validation...
69
+ [2025-10-27 04:19:43][logger:171][INFO] [step: 461373440] [val/train_token_count: 461373440] [val/train_batch_count: 220] [val/train_flop_count: 0] [val/train_total_time: 2437.460] [val/train_update_time: 1233.201] [val/loss: 5.795] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.896] [val/val_tokens_per_second: 341630.119] [val/loss_avg_len_2048: 5.795] [val/perplexity_len_2048: 328.687] [val/loss_avg_len_1024: 5.806] [val/perplexity_len_1024: 332.227] [val/loss_avg_len_512: 5.830] [val/perplexity_len_512: 340.216]
70
+ [2025-10-27 04:20:39][utils:57][INFO] [P: 23.00%] [S: 482344960/2097152000] [T: 0:43:33] [ETA: 2:25:49] [loss: 5.743] [tokens/s: 179790.652] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
71
+ [2025-10-27 04:21:35][utils:57][INFO] [P: 24.00%] [S: 503316480/2097152000] [T: 0:44:29] [ETA: 2:20:52] [loss: 5.686] [tokens/s: 200756.837] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
72
+ [2025-10-27 04:21:35][train:194][INFO] Running validation...
73
+ [2025-10-27 04:23:36][logger:171][INFO] [step: 503316480] [val/train_token_count: 503316480] [val/train_batch_count: 240] [val/train_flop_count: 0] [val/train_total_time: 2669.367] [val/train_update_time: 1344.966] [val/loss: 5.713] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.506] [val/val_tokens_per_second: 339899.240] [val/loss_avg_len_2048: 5.713] [val/perplexity_len_2048: 302.919] [val/loss_avg_len_1024: 5.725] [val/perplexity_len_1024: 306.316] [val/loss_avg_len_512: 5.749] [val/perplexity_len_512: 313.805]
74
+ [2025-10-27 04:24:32][utils:57][INFO] [P: 25.00%] [S: 524288000/2097152000] [T: 0:47:25] [ETA: 2:22:17] [loss: 5.691] [tokens/s: 179808.593] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
75
+ [2025-10-27 04:24:32][logger:171][INFO] [step: 524288000] [train_eval/train_token_count: 524288000] [train_eval/train_batch_count: 250] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2845.865] [train_eval/train_update_time: 1400.844] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.778] [train_eval/perplexity_len_2048: 323.158] [train_eval/loss_avg_len_1024: 5.789] [train_eval/perplexity_len_1024: 326.574] [train_eval/loss_avg_len_512: 5.810] [train_eval/perplexity_len_512: 333.500]
76
+ [2025-10-27 04:25:28][utils:57][INFO] [P: 26.00%] [S: 545259520/2097152000] [T: 0:48:21] [ETA: 2:17:39] [loss: 5.634] [tokens/s: 200763.021] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
77
+ [2025-10-27 04:25:28][train:194][INFO] Running validation...
78
+ [2025-10-27 04:27:28][logger:171][INFO] [step: 545259520] [val/train_token_count: 545259520] [val/train_batch_count: 260] [val/train_flop_count: 0] [val/train_total_time: 2901.881] [val/train_update_time: 1456.732] [val/loss: 5.634] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.060] [val/val_tokens_per_second: 341161.590] [val/loss_avg_len_2048: 5.634] [val/perplexity_len_2048: 279.814] [val/loss_avg_len_1024: 5.646] [val/perplexity_len_1024: 283.147] [val/loss_avg_len_512: 5.671] [val/perplexity_len_512: 290.398]
79
+ [2025-10-27 04:28:24][utils:57][INFO] [P: 27.00%] [S: 566231040/2097152000] [T: 0:51:17] [ETA: 2:18:41] [loss: 5.615] [tokens/s: 179880.368] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
80
+ [2025-10-27 04:29:20][utils:57][INFO] [P: 28.00%] [S: 587202560/2097152000] [T: 0:52:13] [ETA: 2:14:18] [loss: 5.561] [tokens/s: 200682.059] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
81
+ [2025-10-27 04:29:20][train:194][INFO] Running validation...
82
+ [2025-10-27 04:31:19][logger:171][INFO] [step: 587202560] [val/train_token_count: 587202560] [val/train_batch_count: 280] [val/train_flop_count: 0] [val/train_total_time: 3133.951] [val/train_update_time: 1568.504] [val/loss: 5.565] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.844] [val/val_tokens_per_second: 341776.578] [val/loss_avg_len_2048: 5.565] [val/perplexity_len_2048: 261.231] [val/loss_avg_len_1024: 5.578] [val/perplexity_len_1024: 264.445] [val/loss_avg_len_512: 5.603] [val/perplexity_len_512: 271.330]
83
+ [2025-10-27 04:32:15][utils:57][INFO] [P: 29.00%] [S: 608174080/2097152000] [T: 0:55:09] [ETA: 2:15:03] [loss: 5.527] [tokens/s: 179848.176] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
84
+ [2025-10-27 04:33:11][utils:57][INFO] [P: 30.00%] [S: 629145600/2097152000] [T: 0:56:05] [ETA: 2:10:53] [loss: 5.491] [tokens/s: 200645.256] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
85
+ [2025-10-27 04:33:11][logger:171][INFO] [step: 629145600] [train_eval/train_token_count: 629145600] [train_eval/train_batch_count: 300] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3365.816] [train_eval/train_update_time: 1680.293] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.594] [train_eval/perplexity_len_2048: 268.930] [train_eval/loss_avg_len_1024: 5.605] [train_eval/perplexity_len_1024: 271.718] [train_eval/loss_avg_len_512: 5.627] [train_eval/perplexity_len_512: 277.868]
86
+ [2025-10-27 04:33:11][train:194][INFO] Running validation...
87
+ [2025-10-27 04:35:11][logger:171][INFO] [step: 629145600] [val/train_token_count: 629145600] [val/train_batch_count: 300] [val/train_flop_count: 0] [val/train_total_time: 3365.816] [val/train_update_time: 1680.293] [val/loss: 5.506] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.833] [val/val_tokens_per_second: 341809.430] [val/loss_avg_len_2048: 5.506] [val/perplexity_len_2048: 246.042] [val/loss_avg_len_1024: 5.518] [val/perplexity_len_1024: 249.139] [val/loss_avg_len_512: 5.544] [val/perplexity_len_512: 255.820]
88
+ [2025-10-27 04:35:11][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000629145600.pt...
89
+ [2025-10-27 04:35:12][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000629145600.pt.
90
+ [2025-10-27 04:35:12][logger:171][INFO] [step: 629145600] [checkpoint/checkpoint_time: 0.459]
91
+ [2025-10-27 04:36:08][utils:57][INFO] [P: 31.00%] [S: 650117120/2097152000] [T: 0:59:02] [ETA: 2:11:24] [loss: 5.506] [tokens/s: 179748.182] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
92
+ [2025-10-27 04:37:04][utils:57][INFO] [P: 32.00%] [S: 671088640/2097152000] [T: 0:59:58] [ETA: 2:07:25] [loss: 5.437] [tokens/s: 200568.874] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
93
+ [2025-10-27 04:37:04][train:194][INFO] Running validation...
94
+ [2025-10-27 04:39:04][logger:171][INFO] [step: 671088640] [val/train_token_count: 671088640] [val/train_batch_count: 320] [val/train_flop_count: 0] [val/train_total_time: 3598.115] [val/train_update_time: 1792.082] [val/loss: 5.452] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.860] [val/val_tokens_per_second: 341732.461] [val/loss_avg_len_2048: 5.452] [val/perplexity_len_2048: 233.196] [val/loss_avg_len_1024: 5.465] [val/perplexity_len_1024: 236.183] [val/loss_avg_len_512: 5.491] [val/perplexity_len_512: 242.552]
95
+ [2025-10-27 04:40:00][utils:57][INFO] [P: 33.00%] [S: 692060160/2097152000] [T: 1:02:53] [ETA: 2:07:42] [loss: 5.447] [tokens/s: 179751.110] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
96
+ [2025-10-27 04:40:56][utils:57][INFO] [P: 34.00%] [S: 713031680/2097152000] [T: 1:03:49] [ETA: 2:03:54] [loss: 5.411] [tokens/s: 200693.607] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
97
+ [2025-10-27 04:40:56][train:194][INFO] Running validation...
98
+ [2025-10-27 04:42:56][logger:171][INFO] [step: 713031680] [val/train_token_count: 713031680] [val/train_batch_count: 340] [val/train_flop_count: 0] [val/train_total_time: 3829.989] [val/train_update_time: 1903.885] [val/loss: 5.409] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.898] [val/val_tokens_per_second: 341623.094] [val/loss_avg_len_2048: 5.409] [val/perplexity_len_2048: 223.310] [val/loss_avg_len_1024: 5.422] [val/perplexity_len_1024: 226.222] [val/loss_avg_len_512: 5.448] [val/perplexity_len_512: 232.368]
99
+ [2025-10-27 04:43:52][utils:57][INFO] [P: 35.00%] [S: 734003200/2097152000] [T: 1:06:45] [ETA: 2:03:59] [loss: 5.392] [tokens/s: 179847.137] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
100
+ [2025-10-27 04:43:52][logger:171][INFO] [step: 734003200] [train_eval/train_token_count: 734003200] [train_eval/train_batch_count: 350] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4005.888] [train_eval/train_update_time: 1959.769] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.446] [train_eval/perplexity_len_2048: 231.832] [train_eval/loss_avg_len_1024: 5.459] [train_eval/perplexity_len_1024: 234.791] [train_eval/loss_avg_len_512: 5.483] [train_eval/perplexity_len_512: 240.578]
101
+ [2025-10-27 04:44:48][utils:57][INFO] [P: 36.00%] [S: 754974720/2097152000] [T: 1:07:41] [ETA: 2:00:21] [loss: 5.313] [tokens/s: 200728.016] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
102
+ [2025-10-27 04:44:48][train:194][INFO] Running validation...
103
+ [2025-10-27 04:46:48][logger:171][INFO] [step: 754974720] [val/train_token_count: 754974720] [val/train_batch_count: 360] [val/train_flop_count: 0] [val/train_total_time: 4061.882] [val/train_update_time: 2015.659] [val/loss: 5.361] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.402] [val/val_tokens_per_second: 340193.085] [val/loss_avg_len_2048: 5.361] [val/perplexity_len_2048: 212.879] [val/loss_avg_len_1024: 5.374] [val/perplexity_len_1024: 215.726] [val/loss_avg_len_512: 5.401] [val/perplexity_len_512: 221.708]
104
+ [2025-10-27 04:47:44][utils:57][INFO] [P: 37.00%] [S: 775946240/2097152000] [T: 1:10:38] [ETA: 2:00:16] [loss: 5.341] [tokens/s: 179800.569] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
105
+ [2025-10-27 04:48:40][utils:57][INFO] [P: 38.00%] [S: 796917760/2097152000] [T: 1:11:34] [ETA: 1:56:46] [loss: 5.317] [tokens/s: 200620.763] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
106
+ [2025-10-27 04:48:40][train:194][INFO] Running validation...
107
+ [2025-10-27 04:50:40][logger:171][INFO] [step: 796917760] [val/train_token_count: 796917760] [val/train_batch_count: 380] [val/train_flop_count: 0] [val/train_total_time: 4294.288] [val/train_update_time: 2127.429] [val/loss: 5.324] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.297] [val/val_tokens_per_second: 340491.152] [val/loss_avg_len_2048: 5.324] [val/perplexity_len_2048: 205.267] [val/loss_avg_len_1024: 5.338] [val/perplexity_len_1024: 208.015] [val/loss_avg_len_512: 5.365] [val/perplexity_len_512: 213.819]
108
+ [2025-10-27 04:51:36][utils:57][INFO] [P: 39.00%] [S: 817889280/2097152000] [T: 1:14:30] [ETA: 1:56:32] [loss: 5.319] [tokens/s: 179730.346] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
109
+ [2025-10-27 04:52:32][utils:57][INFO] [P: 40.00%] [S: 838860800/2097152000] [T: 1:15:26] [ETA: 1:53:09] [loss: 5.243] [tokens/s: 200620.450] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
110
+ [2025-10-27 04:52:32][logger:171][INFO] [step: 838860800] [train_eval/train_token_count: 838860800] [train_eval/train_batch_count: 400] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4526.608] [train_eval/train_update_time: 2239.198] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.336] [train_eval/perplexity_len_2048: 207.627] [train_eval/loss_avg_len_1024: 5.348] [train_eval/perplexity_len_1024: 210.191] [train_eval/loss_avg_len_512: 5.373] [train_eval/perplexity_len_512: 215.473]
111
+ [2025-10-27 04:52:32][train:194][INFO] Running validation...
112
+ [2025-10-27 04:54:33][logger:171][INFO] [step: 838860800] [val/train_token_count: 838860800] [val/train_batch_count: 400] [val/train_flop_count: 0] [val/train_total_time: 4526.608] [val/train_update_time: 2239.198] [val/loss: 5.287] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.714] [val/val_tokens_per_second: 339313.701] [val/loss_avg_len_2048: 5.287] [val/perplexity_len_2048: 197.731] [val/loss_avg_len_1024: 5.300] [val/perplexity_len_1024: 200.423] [val/loss_avg_len_512: 5.328] [val/perplexity_len_512: 206.096]
113
+ [2025-10-27 04:54:33][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000838860800.pt...
114
+ [2025-10-27 04:54:33][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000838860800.pt.
115
+ [2025-10-27 04:54:33][logger:171][INFO] [step: 838860800] [checkpoint/checkpoint_time: 0.459]
116
+ [2025-10-27 04:55:29][utils:57][INFO] [P: 41.00%] [S: 859832320/2097152000] [T: 1:18:23] [ETA: 1:52:48] [loss: 5.248] [tokens/s: 179589.730] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
117
+ [2025-10-27 04:56:25][utils:57][INFO] [P: 42.00%] [S: 880803840/2097152000] [T: 1:19:19] [ETA: 1:49:33] [loss: 5.253] [tokens/s: 200364.752] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
118
+ [2025-10-27 04:56:25][train:194][INFO] Running validation...
119
+ [2025-10-27 04:58:26][logger:171][INFO] [step: 880803840] [val/train_token_count: 880803840] [val/train_batch_count: 420] [val/train_flop_count: 0] [val/train_total_time: 4759.794] [val/train_update_time: 2350.987] [val/loss: 5.255] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.730] [val/val_tokens_per_second: 339268.688] [val/loss_avg_len_2048: 5.255] [val/perplexity_len_2048: 191.460] [val/loss_avg_len_1024: 5.269] [val/perplexity_len_1024: 194.143] [val/loss_avg_len_512: 5.297] [val/perplexity_len_512: 199.680]
120
+ [2025-10-27 04:59:22][utils:57][INFO] [P: 43.00%] [S: 901775360/2097152000] [T: 1:22:16] [ETA: 1:49:03] [loss: 5.238] [tokens/s: 179454.865] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
121
+ [2025-10-27 05:00:18][utils:57][INFO] [P: 44.00%] [S: 922746880/2097152000] [T: 1:23:12] [ETA: 1:45:54] [loss: 5.259] [tokens/s: 200198.746] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
122
+ [2025-10-27 05:00:18][train:194][INFO] Running validation...
123
+ [2025-10-27 05:02:18][logger:171][INFO] [step: 922746880] [val/train_token_count: 922746880] [val/train_batch_count: 440] [val/train_flop_count: 0] [val/train_total_time: 4992.560] [val/train_update_time: 2462.802] [val/loss: 5.223] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.625] [val/val_tokens_per_second: 342404.151] [val/loss_avg_len_2048: 5.223] [val/perplexity_len_2048: 185.522] [val/loss_avg_len_1024: 5.237] [val/perplexity_len_1024: 188.170] [val/loss_avg_len_512: 5.266] [val/perplexity_len_512: 193.639]
124
+ [2025-10-27 05:03:14][utils:57][INFO] [P: 45.00%] [S: 943718400/2097152000] [T: 1:26:08] [ETA: 1:45:16] [loss: 5.206] [tokens/s: 179491.302] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
125
+ [2025-10-27 05:03:14][logger:171][INFO] [step: 943718400] [train_eval/train_token_count: 943718400] [train_eval/train_batch_count: 450] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5168.203] [train_eval/train_update_time: 2518.705] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.247] [train_eval/perplexity_len_2048: 189.911] [train_eval/loss_avg_len_1024: 5.261] [train_eval/perplexity_len_1024: 192.634] [train_eval/loss_avg_len_512: 5.288] [train_eval/perplexity_len_512: 197.932]
126
+ [2025-10-27 05:04:10][utils:57][INFO] [P: 46.00%] [S: 964689920/2097152000] [T: 1:27:04] [ETA: 1:42:12] [loss: 5.177] [tokens/s: 200344.522] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
127
+ [2025-10-27 05:04:10][train:194][INFO] Running validation...
128
+ [2025-10-27 05:06:09][logger:171][INFO] [step: 964689920] [val/train_token_count: 964689920] [val/train_batch_count: 460] [val/train_flop_count: 0] [val/train_total_time: 5224.203] [val/train_update_time: 2574.594] [val/loss: 5.195] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.144] [val/val_tokens_per_second: 343785.878] [val/loss_avg_len_2048: 5.195] [val/perplexity_len_2048: 180.375] [val/loss_avg_len_1024: 5.209] [val/perplexity_len_1024: 182.956] [val/loss_avg_len_512: 5.238] [val/perplexity_len_512: 188.280]
129
+ [2025-10-27 05:07:05][utils:57][INFO] [P: 47.00%] [S: 985661440/2097152000] [T: 1:29:59] [ETA: 1:41:28] [loss: 5.196] [tokens/s: 179680.911] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
130
+ [2025-10-27 05:08:01][utils:57][INFO] [P: 48.00%] [S: 1006632960/2097152000] [T: 1:30:55] [ETA: 1:38:29] [loss: 5.169] [tokens/s: 200561.954] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
131
+ [2025-10-27 05:08:01][train:194][INFO] Running validation...
132
+ [2025-10-27 05:10:01][logger:171][INFO] [step: 1006632960] [val/train_token_count: 1006632960] [val/train_batch_count: 480] [val/train_flop_count: 0] [val/train_total_time: 5455.381] [val/train_update_time: 2686.379] [val/loss: 5.172] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.802] [val/val_tokens_per_second: 341896.504] [val/loss_avg_len_2048: 5.172] [val/perplexity_len_2048: 176.330] [val/loss_avg_len_1024: 5.187] [val/perplexity_len_1024: 178.843] [val/loss_avg_len_512: 5.215] [val/perplexity_len_512: 184.086]
133
+ [2025-10-27 05:10:57][utils:57][INFO] [P: 49.00%] [S: 1027604480/2097152000] [T: 1:33:51] [ETA: 1:37:41] [loss: 5.173] [tokens/s: 179753.535] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
134
+ [2025-10-27 05:11:53][utils:57][INFO] [P: 50.00%] [S: 1048576000/2097152000] [T: 1:34:47] [ETA: 1:34:47] [loss: 5.149] [tokens/s: 200824.829] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
135
+ [2025-10-27 05:11:53][logger:171][INFO] [step: 1048576000] [train_eval/train_token_count: 1048576000] [train_eval/train_batch_count: 500] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5687.227] [train_eval/train_update_time: 2798.155] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.178] [train_eval/perplexity_len_2048: 177.286] [train_eval/loss_avg_len_1024: 5.191] [train_eval/perplexity_len_1024: 179.726] [train_eval/loss_avg_len_512: 5.219] [train_eval/perplexity_len_512: 184.822]
136
+ [2025-10-27 05:11:53][train:194][INFO] Running validation...
137
+ [2025-10-27 05:13:52][logger:171][INFO] [step: 1048576000] [val/train_token_count: 1048576000] [val/train_batch_count: 500] [val/train_flop_count: 0] [val/train_total_time: 5687.227] [val/train_update_time: 2798.155] [val/loss: 5.148] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.429] [val/val_tokens_per_second: 342965.072] [val/loss_avg_len_2048: 5.148] [val/perplexity_len_2048: 172.020] [val/loss_avg_len_1024: 5.162] [val/perplexity_len_1024: 174.547] [val/loss_avg_len_512: 5.192] [val/perplexity_len_512: 179.740]
138
+ [2025-10-27 05:13:52][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001048576000.pt...
139
+ [2025-10-27 05:13:53][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001048576000.pt.
140
+ [2025-10-27 05:13:53][logger:171][INFO] [step: 1048576000] [checkpoint/checkpoint_time: 0.454]
141
+ [2025-10-27 05:14:49][utils:57][INFO] [P: 51.00%] [S: 1069547520/2097152000] [T: 1:37:43] [ETA: 1:33:53] [loss: 5.146] [tokens/s: 179954.513] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
142
+ [2025-10-27 05:15:45][utils:57][INFO] [P: 52.00%] [S: 1090519040/2097152000] [T: 1:38:39] [ETA: 1:31:03] [loss: 5.138] [tokens/s: 200985.257] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
143
+ [2025-10-27 05:15:45][train:194][INFO] Running validation...
144
+ [2025-10-27 05:17:45][logger:171][INFO] [step: 1090519040] [val/train_token_count: 1090519040] [val/train_batch_count: 520] [val/train_flop_count: 0] [val/train_total_time: 5919.139] [val/train_update_time: 2909.934] [val/loss: 5.129] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.833] [val/val_tokens_per_second: 341809.707] [val/loss_avg_len_2048: 5.129] [val/perplexity_len_2048: 168.872] [val/loss_avg_len_1024: 5.144] [val/perplexity_len_1024: 171.367] [val/loss_avg_len_512: 5.173] [val/perplexity_len_512: 176.448]
145
+ [2025-10-27 05:18:41][utils:57][INFO] [P: 53.00%] [S: 1111490560/2097152000] [T: 1:41:34] [ETA: 1:30:04] [loss: 5.096] [tokens/s: 180092.637] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
146
+ [2025-10-27 05:19:37][utils:57][INFO] [P: 54.00%] [S: 1132462080/2097152000] [T: 1:42:31] [ETA: 1:27:19] [loss: 5.110] [tokens/s: 200946.078] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
147
+ [2025-10-27 05:19:37][train:194][INFO] Running validation...
148
+ [2025-10-27 05:21:37][logger:171][INFO] [step: 1132462080] [val/train_token_count: 1132462080] [val/train_batch_count: 540] [val/train_flop_count: 0] [val/train_total_time: 6151.006] [val/train_update_time: 3021.723] [val/loss: 5.107] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.466] [val/val_tokens_per_second: 340013.135] [val/loss_avg_len_2048: 5.107] [val/perplexity_len_2048: 165.223] [val/loss_avg_len_1024: 5.122] [val/perplexity_len_1024: 167.702] [val/loss_avg_len_512: 5.152] [val/perplexity_len_512: 172.729]
149
+ [2025-10-27 05:22:33][utils:57][INFO] [P: 55.00%] [S: 1153433600/2097152000] [T: 1:45:27] [ETA: 1:26:17] [loss: 5.059] [tokens/s: 179961.857] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
150
+ [2025-10-27 05:22:33][logger:171][INFO] [step: 1153433600] [train_eval/train_token_count: 1153433600] [train_eval/train_batch_count: 550] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6327.491] [train_eval/train_update_time: 3077.614] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.120] [train_eval/perplexity_len_2048: 167.296] [train_eval/loss_avg_len_1024: 5.131] [train_eval/perplexity_len_1024: 169.181] [train_eval/loss_avg_len_512: 5.157] [train_eval/perplexity_len_512: 173.589]
151
+ [2025-10-27 05:23:29][utils:57][INFO] [P: 56.00%] [S: 1174405120/2097152000] [T: 1:46:23] [ETA: 1:23:35] [loss: 5.097] [tokens/s: 200683.231] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
152
+ [2025-10-27 05:23:29][train:194][INFO] Running validation...
153
+ [2025-10-27 05:25:30][logger:171][INFO] [step: 1174405120] [val/train_token_count: 1174405120] [val/train_batch_count: 560] [val/train_flop_count: 0] [val/train_total_time: 6383.517] [val/train_update_time: 3133.514] [val/loss: 5.090] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.480] [val/val_tokens_per_second: 339973.275] [val/loss_avg_len_2048: 5.090] [val/perplexity_len_2048: 162.439] [val/loss_avg_len_1024: 5.105] [val/perplexity_len_1024: 164.907] [val/loss_avg_len_512: 5.135] [val/perplexity_len_512: 169.884]
154
+ [2025-10-27 05:26:26][utils:57][INFO] [P: 57.00%] [S: 1195376640/2097152000] [T: 1:49:20] [ETA: 1:22:28] [loss: 5.066] [tokens/s: 179747.453] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
155
+ [2025-10-27 05:27:22][utils:57][INFO] [P: 58.00%] [S: 1216348160/2097152000] [T: 1:50:16] [ETA: 1:19:50] [loss: 5.097] [tokens/s: 200547.196] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
156
+ [2025-10-27 05:27:22][train:194][INFO] Running validation...
157
+ [2025-10-27 05:29:22][logger:171][INFO] [step: 1216348160] [val/train_token_count: 1216348160] [val/train_batch_count: 580] [val/train_flop_count: 0] [val/train_total_time: 6616.056] [val/train_update_time: 3245.323] [val/loss: 5.075] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.296] [val/val_tokens_per_second: 340492.586] [val/loss_avg_len_2048: 5.075] [val/perplexity_len_2048: 159.910] [val/loss_avg_len_1024: 5.090] [val/perplexity_len_1024: 162.345] [val/loss_avg_len_512: 5.120] [val/perplexity_len_512: 167.255]
158
+ [2025-10-27 05:30:18][utils:57][INFO] [P: 59.00%] [S: 1237319680/2097152000] [T: 1:53:12] [ETA: 1:18:40] [loss: 5.074] [tokens/s: 179667.132] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
159
+ [2025-10-27 05:31:14][utils:57][INFO] [P: 60.00%] [S: 1258291200/2097152000] [T: 1:54:08] [ETA: 1:16:05] [loss: 5.077] [tokens/s: 200469.004] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
160
+ [2025-10-27 05:31:14][logger:171][INFO] [step: 1258291200] [train_eval/train_token_count: 1258291200] [train_eval/train_batch_count: 600] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6848.388] [train_eval/train_update_time: 3357.135] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.074] [train_eval/perplexity_len_2048: 159.838] [train_eval/loss_avg_len_1024: 5.085] [train_eval/perplexity_len_1024: 161.616] [train_eval/loss_avg_len_512: 5.112] [train_eval/perplexity_len_512: 166.022]
161
+ [2025-10-27 05:31:14][train:194][INFO] Running validation...
162
+ [2025-10-27 05:33:15][logger:171][INFO] [step: 1258291200] [val/train_token_count: 1258291200] [val/train_batch_count: 600] [val/train_flop_count: 0] [val/train_total_time: 6848.388] [val/train_update_time: 3357.135] [val/loss: 5.058] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.721] [val/val_tokens_per_second: 339295.425] [val/loss_avg_len_2048: 5.058] [val/perplexity_len_2048: 157.273] [val/loss_avg_len_1024: 5.073] [val/perplexity_len_1024: 159.697] [val/loss_avg_len_512: 5.103] [val/perplexity_len_512: 164.585]
163
+ [2025-10-27 05:33:15][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001258291200.pt...
164
+ [2025-10-27 05:33:15][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001258291200.pt.
165
+ [2025-10-27 05:33:15][logger:171][INFO] [step: 1258291200] [checkpoint/checkpoint_time: 0.462]
166
+ [2025-10-27 05:34:11][utils:57][INFO] [P: 61.00%] [S: 1279262720/2097152000] [T: 1:57:05] [ETA: 1:14:51] [loss: 5.060] [tokens/s: 179463.572] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
167
+ [2025-10-27 05:35:07][utils:57][INFO] [P: 62.00%] [S: 1300234240/2097152000] [T: 1:58:01] [ETA: 1:12:20] [loss: 5.036] [tokens/s: 200202.368] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
168
+ [2025-10-27 05:35:07][train:194][INFO] Running validation...
169
+ [2025-10-27 05:37:07][logger:171][INFO] [step: 1300234240] [val/train_token_count: 1300234240] [val/train_batch_count: 620] [val/train_flop_count: 0] [val/train_total_time: 7081.627] [val/train_update_time: 3468.931] [val/loss: 5.044] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.076] [val/val_tokens_per_second: 341117.977] [val/loss_avg_len_2048: 5.044] [val/perplexity_len_2048: 155.132] [val/loss_avg_len_1024: 5.060] [val/perplexity_len_1024: 157.543] [val/loss_avg_len_512: 5.090] [val/perplexity_len_512: 162.375]
170
+ [2025-10-27 05:38:03][utils:57][INFO] [P: 63.00%] [S: 1321205760/2097152000] [T: 2:00:57] [ETA: 1:11:02] [loss: 5.049] [tokens/s: 179421.736] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
171
+ [2025-10-27 05:38:59][utils:57][INFO] [P: 64.00%] [S: 1342177280/2097152000] [T: 2:01:53] [ETA: 1:08:33] [loss: 5.052] [tokens/s: 200271.931] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
172
+ [2025-10-27 05:38:59][train:194][INFO] Running validation...
173
+ [2025-10-27 05:41:00][logger:171][INFO] [step: 1342177280] [val/train_token_count: 1342177280] [val/train_batch_count: 640] [val/train_flop_count: 0] [val/train_total_time: 7313.765] [val/train_update_time: 3580.737] [val/loss: 5.032] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.131] [val/val_tokens_per_second: 340960.982] [val/loss_avg_len_2048: 5.032] [val/perplexity_len_2048: 153.237] [val/loss_avg_len_1024: 5.048] [val/perplexity_len_1024: 155.659] [val/loss_avg_len_512: 5.078] [val/perplexity_len_512: 160.474]
174
+ [2025-10-27 05:41:56][utils:57][INFO] [P: 65.00%] [S: 1363148800/2097152000] [T: 2:04:49] [ETA: 1:07:13] [loss: 5.023] [tokens/s: 179470.763] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
175
+ [2025-10-27 05:41:56][logger:171][INFO] [step: 1363148800] [train_eval/train_token_count: 1363148800] [train_eval/train_batch_count: 650] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7489.941] [train_eval/train_update_time: 3636.641] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.047] [train_eval/perplexity_len_2048: 155.507] [train_eval/loss_avg_len_1024: 5.062] [train_eval/perplexity_len_1024: 157.914] [train_eval/loss_avg_len_512: 5.089] [train_eval/perplexity_len_512: 162.248]
176
+ [2025-10-27 05:42:52][utils:57][INFO] [P: 66.00%] [S: 1384120320/2097152000] [T: 2:05:45] [ETA: 1:04:47] [loss: 5.044] [tokens/s: 200330.870] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
177
+ [2025-10-27 05:42:52][train:194][INFO] Running validation...
178
+ [2025-10-27 05:44:52][logger:171][INFO] [step: 1384120320] [val/train_token_count: 1384120320] [val/train_batch_count: 660] [val/train_flop_count: 0] [val/train_total_time: 7545.991] [val/train_update_time: 3692.567] [val/loss: 5.021] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.522] [val/val_tokens_per_second: 339853.831] [val/loss_avg_len_2048: 5.021] [val/perplexity_len_2048: 151.515] [val/loss_avg_len_1024: 5.036] [val/perplexity_len_1024: 153.913] [val/loss_avg_len_512: 5.067] [val/perplexity_len_512: 158.695]
179
+ [2025-10-27 05:45:48][utils:57][INFO] [P: 67.00%] [S: 1405091840/2097152000] [T: 2:08:42] [ETA: 1:03:23] [loss: 4.999] [tokens/s: 179462.528] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
180
+ [2025-10-27 05:46:44][utils:57][INFO] [P: 68.00%] [S: 1426063360/2097152000] [T: 2:09:38] [ETA: 1:01:00] [loss: 5.004] [tokens/s: 200290.483] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
181
+ [2025-10-27 05:46:44][train:194][INFO] Running validation...
182
+ [2025-10-27 05:48:45][logger:171][INFO] [step: 1426063360] [val/train_token_count: 1426063360] [val/train_batch_count: 680] [val/train_flop_count: 0] [val/train_total_time: 7778.553] [val/train_update_time: 3804.371] [val/loss: 5.010] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.440] [val/val_tokens_per_second: 340087.108] [val/loss_avg_len_2048: 5.010] [val/perplexity_len_2048: 149.901] [val/loss_avg_len_1024: 5.026] [val/perplexity_len_1024: 152.291] [val/loss_avg_len_512: 5.057] [val/perplexity_len_512: 157.056]
183
+ [2025-10-27 05:49:41][utils:57][INFO] [P: 69.00%] [S: 1447034880/2097152000] [T: 2:12:34] [ETA: 0:59:33] [loss: 5.016] [tokens/s: 179445.359] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
184
+ [2025-10-27 05:50:37][utils:57][INFO] [P: 70.00%] [S: 1468006400/2097152000] [T: 2:13:30] [ETA: 0:57:13] [loss: 5.002] [tokens/s: 200441.023] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
185
+ [2025-10-27 05:50:37][logger:171][INFO] [step: 1468006400] [train_eval/train_token_count: 1468006400] [train_eval/train_batch_count: 700] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 8010.997] [train_eval/train_update_time: 3916.130] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.013] [train_eval/perplexity_len_2048: 150.319] [train_eval/loss_avg_len_1024: 5.028] [train_eval/perplexity_len_1024: 152.581] [train_eval/loss_avg_len_512: 5.057] [train_eval/perplexity_len_512: 157.196]
186
+ [2025-10-27 05:50:37][train:194][INFO] Running validation...
187
+ [2025-10-27 05:52:37][logger:171][INFO] [step: 1468006400] [val/train_token_count: 1468006400] [val/train_batch_count: 700] [val/train_flop_count: 0] [val/train_total_time: 8010.997] [val/train_update_time: 3916.130] [val/loss: 5.000] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.770] [val/val_tokens_per_second: 339157.395] [val/loss_avg_len_2048: 5.000] [val/perplexity_len_2048: 148.462] [val/loss_avg_len_1024: 5.016] [val/perplexity_len_1024: 150.826] [val/loss_avg_len_512: 5.047] [val/perplexity_len_512: 155.549]
188
+ [2025-10-27 05:52:37][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001468006400.pt...
189
+ [2025-10-27 05:52:38][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001468006400.pt.
190
+ [2025-10-27 05:52:38][logger:171][INFO] [step: 1468006400] [checkpoint/checkpoint_time: 0.460]
191
+ [2025-10-27 05:53:34][utils:57][INFO] [P: 71.00%] [S: 1488977920/2097152000] [T: 2:16:28] [ETA: 0:55:44] [loss: 4.992] [tokens/s: 179439.774] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
192
+ [2025-10-27 05:54:30][utils:57][INFO] [P: 72.00%] [S: 1509949440/2097152000] [T: 2:17:24] [ETA: 0:53:26] [loss: 4.999] [tokens/s: 200221.394] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
193
+ [2025-10-27 05:54:30][train:194][INFO] Running validation...
194
+ [2025-10-27 05:56:31][logger:171][INFO] [step: 1509949440] [val/train_token_count: 1509949440] [val/train_batch_count: 720] [val/train_flop_count: 0] [val/train_total_time: 8244.261] [val/train_update_time: 4027.906] [val/loss: 4.992] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 121.046] [val/val_tokens_per_second: 338383.251] [val/loss_avg_len_2048: 4.992] [val/perplexity_len_2048: 147.249] [val/loss_avg_len_1024: 5.008] [val/perplexity_len_1024: 149.610] [val/loss_avg_len_512: 5.039] [val/perplexity_len_512: 154.315]
195
+ [2025-10-27 05:57:27][utils:57][INFO] [P: 73.00%] [S: 1530920960/2097152000] [T: 2:20:21] [ETA: 0:51:54] [loss: 5.009] [tokens/s: 179292.560] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
196
+ [2025-10-27 05:58:23][utils:57][INFO] [P: 74.00%] [S: 1551892480/2097152000] [T: 2:21:17] [ETA: 0:49:38] [loss: 5.006] [tokens/s: 200045.819] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
197
+ [2025-10-27 05:58:23][train:194][INFO] Running validation...
198
+ [2025-10-27 06:00:24][logger:171][INFO] [step: 1551892480] [val/train_token_count: 1551892480] [val/train_batch_count: 740] [val/train_flop_count: 0] [val/train_total_time: 8477.365] [val/train_update_time: 4139.697] [val/loss: 4.985] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.921] [val/val_tokens_per_second: 338732.788] [val/loss_avg_len_2048: 4.985] [val/perplexity_len_2048: 146.197] [val/loss_avg_len_1024: 5.001] [val/perplexity_len_1024: 148.550] [val/loss_avg_len_512: 5.032] [val/perplexity_len_512: 153.234]
199
+ [2025-10-27 06:01:20][utils:57][INFO] [P: 75.00%] [S: 1572864000/2097152000] [T: 2:24:14] [ETA: 0:48:04] [loss: 4.996] [tokens/s: 179176.823] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
200
+ [2025-10-27 06:01:20][logger:171][INFO] [step: 1572864000] [train_eval/train_token_count: 1572864000] [train_eval/train_batch_count: 750] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 8654.290] [train_eval/train_update_time: 4195.576] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.991] [train_eval/perplexity_len_2048: 147.115] [train_eval/loss_avg_len_1024: 5.007] [train_eval/perplexity_len_1024: 149.472] [train_eval/loss_avg_len_512: 5.036] [train_eval/perplexity_len_512: 153.896]
201
+ [2025-10-27 06:02:16][utils:57][INFO] [P: 76.00%] [S: 1593835520/2097152000] [T: 2:25:10] [ETA: 0:45:50] [loss: 4.950] [tokens/s: 199987.850] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
202
+ [2025-10-27 06:02:16][train:194][INFO] Running validation...
203
+ [2025-10-27 06:04:16][logger:171][INFO] [step: 1593835520] [val/train_token_count: 1593835520] [val/train_batch_count: 760] [val/train_flop_count: 0] [val/train_total_time: 8710.281] [val/train_update_time: 4251.452] [val/loss: 4.978] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.104] [val/val_tokens_per_second: 341038.277] [val/loss_avg_len_2048: 4.978] [val/perplexity_len_2048: 145.248] [val/loss_avg_len_1024: 4.994] [val/perplexity_len_1024: 147.592] [val/loss_avg_len_512: 5.026] [val/perplexity_len_512: 152.251]
204
+ [2025-10-27 06:05:12][utils:57][INFO] [P: 77.00%] [S: 1614807040/2097152000] [T: 2:28:06] [ETA: 0:44:14] [loss: 5.011] [tokens/s: 179255.807] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
205
+ [2025-10-27 06:06:08][utils:57][INFO] [P: 78.00%] [S: 1635778560/2097152000] [T: 2:29:02] [ETA: 0:42:02] [loss: 4.936] [tokens/s: 200070.319] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
206
+ [2025-10-27 06:06:08][train:194][INFO] Running validation...
207
+ [2025-10-27 06:08:09][logger:171][INFO] [step: 1635778560] [val/train_token_count: 1635778560] [val/train_batch_count: 780] [val/train_flop_count: 0] [val/train_total_time: 8942.332] [val/train_update_time: 4363.190] [val/loss: 4.973] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.597] [val/val_tokens_per_second: 339644.444] [val/loss_avg_len_2048: 4.973] [val/perplexity_len_2048: 144.435] [val/loss_avg_len_1024: 4.989] [val/perplexity_len_1024: 146.770] [val/loss_avg_len_512: 5.020] [val/perplexity_len_512: 151.424]
208
+ [2025-10-27 06:09:05][utils:57][INFO] [P: 79.00%] [S: 1656750080/2097152000] [T: 2:31:58] [ETA: 0:40:24] [loss: 4.971] [tokens/s: 179237.454] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
209
+ [2025-10-27 06:10:01][utils:57][INFO] [P: 80.00%] [S: 1677721600/2097152000] [T: 2:32:54] [ETA: 0:38:13] [loss: 4.937] [tokens/s: 200192.195] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
210
+ [2025-10-27 06:10:01][logger:171][INFO] [step: 1677721600] [train_eval/train_token_count: 1677721600] [train_eval/train_batch_count: 800] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 9174.941] [train_eval/train_update_time: 4474.963] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.975] [train_eval/perplexity_len_2048: 144.752] [train_eval/loss_avg_len_1024: 4.992] [train_eval/perplexity_len_1024: 147.198] [train_eval/loss_avg_len_512: 5.021] [train_eval/perplexity_len_512: 151.545]
211
+ [2025-10-27 06:10:01][train:194][INFO] Running validation...
212
+ [2025-10-27 06:12:01][logger:171][INFO] [step: 1677721600] [val/train_token_count: 1677721600] [val/train_batch_count: 800] [val/train_flop_count: 0] [val/train_total_time: 9174.941] [val/train_update_time: 4474.963] [val/loss: 4.968] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.183] [val/val_tokens_per_second: 340813.994] [val/loss_avg_len_2048: 4.968] [val/perplexity_len_2048: 143.745] [val/loss_avg_len_1024: 4.984] [val/perplexity_len_1024: 146.084] [val/loss_avg_len_512: 5.015] [val/perplexity_len_512: 150.727]
213
+ [2025-10-27 06:12:01][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001677721600.pt...
214
+ [2025-10-27 06:12:01][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001677721600.pt.
215
+ [2025-10-27 06:12:01][logger:171][INFO] [step: 1677721600] [checkpoint/checkpoint_time: 0.455]
216
+ [2025-10-27 06:12:57][utils:57][INFO] [P: 81.00%] [S: 1698693120/2097152000] [T: 2:35:51] [ETA: 0:36:33] [loss: 4.918] [tokens/s: 179333.081] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
217
+ [2025-10-27 06:13:53][utils:57][INFO] [P: 82.00%] [S: 1719664640/2097152000] [T: 2:36:47] [ETA: 0:34:25] [loss: 4.933] [tokens/s: 200273.657] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
218
+ [2025-10-27 06:13:53][train:194][INFO] Running validation...
219
+ [2025-10-27 06:15:53][logger:171][INFO] [step: 1719664640] [val/train_token_count: 1719664640] [val/train_batch_count: 820] [val/train_flop_count: 0] [val/train_total_time: 9407.595] [val/train_update_time: 4586.732] [val/loss: 4.964] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.848] [val/val_tokens_per_second: 341767.290] [val/loss_avg_len_2048: 4.964] [val/perplexity_len_2048: 143.139] [val/loss_avg_len_1024: 4.980] [val/perplexity_len_1024: 145.479] [val/loss_avg_len_512: 5.011] [val/perplexity_len_512: 150.109]
220
+ [2025-10-27 06:16:49][utils:57][INFO] [P: 83.00%] [S: 1740636160/2097152000] [T: 2:39:43] [ETA: 0:32:42] [loss: 4.960] [tokens/s: 179522.146] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
221
+ [2025-10-27 06:17:45][utils:57][INFO] [P: 84.00%] [S: 1761607680/2097152000] [T: 2:40:39] [ETA: 0:30:36] [loss: 4.935] [tokens/s: 200486.794] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
222
+ [2025-10-27 06:17:45][train:194][INFO] Running validation...
223
+ [2025-10-27 06:19:45][logger:171][INFO] [step: 1761607680] [val/train_token_count: 1761607680] [val/train_batch_count: 840] [val/train_flop_count: 0] [val/train_total_time: 9639.468] [val/train_update_time: 4698.514] [val/loss: 4.960] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.453] [val/val_tokens_per_second: 342896.486] [val/loss_avg_len_2048: 4.960] [val/perplexity_len_2048: 142.608] [val/loss_avg_len_1024: 4.976] [val/perplexity_len_1024: 144.943] [val/loss_avg_len_512: 5.008] [val/perplexity_len_512: 149.570]
224
+ [2025-10-27 06:20:41][utils:57][INFO] [P: 85.00%] [S: 1782579200/2097152000] [T: 2:43:34] [ETA: 0:28:52] [loss: 4.998] [tokens/s: 179748.472] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
225
+ [2025-10-27 06:20:41][logger:171][INFO] [step: 1782579200] [train_eval/train_token_count: 1782579200] [train_eval/train_batch_count: 850] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 9814.949] [train_eval/train_update_time: 4754.398] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.958] [train_eval/perplexity_len_2048: 142.359] [train_eval/loss_avg_len_1024: 4.970] [train_eval/perplexity_len_1024: 144.052] [train_eval/loss_avg_len_512: 4.999] [train_eval/perplexity_len_512: 148.333]
226
+ [2025-10-27 06:21:37][utils:57][INFO] [P: 86.00%] [S: 1803550720/2097152000] [T: 2:44:30] [ETA: 0:26:46] [loss: 4.972] [tokens/s: 200607.760] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
227
+ [2025-10-27 06:21:37][train:194][INFO] Running validation...
228
+ [2025-10-27 06:23:37][logger:171][INFO] [step: 1803550720] [val/train_token_count: 1803550720] [val/train_batch_count: 860] [val/train_flop_count: 0] [val/train_total_time: 9870.948] [val/train_update_time: 4810.273] [val/loss: 4.957] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 120.026] [val/val_tokens_per_second: 341259.755] [val/loss_avg_len_2048: 4.957] [val/perplexity_len_2048: 142.219] [val/loss_avg_len_1024: 4.974] [val/perplexity_len_1024: 144.551] [val/loss_avg_len_512: 5.005] [val/perplexity_len_512: 149.167]
229
+ [2025-10-27 06:24:33][utils:57][INFO] [P: 87.00%] [S: 1824522240/2097152000] [T: 2:47:27] [ETA: 0:25:01] [loss: 4.915] [tokens/s: 179751.019] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
230
+ [2025-10-27 06:25:29][utils:57][INFO] [P: 88.00%] [S: 1845493760/2097152000] [T: 2:48:23] [ETA: 0:22:57] [loss: 4.952] [tokens/s: 200702.174] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
231
+ [2025-10-27 06:25:29][train:194][INFO] Running validation...
232
+ [2025-10-27 06:27:28][logger:171][INFO] [step: 1845493760] [val/train_token_count: 1845493760] [val/train_batch_count: 880] [val/train_flop_count: 0] [val/train_total_time: 10103.002] [val/train_update_time: 4922.057] [val/loss: 4.955] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.537] [val/val_tokens_per_second: 342654.561] [val/loss_avg_len_2048: 4.955] [val/perplexity_len_2048: 141.897] [val/loss_avg_len_1024: 4.971] [val/perplexity_len_1024: 144.224] [val/loss_avg_len_512: 5.003] [val/perplexity_len_512: 148.831]
233
+ [2025-10-27 06:28:24][utils:57][INFO] [P: 89.00%] [S: 1866465280/2097152000] [T: 2:51:18] [ETA: 0:21:10] [loss: 4.991] [tokens/s: 179911.055] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
234
+ [2025-10-27 06:29:20][utils:57][INFO] [P: 90.00%] [S: 1887436800/2097152000] [T: 2:52:14] [ETA: 0:19:08] [loss: 4.921] [tokens/s: 200908.852] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
235
+ [2025-10-27 06:29:20][logger:171][INFO] [step: 1887436800] [train_eval/train_token_count: 1887436800] [train_eval/train_batch_count: 900] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 10334.591] [train_eval/train_update_time: 5033.850] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.958] [train_eval/perplexity_len_2048: 142.315] [train_eval/loss_avg_len_1024: 4.973] [train_eval/perplexity_len_1024: 144.501] [train_eval/loss_avg_len_512: 5.003] [train_eval/perplexity_len_512: 148.803]
236
+ [2025-10-27 06:29:20][train:194][INFO] Running validation...
237
+ [2025-10-27 06:31:20][logger:171][INFO] [step: 1887436800] [val/train_token_count: 1887436800] [val/train_batch_count: 900] [val/train_flop_count: 0] [val/train_total_time: 10334.591] [val/train_update_time: 5033.850] [val/loss: 4.953] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.824] [val/val_tokens_per_second: 341834.050] [val/loss_avg_len_2048: 4.953] [val/perplexity_len_2048: 141.669] [val/loss_avg_len_1024: 4.970] [val/perplexity_len_1024: 143.999] [val/loss_avg_len_512: 5.001] [val/perplexity_len_512: 148.607]
238
+ [2025-10-27 06:31:20][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001887436800.pt...
239
+ [2025-10-27 06:31:21][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001887436800.pt.
240
+ [2025-10-27 06:31:21][logger:171][INFO] [step: 1887436800] [checkpoint/checkpoint_time: 0.455]
241
+ [2025-10-27 06:32:17][utils:57][INFO] [P: 91.00%] [S: 1908408320/2097152000] [T: 2:55:10] [ETA: 0:17:19] [loss: 4.945] [tokens/s: 179954.997] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
242
+ [2025-10-27 06:33:13][utils:57][INFO] [P: 92.00%] [S: 1929379840/2097152000] [T: 2:56:06] [ETA: 0:15:18] [loss: 4.987] [tokens/s: 200816.583] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
243
+ [2025-10-27 06:33:13][train:194][INFO] Running validation...
244
+ [2025-10-27 06:35:12][logger:171][INFO] [step: 1929379840] [val/train_token_count: 1929379840] [val/train_batch_count: 920] [val/train_flop_count: 0] [val/train_total_time: 10566.928] [val/train_update_time: 5145.646] [val/loss: 4.952] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.309] [val/val_tokens_per_second: 343310.231] [val/loss_avg_len_2048: 4.952] [val/perplexity_len_2048: 141.487] [val/loss_avg_len_1024: 4.969] [val/perplexity_len_1024: 143.816] [val/loss_avg_len_512: 5.000] [val/perplexity_len_512: 148.421]
245
+ [2025-10-27 06:36:08][utils:57][INFO] [P: 93.00%] [S: 1950351360/2097152000] [T: 2:59:02] [ETA: 0:13:28] [loss: 4.959] [tokens/s: 180041.066] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
246
+ [2025-10-27 06:37:04][utils:57][INFO] [P: 94.00%] [S: 1971322880/2097152000] [T: 2:59:58] [ETA: 0:11:29] [loss: 4.918] [tokens/s: 200844.078] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
247
+ [2025-10-27 06:37:04][train:194][INFO] Running validation...
248
+ [2025-10-27 06:39:04][logger:171][INFO] [step: 1971322880] [val/train_token_count: 1971322880] [val/train_batch_count: 940] [val/train_flop_count: 0] [val/train_total_time: 10798.265] [val/train_update_time: 5257.438] [val/loss: 4.951] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.871] [val/val_tokens_per_second: 341699.756] [val/loss_avg_len_2048: 4.951] [val/perplexity_len_2048: 141.375] [val/loss_avg_len_1024: 4.968] [val/perplexity_len_1024: 143.702] [val/loss_avg_len_512: 4.999] [val/perplexity_len_512: 148.304]
249
+ [2025-10-27 06:40:00][utils:57][INFO] [P: 95.00%] [S: 1992294400/2097152000] [T: 3:02:54] [ETA: 0:09:37] [loss: 4.951] [tokens/s: 179977.186] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
250
+ [2025-10-27 06:40:00][logger:171][INFO] [step: 1992294400] [train_eval/train_token_count: 1992294400] [train_eval/train_batch_count: 950] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 10974.142] [train_eval/train_update_time: 5313.315] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.949] [train_eval/perplexity_len_2048: 140.996] [train_eval/loss_avg_len_1024: 4.966] [train_eval/perplexity_len_1024: 143.434] [train_eval/loss_avg_len_512: 4.994] [train_eval/perplexity_len_512: 147.565]
251
+ [2025-10-27 06:40:56][utils:57][INFO] [P: 96.00%] [S: 2013265920/2097152000] [T: 3:03:50] [ETA: 0:07:39] [loss: 4.957] [tokens/s: 200874.865] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
252
+ [2025-10-27 06:40:56][train:194][INFO] Running validation...
253
+ [2025-10-27 06:42:56][logger:171][INFO] [step: 2013265920] [val/train_token_count: 2013265920] [val/train_batch_count: 960] [val/train_flop_count: 0] [val/train_total_time: 11030.159] [val/train_update_time: 5369.214] [val/loss: 4.951] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.707] [val/val_tokens_per_second: 342168.051] [val/loss_avg_len_2048: 4.951] [val/perplexity_len_2048: 141.310] [val/loss_avg_len_1024: 4.967] [val/perplexity_len_1024: 143.636] [val/loss_avg_len_512: 4.999] [val/perplexity_len_512: 148.236]
254
+ [2025-10-27 06:43:52][utils:57][INFO] [P: 97.00%] [S: 2034237440/2097152000] [T: 3:06:45] [ETA: 0:05:46] [loss: 4.969] [tokens/s: 180022.960] [batches/s: 0.086] [MFU: 0.000] [TFLOPS: 0.000]
255
+ [2025-10-27 06:44:48][utils:57][INFO] [P: 98.00%] [S: 2055208960/2097152000] [T: 3:07:41] [ETA: 0:03:49] [loss: 4.954] [tokens/s: 200839.423] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
256
+ [2025-10-27 06:44:48][train:194][INFO] Running validation...
257
+ [2025-10-27 06:46:48][logger:171][INFO] [step: 2055208960] [val/train_token_count: 2055208960] [val/train_batch_count: 980] [val/train_flop_count: 0] [val/train_total_time: 11261.909] [val/train_update_time: 5481.002] [val/loss: 4.951] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.949] [val/val_tokens_per_second: 341479.211] [val/loss_avg_len_2048: 4.951] [val/perplexity_len_2048: 141.282] [val/loss_avg_len_1024: 4.967] [val/perplexity_len_1024: 143.605] [val/loss_avg_len_512: 4.999] [val/perplexity_len_512: 148.203]
258
+ [2025-10-27 06:46:48][train:854][INFO] Training finished with 2055208960 tokens!
metrics/jsonlines/checkpoint.jsonl CHANGED
@@ -1,9 +1,9 @@
1
- {"step": 209715200, "checkpoint/checkpoint_time": 0.564534884004388}
2
- {"step": 419430400, "checkpoint/checkpoint_time": 0.567023690964561}
3
- {"step": 629145600, "checkpoint/checkpoint_time": 0.5638952670269646}
4
- {"step": 838860800, "checkpoint/checkpoint_time": 0.4858379859942943}
5
- {"step": 1048576000, "checkpoint/checkpoint_time": 0.4735249990480952}
6
- {"step": 1258291200, "checkpoint/checkpoint_time": 0.5552213820046745}
7
- {"step": 1468006400, "checkpoint/checkpoint_time": 0.5709432679577731}
8
- {"step": 1677721600, "checkpoint/checkpoint_time": 0.5724008829565719}
9
- {"step": 1887436800, "checkpoint/checkpoint_time": 0.56779815396294}
 
1
+ {"step": 209715200, "checkpoint/checkpoint_time": 0.45521786995232105}
2
+ {"step": 419430400, "checkpoint/checkpoint_time": 0.4552681630011648}
3
+ {"step": 629145600, "checkpoint/checkpoint_time": 0.45940230099949986}
4
+ {"step": 838860800, "checkpoint/checkpoint_time": 0.45928425301099196}
5
+ {"step": 1048576000, "checkpoint/checkpoint_time": 0.4539535099756904}
6
+ {"step": 1258291200, "checkpoint/checkpoint_time": 0.461745353997685}
7
+ {"step": 1468006400, "checkpoint/checkpoint_time": 0.459751385031268}
8
+ {"step": 1677721600, "checkpoint/checkpoint_time": 0.4552595349960029}
9
+ {"step": 1887436800, "checkpoint/checkpoint_time": 0.45486548100598156}
metrics/jsonlines/norm.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
metrics/jsonlines/throughput.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
metrics/jsonlines/train.jsonl CHANGED
@@ -1,98 +1,98 @@
1
- {"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 219.44772298802854, "train/update_time": 219.17982283898164, "train/lr": 0.0009000000000000001, "train/loss": 9.77186393737793, "train/global_grad_norm": 1.2174378633499146}
2
- {"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 393.3033275610069, "train/update_time": 392.7191811740049, "train/lr": 0.0009997960964140947, "train/loss": 8.123015403747559, "train/global_grad_norm": 0.9983794093132019}
3
- {"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 863.6309026500094, "train/update_time": 595.3849820840405, "train/lr": 0.0009990914580222257, "train/loss": 7.504764556884766, "train/global_grad_norm": 0.5670243501663208}
4
- {"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 1014.3213793600444, "train/update_time": 745.8892338059959, "train/lr": 0.0009978842768382998, "train/loss": 7.151111602783203, "train/global_grad_norm": 0.44631311297416687}
5
- {"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 1483.1021124350373, "train/update_time": 908.4019459009869, "train/lr": 0.0009961757683914405, "train/loss": 6.869814872741699, "train/global_grad_norm": 0.3835476338863373}
6
- {"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 1633.8735543090152, "train/update_time": 1058.99380204099, "train/lr": 0.00099396765300483, "train/loss": 6.582631587982178, "train/global_grad_norm": 0.342731237411499}
7
- {"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 2102.5612953370437, "train/update_time": 1209.294771298999, "train/lr": 0.0009912621540634887, "train/loss": 6.3764543533325195, "train/global_grad_norm": 0.31251585483551025}
8
- {"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 2296.034582130029, "train/update_time": 1402.5742697739624, "train/lr": 0.000988061995775515, "train/loss": 6.187304496765137, "train/global_grad_norm": 0.457292377948761}
9
- {"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 2738.4253872900154, "train/update_time": 1568.2938627917902, "train/lr": 0.0009843704004290394, "train/loss": 6.000740051269531, "train/global_grad_norm": 0.45764341950416565}
10
- {"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 2950.672681943048, "train/update_time": 1780.2358716417802, "train/lr": 0.0009801910851476522, "train/loss": 5.888554573059082, "train/global_grad_norm": 0.4506590664386749}
11
- {"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 3388.6962326810462, "train/update_time": 1974.5968189326813, "train/lr": 0.0009755282581475768, "train/loss": 5.764591693878174, "train/global_grad_norm": 0.38608840107917786}
12
- {"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 3595.7228351200465, "train/update_time": 2181.321583448793, "train/lr": 0.0009703866145003512, "train/loss": 5.629734039306641, "train/global_grad_norm": 0.6687659025192261}
13
- {"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 4033.824217539048, "train/update_time": 2393.338223681785, "train/lr": 0.0009647713314052896, "train/loss": 5.562553882598877, "train/global_grad_norm": 0.46099352836608887}
14
- {"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 4217.221132909006, "train/update_time": 2576.439015968761, "train/lr": 0.0009586880629764817, "train/loss": 5.487265586853027, "train/global_grad_norm": 0.7035791873931885}
15
- {"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 4684.9639007470105, "train/update_time": 2788.3531909267767, "train/lr": 0.0009521429345495787, "train/loss": 5.383974075317383, "train/global_grad_norm": 0.5269623398780823}
16
- {"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 4838.394258918008, "train/update_time": 2941.477261595719, "train/lr": 0.0009451425365140996, "train/loss": 5.3296051025390625, "train/global_grad_norm": 0.3451927602291107}
17
- {"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 5307.577416299027, "train/update_time": 3118.180899302766, "train/lr": 0.000937693917677468, "train/loss": 5.232334136962891, "train/global_grad_norm": 0.49183663725852966}
18
- {"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 5457.476125103014, "train/update_time": 3267.897231984767, "train/lr": 0.0009298045781674596, "train/loss": 5.203054904937744, "train/global_grad_norm": 0.3954695463180542}
19
- {"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 5926.165717172029, "train/update_time": 3418.016206740809, "train/lr": 0.0009214824618802108, "train/loss": 5.189924716949463, "train/global_grad_norm": 0.7796792984008789}
20
- {"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 6104.984292554029, "train/update_time": 3596.637184852676, "train/lr": 0.000912735948481387, "train/loss": 5.086824893951416, "train/global_grad_norm": 0.4062754809856415}
21
- {"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 6550.405725484015, "train/update_time": 3750.722188989632, "train/lr": 0.0009035738449685707, "train/loss": 5.0450758934021, "train/global_grad_norm": 0.5696613788604736}
22
- {"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 6762.651720013004, "train/update_time": 3962.670169218676, "train/lr": 0.0008940053768033609, "train/loss": 5.003251075744629, "train/global_grad_norm": 0.4241951107978821}
23
- {"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 7201.653420041024, "train/update_time": 4147.277713002753, "train/lr": 0.0008840401786221159, "train/loss": 4.949978828430176, "train/global_grad_norm": 0.48395395278930664}
24
- {"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 7413.931084726995, "train/update_time": 4359.249880091811, "train/lr": 0.0008736882845346905, "train/loss": 4.89894962310791, "train/global_grad_norm": 0.5247655510902405}
25
- {"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 7847.669167418033, "train/update_time": 4567.680281858717, "train/lr": 0.0008629601180209381, "train/loss": 4.89341926574707, "train/global_grad_norm": 0.5477429032325745}
26
- {"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 8040.308323490026, "train/update_time": 4760.0058897937415, "train/lr": 0.0008518664814351503, "train/loss": 4.842683792114258, "train/global_grad_norm": 0.5116428732872009}
27
- {"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 8496.36842228804, "train/update_time": 4972.021029465657, "train/lr": 0.0008404185451290017, "train/loss": 4.826287746429443, "train/global_grad_norm": 0.5284518599510193}
28
- {"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 8661.466409554007, "train/update_time": 5136.805537326669, "train/lr": 0.0008286278362039527, "train/loss": 4.773619174957275, "train/global_grad_norm": 0.5465279221534729}
29
- {"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 9131.361868123, "train/update_time": 5328.725031249633, "train/lr": 0.0008165062269044352, "train/loss": 4.738215446472168, "train/global_grad_norm": 0.43803009390830994}
30
- {"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 9281.588875080051, "train/update_time": 5478.766576015565, "train/lr": 0.0008040659226635089, "train/loss": 4.710612773895264, "train/global_grad_norm": 0.5540741086006165}
31
- {"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 9750.55400532001, "train/update_time": 5628.910955499508, "train/lr": 0.0007913194498130252, "train/loss": 4.7304229736328125, "train/global_grad_norm": 0.45888814330101013}
32
- {"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 9914.599941182008, "train/update_time": 5792.780345006497, "train/lr": 0.000778279642970672, "train/loss": 4.656280994415283, "train/global_grad_norm": 0.4830974340438843}
33
- {"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 10369.58782212704, "train/update_time": 5942.496549489384, "train/lr": 0.0007649596321166025, "train/loss": 4.677618026733398, "train/global_grad_norm": 0.7399375438690186}
34
- {"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 10574.152368383016, "train/update_time": 6146.868059828412, "train/lr": 0.0007513728293726579, "train/loss": 4.6279706954956055, "train/global_grad_norm": 0.4657606780529022}
35
- {"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 11014.551226956013, "train/update_time": 6321.31326986436, "train/lr": 0.0007375329154974975, "train/loss": 4.613265037536621, "train/global_grad_norm": 0.7494572997093201}
36
- {"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 11226.902240763011, "train/update_time": 6533.353484157473, "train/lr": 0.0007234538261112341, "train/loss": 4.532633304595947, "train/global_grad_norm": 0.5424163937568665}
37
- {"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 11661.993830877997, "train/update_time": 6734.094668152509, "train/lr": 0.0007091497376634464, "train/loss": 4.551605224609375, "train/global_grad_norm": 0.5107011795043945}
38
- {"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 11862.473320519028, "train/update_time": 6934.267906721507, "train/lr": 0.0006946350531586958, "train/loss": 4.527378559112549, "train/global_grad_norm": 0.5982224941253662}
39
- {"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 12308.825131079007, "train/update_time": 7146.272098292538, "train/lr": 0.0006799243876539214, "train/loss": 4.523767948150635, "train/global_grad_norm": 0.4483889043331146}
40
- {"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 12483.889439035032, "train/update_time": 7321.039978560642, "train/lr": 0.0006650325535423166, "train/loss": 4.428459644317627, "train/global_grad_norm": 0.6677308678627014}
41
- {"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 12954.41088695702, "train/update_time": 7525.117961935524, "train/lr": 0.0006499745456385053, "train/loss": 4.447395324707031, "train/global_grad_norm": 0.5507714152336121}
42
- {"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 13104.464458574017, "train/update_time": 7674.994709033519, "train/lr": 0.0006347655260800339, "train/loss": 4.442117214202881, "train/global_grad_norm": 0.5809928774833679}
43
- {"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 13572.86144918704, "train/update_time": 7837.980615513574, "train/lr": 0.0006194208090603844, "train/loss": 4.436192989349365, "train/global_grad_norm": 0.5325289368629456}
44
- {"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 13721.99477496103, "train/update_time": 7986.941047978529, "train/lr": 0.0006039558454088796, "train/loss": 4.456014633178711, "train/global_grad_norm": 0.6674598455429077}
45
- {"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 14189.86244938703, "train/update_time": 8136.602942996542, "train/lr": 0.0005883862070330078, "train/loss": 4.393319606781006, "train/global_grad_norm": 0.5084896683692932}
46
- {"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 14381.773025262053, "train/update_time": 8328.335106454615, "train/lr": 0.0005727275712388317, "train/loss": 4.356149673461914, "train/global_grad_norm": 0.5596035718917847}
47
- {"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 14823.617820873042, "train/update_time": 8492.146036952618, "train/lr": 0.0005569957049452703, "train/loss": 4.362613677978516, "train/global_grad_norm": 0.5118389129638672}
48
- {"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 15035.86255901301, "train/update_time": 8704.087580051564, "train/lr": 0.0005412064488081482, "train/loss": 4.340407371520996, "train/global_grad_norm": 0.5723304152488708}
49
- {"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 15473.11414746102, "train/update_time": 8896.564982390497, "train/lr": 0.0005253757012699972, "train/loss": 4.335324287414551, "train/global_grad_norm": 0.5161768198013306}
50
- {"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 15681.829017209006, "train/update_time": 9104.969846359396, "train/lr": 0.0005095194025516734, "train/loss": 4.303962707519531, "train/global_grad_norm": 0.5649275779724121}
51
- {"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 16119.577212364005, "train/update_time": 9316.950864796352, "train/lr": 0.0004936535186019053, "train/loss": 4.29068660736084, "train/global_grad_norm": 0.610883355140686}
52
- {"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 16303.791181104025, "train/update_time": 9500.859303092468, "train/lr": 0.00047779402502093696, "train/loss": 4.2827653884887695, "train/global_grad_norm": 0.5993229150772095}
53
- {"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 16770.228860588046, "train/update_time": 9712.85516443639, "train/lr": 0.0004619568909744525, "train/loss": 4.242770671844482, "train/global_grad_norm": 0.49561238288879395}
54
- {"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 16924.293944418023, "train/update_time": 9866.63232135144, "train/lr": 0.00044615806311398067, "train/loss": 4.251742839813232, "train/global_grad_norm": 0.6209728717803955}
55
- {"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 17393.370531369, "train/update_time": 10044.968985380197, "train/lr": 0.0004304134495199673, "train/loss": 4.189674377441406, "train/global_grad_norm": 0.5244336128234863}
56
- {"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 17543.088614015025, "train/update_time": 10194.5136101502, "train/lr": 0.0004147389036836882, "train/loss": 4.233956813812256, "train/global_grad_norm": 0.5448895692825317}
57
- {"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 18010.892701787, "train/update_time": 10344.159968029184, "train/lr": 0.0003991502085441259, "train/loss": 4.165826797485352, "train/global_grad_norm": 0.6113353967666626}
58
- {"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 18186.98511047603, "train/update_time": 10520.074710074114, "train/lr": 0.0003836630605958888, "train/loss": 4.22281551361084, "train/global_grad_norm": 0.5304960012435913}
59
- {"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 18632.073160929023, "train/update_time": 10671.874259623233, "train/lr": 0.00036829305408417155, "train/loss": 4.211955547332764, "train/global_grad_norm": 0.5258879065513611}
60
- {"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 18844.378502471023, "train/update_time": 10883.880087946192, "train/lr": 0.000353055665302672, "train/loss": 4.207913875579834, "train/global_grad_norm": 0.6525558829307556}
61
- {"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 19284.370175764023, "train/update_time": 11066.854351600225, "train/lr": 0.0003379662370102746, "train/loss": 4.168120861053467, "train/global_grad_norm": 0.5861026644706726}
62
- {"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 19496.670543703018, "train/update_time": 11278.836069991172, "train/lr": 0.00032303996298219405, "train/loss": 4.139932632446289, "train/global_grad_norm": 0.5590474605560303}
63
- {"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 19930.58125682705, "train/update_time": 11486.317253635323, "train/lr": 0.00030829187271113034, "train/loss": 4.151315689086914, "train/global_grad_norm": 0.48951366543769836}
64
- {"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 20124.23348677403, "train/update_time": 11679.664113470295, "train/lr": 0.0002937368162738445, "train/loss": 4.140133857727051, "train/global_grad_norm": 0.5501073002815247}
65
- {"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 20578.88107217505, "train/update_time": 11891.677284248348, "train/lr": 0.0002793894493783894, "train/loss": 4.110686302185059, "train/global_grad_norm": 0.859492838382721}
66
- {"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 20744.93009287404, "train/update_time": 12057.427525710431, "train/lr": 0.00026526421860705474, "train/loss": 4.138786792755127, "train/global_grad_norm": 0.5685120224952698}
67
- {"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 21214.90017826401, "train/update_time": 12251.372320065391, "train/lr": 0.0002513753468698824, "train/loss": 4.0760040283203125, "train/global_grad_norm": 0.5391287207603455}
68
- {"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 21364.400057210005, "train/update_time": 12400.716242285387, "train/lr": 0.00023773681908340283, "train/loss": 4.092895984649658, "train/global_grad_norm": 0.5448238849639893}
69
- {"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 21831.916811336007, "train/update_time": 12550.552763604384, "train/lr": 0.00022436236808900823, "train/loss": 4.098409652709961, "train/global_grad_norm": 0.5497803092002869}
70
- {"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 21992.386447268014, "train/update_time": 12710.86209141434, "train/lr": 0.00021126546082514682, "train/loss": 4.090141773223877, "train/global_grad_norm": 0.4459550082683563}
71
- {"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 22450.115988292033, "train/update_time": 12860.017111644556, "train/lr": 0.00019845928476725522, "train/loss": 4.094099044799805, "train/global_grad_norm": 0.48487746715545654}
72
- {"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 22653.044829952996, "train/update_time": 13062.762519414595, "train/lr": 0.0001859567346490913, "train/loss": 4.064176082611084, "train/global_grad_norm": 0.6045382022857666}
73
- {"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 23093.526105974044, "train/update_time": 13235.4935254085, "train/lr": 0.00017377039947882782, "train/loss": 4.084225177764893, "train/global_grad_norm": 0.42127346992492676}
74
- {"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 23305.888173635, "train/update_time": 13447.544946793583, "train/lr": 0.00016191254986299043, "train/loss": 4.059417724609375, "train/global_grad_norm": 0.43410712480545044}
75
- {"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 23741.714356134005, "train/update_time": 13647.155335442629, "train/lr": 0.00015039512565099468, "train/loss": 4.048225402832031, "train/global_grad_norm": 0.4433448016643524}
76
- {"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 23943.319328956015, "train/update_time": 13848.439878219564, "train/lr": 0.00013922972391273224, "train/loss": 4.010053634643555, "train/global_grad_norm": 0.5291637182235718}
77
- {"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 24389.08542349504, "train/update_time": 14060.475522507564, "train/lr": 0.00012842758726130281, "train/loss": 4.079256534576416, "train/global_grad_norm": 0.39220452308654785}
78
- {"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 24564.46875228104, "train/update_time": 14235.556923599623, "train/lr": 0.00011799959253265679, "train/loss": 4.007717609405518, "train/global_grad_norm": 0.3393424153327942}
79
- {"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 25035.12217405904, "train/update_time": 14441.209321577568, "train/lr": 0.00010795623983354214, "train/loss": 4.031998634338379, "train/global_grad_norm": 0.47528231143951416}
80
- {"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 25184.550976835017, "train/update_time": 14590.463811040623, "train/lr": 9.830764196878872e-05, "train/loss": 4.010214328765869, "train/global_grad_norm": 0.38191747665405273}
81
- {"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 25429.942887034034, "train/update_time": 14679.69327192055, "train/lr": 8.906351425856951e-05, "train/loss": 3.989518642425537, "train/global_grad_norm": 0.3788834810256958}
82
- {"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 25519.29751138203, "train/update_time": 14768.911977301526, "train/lr": 8.02331647558977e-05, "train/loss": 3.9995009899139404, "train/global_grad_norm": 0.37431642413139343}
83
- {"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 25722.537228438014, "train/update_time": 14858.13312200259, "train/lr": 7.182548487420554e-05, "train/loss": 4.0348801612854, "train/global_grad_norm": 0.39566728472709656}
84
- {"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 25811.900586274045, "train/update_time": 14947.35764870455, "train/lr": 6.384894043444556e-05, "train/loss": 3.9766266345977783, "train/global_grad_norm": 0.40857023000717163}
85
- {"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 26014.853294235014, "train/update_time": 15036.594645244593, "train/lr": 5.6311563140726166e-05, "train/loss": 4.0503740310668945, "train/global_grad_norm": 0.33130380511283875}
86
- {"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 26104.19660033105, "train/update_time": 15125.80504047859, "train/lr": 4.922094249306547e-05, "train/loss": 4.03305196762085, "train/global_grad_norm": 0.3090103566646576}
87
- {"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 26307.15804652701, "train/update_time": 15215.028811053548, "train/lr": 4.2584218145409916e-05, "train/loss": 3.975665807723999, "train/global_grad_norm": 0.2927410304546356}
88
- {"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 26396.506954086013, "train/update_time": 15304.24179791665, "train/lr": 3.6408072716606236e-05, "train/loss": 3.989638090133667, "train/global_grad_norm": 0.2772652506828308}
89
- {"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 26599.493567029014, "train/update_time": 15393.460775722808, "train/lr": 3.069872506157217e-05, "train/loss": 4.053800582885742, "train/global_grad_norm": 0.27483704686164856}
90
- {"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 26688.83935572201, "train/update_time": 15482.672627257823, "train/lr": 2.5461924009435368e-05, "train/loss": 3.9634201526641846, "train/global_grad_norm": 0.2686466574668884}
91
- {"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 26892.36250021303, "train/update_time": 15571.891412241792, "train/lr": 2.0702942574950812e-05, "train/loss": 4.001792907714844, "train/global_grad_norm": 0.2545761168003082}
92
- {"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 26981.709376943007, "train/update_time": 15661.105290838808, "train/lr": 1.642657264902142e-05, "train/loss": 4.026549816131592, "train/global_grad_norm": 0.2649659812450409}
93
- {"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 27184.824638435035, "train/update_time": 15750.32617144374, "train/lr": 1.2637120173670358e-05, "train/loss": 4.013813495635986, "train/global_grad_norm": 0.2429317831993103}
94
- {"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 27274.170978915005, "train/update_time": 15839.539326993749, "train/lr": 9.338400806321978e-06, "train/loss": 3.972702741622925, "train/global_grad_norm": 0.2406640350818634}
95
- {"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 27477.10213337402, "train/update_time": 15928.765013526718, "train/lr": 6.533736077758867e-06, "train/loss": 3.987436532974243, "train/global_grad_norm": 0.23872317373752594}
96
- {"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 27566.45336975, "train/update_time": 16017.985979812802, "train/lr": 4.2259500476214406e-06, "train/loss": 3.987391471862793, "train/global_grad_norm": 0.2312818020582199}
97
- {"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 27769.478837262024, "train/update_time": 16107.212218919827, "train/lr": 2.417366460819359e-06, "train/loss": 4.015570640563965, "train/global_grad_norm": 0.22287066280841827}
98
- {"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 27858.825974697014, "train/update_time": 16196.423335834814, "train/lr": 1.1098064077174619e-06, "train/loss": 3.98713755607605, "train/global_grad_norm": 0.21902944147586823}
 
1
+ {"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 59.740394167019986, "train/update_time": 59.53266526403604, "train/lr": 0.0009000000000000001, "train/loss": 9.925932884216309, "train/global_grad_norm": 1.0933681726455688}
2
+ {"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 115.77270240103826, "train/update_time": 115.42706217308296, "train/lr": 0.0009997960964140947, "train/loss": 8.114099502563477, "train/global_grad_norm": 0.7164918780326843}
3
+ {"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 292.0277106779977, "train/update_time": 171.32016484509222, "train/lr": 0.0009990914580222257, "train/loss": 7.684152603149414, "train/global_grad_norm": 0.3018423914909363}
4
+ {"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 348.0203448670218, "train/update_time": 227.1939903421444, "train/lr": 0.0009978842768382998, "train/loss": 7.448395729064941, "train/global_grad_norm": 0.2099468857049942}
5
+ {"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 523.9415633900207, "train/update_time": 283.0836244261591, "train/lr": 0.0009961757683914405, "train/loss": 7.294851779937744, "train/global_grad_norm": 0.23000206053256989}
6
+ {"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 579.9503910699859, "train/update_time": 338.96602103614714, "train/lr": 0.00099396765300483, "train/loss": 7.111999988555908, "train/global_grad_norm": 0.2716911733150482}
7
+ {"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 756.5241781410296, "train/update_time": 394.8385707241832, "train/lr": 0.0009912621540634887, "train/loss": 6.983415126800537, "train/global_grad_norm": 0.21857409179210663}
8
+ {"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 812.5387418420287, "train/update_time": 450.7242119802977, "train/lr": 0.000988061995775515, "train/loss": 6.815765380859375, "train/global_grad_norm": 0.18543004989624023}
9
+ {"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 988.3924220129848, "train/update_time": 506.5995626472868, "train/lr": 0.0009843704004290394, "train/loss": 6.659385681152344, "train/global_grad_norm": 0.20697954297065735}
10
+ {"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 1044.3968842840404, "train/update_time": 562.4800082092406, "train/lr": 0.0009801910851476522, "train/loss": 6.568438529968262, "train/global_grad_norm": 0.31236547231674194}
11
+ {"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 1221.4379238740075, "train/update_time": 618.3656786112697, "train/lr": 0.0009755282581475768, "train/loss": 6.480067729949951, "train/global_grad_norm": 0.3563300371170044}
12
+ {"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 1277.4677367979893, "train/update_time": 674.2627646423061, "train/lr": 0.0009703866145003512, "train/loss": 6.361207008361816, "train/global_grad_norm": 0.28948819637298584}
13
+ {"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 1452.9732034530025, "train/update_time": 730.170422351337, "train/lr": 0.0009647713314052896, "train/loss": 6.323390483856201, "train/global_grad_norm": 0.4304763078689575}
14
+ {"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 1509.0055129660177, "train/update_time": 786.0731882582186, "train/lr": 0.0009586880629764817, "train/loss": 6.252005577087402, "train/global_grad_norm": 0.22148849070072174}
15
+ {"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 1685.596672433021, "train/update_time": 841.9682118772762, "train/lr": 0.0009521429345495787, "train/loss": 6.138335704803467, "train/global_grad_norm": 0.29878920316696167}
16
+ {"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 1741.6182956020348, "train/update_time": 897.8648126212647, "train/lr": 0.0009451425365140996, "train/loss": 6.105388641357422, "train/global_grad_norm": 0.24416939914226532}
17
+ {"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 1918.145444536989, "train/update_time": 953.7598241912201, "train/lr": 0.000937693917677468, "train/loss": 6.023434162139893, "train/global_grad_norm": 0.3608187139034271}
18
+ {"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 1974.1614223060315, "train/update_time": 1009.6508660353138, "train/lr": 0.0009298045781674596, "train/loss": 5.995698928833008, "train/global_grad_norm": 0.2951180636882782}
19
+ {"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 2149.7959963100147, "train/update_time": 1065.5460091893328, "train/lr": 0.0009214824618802108, "train/loss": 5.970148086547852, "train/global_grad_norm": 0.5581636428833008}
20
+ {"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 2205.8133425950073, "train/update_time": 1121.4434441442718, "train/lr": 0.000912735948481387, "train/loss": 5.88069486618042, "train/global_grad_norm": 0.35760968923568726}
21
+ {"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 2381.4569776499993, "train/update_time": 1177.318929851288, "train/lr": 0.0009035738449685707, "train/loss": 5.837252616882324, "train/global_grad_norm": 0.32113462686538696}
22
+ {"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 2437.459500588011, "train/update_time": 1233.201419278339, "train/lr": 0.0008940053768033609, "train/loss": 5.823047637939453, "train/global_grad_norm": 0.532181441783905}
23
+ {"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 2613.3590940189897, "train/update_time": 1289.0799032752984, "train/lr": 0.0008840401786221159, "train/loss": 5.743273735046387, "train/global_grad_norm": 0.28300613164901733}
24
+ {"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 2669.367173767998, "train/update_time": 1344.9662226063083, "train/lr": 0.0008736882845346905, "train/loss": 5.686018466949463, "train/global_grad_norm": 0.4552343785762787}
25
+ {"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 2845.8649667550344, "train/update_time": 1400.8440078733838, "train/lr": 0.0008629601180209381, "train/loss": 5.690680980682373, "train/global_grad_norm": 0.40284600853919983}
26
+ {"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 2901.8806591850007, "train/update_time": 1456.7321346284007, "train/lr": 0.0008518664814351503, "train/loss": 5.6336669921875, "train/global_grad_norm": 0.35505881905555725}
27
+ {"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 3077.9571933770203, "train/update_time": 1512.6155867513735, "train/lr": 0.0008404185451290017, "train/loss": 5.614615440368652, "train/global_grad_norm": 0.42291319370269775}
28
+ {"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 3133.951200322015, "train/update_time": 1568.5041498313076, "train/lr": 0.0008286278362039527, "train/loss": 5.560991287231445, "train/global_grad_norm": 0.3632428050041199}
29
+ {"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 3309.8123759650043, "train/update_time": 1624.3993004413205, "train/lr": 0.0008165062269044352, "train/loss": 5.52689266204834, "train/global_grad_norm": 0.5304650068283081}
30
+ {"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 3365.815582766023, "train/update_time": 1680.2926608613343, "train/lr": 0.0008040659226635089, "train/loss": 5.491298198699951, "train/global_grad_norm": 0.2989009618759155}
31
+ {"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 3542.1138300999883, "train/update_time": 1736.1879310192307, "train/lr": 0.0007913194498130252, "train/loss": 5.506411552429199, "train/global_grad_norm": 0.3645610511302948}
32
+ {"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 3598.114992738003, "train/update_time": 1792.082390839234, "train/lr": 0.000778279642970672, "train/loss": 5.436527729034424, "train/global_grad_norm": 0.3181352913379669}
33
+ {"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 3773.9975122280302, "train/update_time": 1847.994289233291, "train/lr": 0.0007649596321166025, "train/loss": 5.447015762329102, "train/global_grad_norm": 0.5075618624687195}
34
+ {"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 3829.9891319620074, "train/update_time": 1903.884795576334, "train/lr": 0.0007513728293726579, "train/loss": 5.411071300506592, "train/global_grad_norm": 0.485222190618515}
35
+ {"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 4005.8882146670367, "train/update_time": 1959.7687862973544, "train/lr": 0.0007375329154974975, "train/loss": 5.39218282699585, "train/global_grad_norm": 0.42199867963790894}
36
+ {"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 4061.88242925104, "train/update_time": 2015.6589334552991, "train/lr": 0.0007234538261112341, "train/loss": 5.3130316734313965, "train/global_grad_norm": 0.35227981209754944}
37
+ {"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 4238.28049745399, "train/update_time": 2071.5370760894148, "train/lr": 0.0007091497376634464, "train/loss": 5.341176509857178, "train/global_grad_norm": 0.4442596137523651}
38
+ {"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 4294.288298399013, "train/update_time": 2127.428580871492, "train/lr": 0.0006946350531586958, "train/loss": 5.3172173500061035, "train/global_grad_norm": 0.3961981534957886}
39
+ {"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 4470.58636719099, "train/update_time": 2183.3099067094154, "train/lr": 0.0006799243876539214, "train/loss": 5.318506240844727, "train/global_grad_norm": 0.3649199604988098}
40
+ {"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 4526.608443362988, "train/update_time": 2239.197950529342, "train/lr": 0.0006650325535423166, "train/loss": 5.243042945861816, "train/global_grad_norm": 0.5602830052375793}
41
+ {"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 4703.79108641902, "train/update_time": 2295.093229125312, "train/lr": 0.0006499745456385053, "train/loss": 5.247615814208984, "train/global_grad_norm": 0.434451699256897}
42
+ {"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 4759.794200062985, "train/update_time": 2350.9869147563586, "train/lr": 0.0006347655260800339, "train/loss": 5.253141403198242, "train/global_grad_norm": 0.6098817586898804}
43
+ {"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 4936.540683488012, "train/update_time": 2406.895653022337, "train/lr": 0.0006194208090603844, "train/loss": 5.238434791564941, "train/global_grad_norm": 0.3986305594444275}
44
+ {"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 4992.560281209007, "train/update_time": 2462.802181679348, "train/lr": 0.0006039558454088796, "train/loss": 5.259368419647217, "train/global_grad_norm": 0.45497700572013855}
45
+ {"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 5168.203005189018, "train/update_time": 2518.7051250302466, "train/lr": 0.0005883862070330078, "train/loss": 5.205867767333984, "train/global_grad_norm": 0.6130543947219849}
46
+ {"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 5224.203345166985, "train/update_time": 2574.5941846003407, "train/lr": 0.0005727275712388317, "train/loss": 5.177216053009033, "train/global_grad_norm": 0.372676819562912}
47
+ {"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 5399.371034484007, "train/update_time": 2630.4843858853565, "train/lr": 0.0005569957049452703, "train/loss": 5.195985317230225, "train/global_grad_norm": 0.3776322305202484}
48
+ {"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 5455.381416677032, "train/update_time": 2686.3787583513767, "train/lr": 0.0005412064488081482, "train/loss": 5.169455528259277, "train/global_grad_norm": 0.48483920097351074}
49
+ {"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 5631.206694298016, "train/update_time": 2742.264746093366, "train/lr": 0.0005253757012699972, "train/loss": 5.1734089851379395, "train/global_grad_norm": 0.3823016583919525}
50
+ {"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 5687.226690375013, "train/update_time": 2798.1550497164135, "train/lr": 0.0005095194025516734, "train/loss": 5.148708343505859, "train/global_grad_norm": 0.33509281277656555}
51
+ {"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 5863.125213610998, "train/update_time": 2854.0412449103314, "train/lr": 0.0004936535186019053, "train/loss": 5.145965576171875, "train/global_grad_norm": 0.39432910084724426}
52
+ {"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 5919.139419794024, "train/update_time": 2909.933540413331, "train/lr": 0.00047779402502093696, "train/loss": 5.137781143188477, "train/global_grad_norm": 0.510867178440094}
53
+ {"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 6094.99239180499, "train/update_time": 2965.8256602562615, "train/lr": 0.0004619568909744525, "train/loss": 5.095966815948486, "train/global_grad_norm": 0.4091125428676605}
54
+ {"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 6151.005747379037, "train/update_time": 3021.723491590412, "train/lr": 0.00044615806311398067, "train/loss": 5.110124588012695, "train/global_grad_norm": 0.30919671058654785}
55
+ {"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 6327.491173934017, "train/update_time": 3077.6138937743963, "train/lr": 0.0004304134495199673, "train/loss": 5.058600425720215, "train/global_grad_norm": 0.3384024202823639}
56
+ {"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 6383.517213010986, "train/update_time": 3133.5136128164013, "train/lr": 0.0004147389036836882, "train/loss": 5.096649646759033, "train/global_grad_norm": 0.43362703919410706}
57
+ {"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 6560.036292392993, "train/update_time": 3189.418762697489, "train/lr": 0.0003991502085441259, "train/loss": 5.065718173980713, "train/global_grad_norm": 0.35753506422042847}
58
+ {"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 6616.055601358006, "train/update_time": 3245.322524867661, "train/lr": 0.0003836630605958888, "train/loss": 5.097409725189209, "train/global_grad_norm": 0.2560317814350128}
59
+ {"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 6792.385478265991, "train/update_time": 3301.2410182636813, "train/lr": 0.00036829305408417155, "train/loss": 5.073716640472412, "train/global_grad_norm": 0.33256691694259644}
60
+ {"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 6848.3877059380175, "train/update_time": 3357.13471879164, "train/lr": 0.000353055665302672, "train/loss": 5.076870918273926, "train/global_grad_norm": 0.3488343060016632}
61
+ {"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 7025.611038778035, "train/update_time": 3413.0350541696534, "train/lr": 0.0003379662370102746, "train/loss": 5.060288429260254, "train/global_grad_norm": 0.2855152189731598}
62
+ {"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 7081.627107631008, "train/update_time": 3468.931253584684, "train/lr": 0.00032303996298219405, "train/loss": 5.036261081695557, "train/global_grad_norm": 0.35105201601982117}
63
+ {"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 7257.753306786006, "train/update_time": 3524.844808488677, "train/lr": 0.00030829187271113034, "train/loss": 5.049121379852295, "train/global_grad_norm": 0.3490675091743469}
64
+ {"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 7313.765329002985, "train/update_time": 3580.7367782525835, "train/lr": 0.0002937368162738445, "train/loss": 5.05193567276001, "train/global_grad_norm": 0.3178498148918152}
65
+ {"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 7489.941271601012, "train/update_time": 3636.640745670593, "train/lr": 0.0002793894493783894, "train/loss": 5.022533416748047, "train/global_grad_norm": 0.29690152406692505}
66
+ {"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 7545.990838972037, "train/update_time": 3692.5671064984635, "train/lr": 0.00026526421860705474, "train/loss": 5.044463157653809, "train/global_grad_norm": 0.2601265013217926}
67
+ {"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 7722.533125129994, "train/update_time": 3748.472306053387, "train/lr": 0.0002513753468698824, "train/loss": 4.998659133911133, "train/global_grad_norm": 0.28301775455474854}
68
+ {"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 7778.552717441984, "train/update_time": 3804.3707672305172, "train/lr": 0.00023773681908340283, "train/loss": 5.004274368286133, "train/global_grad_norm": 0.22069884836673737}
69
+ {"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 7954.998239959998, "train/update_time": 3860.2436746915337, "train/lr": 0.00022436236808900823, "train/loss": 5.016230583190918, "train/global_grad_norm": 0.2676955461502075}
70
+ {"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 8010.996608729009, "train/update_time": 3916.130362140422, "train/lr": 0.00021126546082514682, "train/loss": 5.002098560333252, "train/global_grad_norm": 0.29775163531303406}
71
+ {"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 8188.256527994003, "train/update_time": 3972.0219007354463, "train/lr": 0.00019845928476725522, "train/loss": 4.991973400115967, "train/global_grad_norm": 0.325958788394928}
72
+ {"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 8244.260649731033, "train/update_time": 4027.906490651425, "train/lr": 0.0001859567346490913, "train/loss": 4.999189376831055, "train/global_grad_norm": 0.22328363358974457}
73
+ {"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 8421.350605542015, "train/update_time": 4083.8140410054475, "train/lr": 0.00017377039947882782, "train/loss": 5.008745193481445, "train/global_grad_norm": 0.24172931909561157}
74
+ {"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 8477.364581940987, "train/update_time": 4139.6967294025235, "train/lr": 0.00016191254986299043, "train/loss": 5.006008625030518, "train/global_grad_norm": 0.24032117426395416}
75
+ {"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 8654.290324420028, "train/update_time": 4195.576348111499, "train/lr": 0.00015039512565099468, "train/loss": 4.995713233947754, "train/global_grad_norm": 0.20805297791957855}
76
+ {"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 8710.28124093404, "train/update_time": 4251.451832082472, "train/lr": 0.00013922972391273224, "train/loss": 4.950077056884766, "train/global_grad_norm": 0.2368009090423584}
77
+ {"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 8886.364251475025, "train/update_time": 4307.327748217445, "train/lr": 0.00012842758726130281, "train/loss": 5.011436939239502, "train/global_grad_norm": 0.24461600184440613}
78
+ {"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 8942.332405220019, "train/update_time": 4363.189962119563, "train/lr": 0.00011799959253265679, "train/loss": 4.935585021972656, "train/global_grad_norm": 0.21502895653247833}
79
+ {"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 9118.950192205026, "train/update_time": 4419.082554180524, "train/lr": 0.00010795623983354214, "train/loss": 4.970619201660156, "train/global_grad_norm": 0.18588073551654816}
80
+ {"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 9174.941196507018, "train/update_time": 4474.963137161569, "train/lr": 9.830764196878872e-05, "train/loss": 4.93653678894043, "train/global_grad_norm": 0.21770106256008148}
81
+ {"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 9351.592722550035, "train/update_time": 4530.845779778552, "train/lr": 8.906351425856951e-05, "train/loss": 4.917532920837402, "train/global_grad_norm": 0.20093698799610138}
82
+ {"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 9407.59549859399, "train/update_time": 4586.732141316519, "train/lr": 8.02331647558977e-05, "train/loss": 4.933161735534668, "train/global_grad_norm": 0.1962786465883255}
83
+ {"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 9583.465221386985, "train/update_time": 4642.629586387542, "train/lr": 7.182548487420554e-05, "train/loss": 4.9604034423828125, "train/global_grad_norm": 0.16423940658569336}
84
+ {"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 9639.468442777987, "train/update_time": 4698.514196849486, "train/lr": 6.384894043444556e-05, "train/loss": 4.935092926025391, "train/global_grad_norm": 0.17330293357372284}
85
+ {"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 9814.94877268601, "train/update_time": 4754.397899497591, "train/lr": 5.6311563140726166e-05, "train/loss": 4.997830390930176, "train/global_grad_norm": 0.16490045189857483}
86
+ {"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 9870.947561467998, "train/update_time": 4810.2732713416335, "train/lr": 4.922094249306547e-05, "train/loss": 4.9721550941467285, "train/global_grad_norm": 0.16293860971927643}
87
+ {"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 10047.000041833031, "train/update_time": 4866.164181352709, "train/lr": 4.2584218145409916e-05, "train/loss": 4.91515588760376, "train/global_grad_norm": 0.1622975766658783}
88
+ {"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 10103.00234679901, "train/update_time": 4922.056656240777, "train/lr": 3.6408072716606236e-05, "train/loss": 4.9515910148620605, "train/global_grad_norm": 0.16384997963905334}
89
+ {"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 10278.560454143037, "train/update_time": 4977.952506872767, "train/lr": 3.069872506157217e-05, "train/loss": 4.990846157073975, "train/global_grad_norm": 0.15534919500350952}
90
+ {"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 10334.590985039016, "train/update_time": 5033.849530185806, "train/lr": 2.5461924009435368e-05, "train/loss": 4.921456336975098, "train/global_grad_norm": 0.16249564290046692}
91
+ {"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 10510.921276927984, "train/update_time": 5089.760406139714, "train/lr": 2.0702942574950812e-05, "train/loss": 4.945437431335449, "train/global_grad_norm": 0.15427514910697937}
92
+ {"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 10566.927748315036, "train/update_time": 5145.645859024662, "train/lr": 1.642657264902142e-05, "train/loss": 4.986889839172363, "train/global_grad_norm": 0.14873239398002625}
93
+ {"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 10742.247933529026, "train/update_time": 5201.534603161563, "train/lr": 1.2637120173670358e-05, "train/loss": 4.958912372589111, "train/global_grad_norm": 0.135355144739151}
94
+ {"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 10798.264622143994, "train/update_time": 5257.438326367526, "train/lr": 9.338400806321978e-06, "train/loss": 4.918151378631592, "train/global_grad_norm": 0.13301731646060944}
95
+ {"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 10974.142334740027, "train/update_time": 5313.31503523147, "train/lr": 6.533736077758867e-06, "train/loss": 4.9513726234436035, "train/global_grad_norm": 0.13848066329956055}
96
+ {"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 11030.159157333022, "train/update_time": 5369.213506219559, "train/lr": 4.2259500476214406e-06, "train/loss": 4.957179069519043, "train/global_grad_norm": 0.14191794395446777}
97
+ {"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 11205.89884431701, "train/update_time": 5425.1117221594905, "train/lr": 2.417366460819359e-06, "train/loss": 4.969225883483887, "train/global_grad_norm": 0.14331136643886566}
98
+ {"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 11261.909377036034, "train/update_time": 5481.002296196413, "train/lr": 1.1098064077174619e-06, "train/loss": 4.953543663024902, "train/global_grad_norm": 0.12667855620384216}
metrics/jsonlines/train_eval.jsonl CHANGED
@@ -1,19 +1,19 @@
1
- {"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1483.1021124350373, "train_eval/train_update_time": 908.4019459009869, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.24263097376177, "train_eval/perplexity_len_2048": 3799.523616442905, "train_eval/loss_avg_len_1024": 8.243660057830784, "train_eval/perplexity_len_1024": 3803.4356582311275, "train_eval/loss_avg_len_512": 8.244735129414474, "train_eval/perplexity_len_512": 3807.526822580914}
2
- {"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2950.672681943048, "train_eval/train_update_time": 1780.2358716417802, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.304204806905582, "train_eval/perplexity_len_2048": 546.866550733703, "train_eval/loss_avg_len_1024": 6.30830627080104, "train_eval/perplexity_len_1024": 549.1141101383427, "train_eval/loss_avg_len_512": 6.314299033218994, "train_eval/perplexity_len_512": 552.4147004897641}
3
- {"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4684.9639007470105, "train_eval/train_update_time": 2788.3531909267767, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.612906693891327, "train_eval/perplexity_len_2048": 273.93933969161543, "train_eval/loss_avg_len_1024": 5.618678117767923, "train_eval/perplexity_len_1024": 275.52493089567076, "train_eval/loss_avg_len_512": 5.63033959157925, "train_eval/perplexity_len_512": 278.75676501146086}
4
- {"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6104.984292554029, "train_eval/train_update_time": 3596.637184852676, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.231335072982001, "train_eval/perplexity_len_2048": 187.04235209109083, "train_eval/loss_avg_len_1024": 5.240096947866332, "train_eval/perplexity_len_1024": 188.68839445620821, "train_eval/loss_avg_len_512": 5.255696088007826, "train_eval/perplexity_len_512": 191.65484807445753}
5
- {"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7847.669167418033, "train_eval/train_update_time": 4567.680281858717, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.980712333356914, "train_eval/perplexity_len_2048": 145.57804482522243, "train_eval/loss_avg_len_1024": 4.989459048826356, "train_eval/perplexity_len_1024": 146.85695955994623, "train_eval/loss_avg_len_512": 5.007807282713766, "train_eval/perplexity_len_512": 149.5763975512185}
6
- {"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9281.588875080051, "train_eval/train_update_time": 5478.766576015565, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.805959661819288, "train_eval/perplexity_len_2048": 122.23674068512044, "train_eval/loss_avg_len_1024": 4.8155524545809385, "train_eval/perplexity_len_1024": 123.41497464824776, "train_eval/loss_avg_len_512": 4.837010983348955, "train_eval/perplexity_len_512": 126.0918971220228}
7
- {"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 11014.551226956013, "train_eval/train_update_time": 6321.31326986436, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.663479676307288, "train_eval/perplexity_len_2048": 106.00430179998108, "train_eval/loss_avg_len_1024": 4.677575560776713, "train_eval/perplexity_len_1024": 107.50910705570554, "train_eval/loss_avg_len_512": 4.7043979177456645, "train_eval/perplexity_len_512": 110.43177591493382}
8
- {"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 12483.889439035032, "train_eval/train_update_time": 7321.039978560642, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.545382353753193, "train_eval/perplexity_len_2048": 94.19643668589313, "train_eval/loss_avg_len_1024": 4.5609763441320315, "train_eval/perplexity_len_1024": 95.67684777192169, "train_eval/loss_avg_len_512": 4.592948676595988, "train_eval/perplexity_len_512": 98.78528699362364}
9
- {"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 14189.86244938703, "train_eval/train_update_time": 8136.602942996542, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.442484366352373, "train_eval/perplexity_len_2048": 84.98581552193247, "train_eval/loss_avg_len_1024": 4.464405859036887, "train_eval/perplexity_len_1024": 86.86940153422208, "train_eval/loss_avg_len_512": 4.504311476942166, "train_eval/perplexity_len_512": 90.40607594695932}
10
- {"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 15681.829017209006, "train_eval/train_update_time": 9104.969846359396, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.349756015720322, "train_eval/perplexity_len_2048": 77.45956170418766, "train_eval/loss_avg_len_1024": 4.3738111120650505, "train_eval/perplexity_len_1024": 79.34545060552118, "train_eval/loss_avg_len_512": 4.4205777487907465, "train_eval/perplexity_len_512": 83.14430800790313}
11
- {"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 17393.370531369, "train_eval/train_update_time": 10044.968985380197, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.263560818342057, "train_eval/perplexity_len_2048": 71.06257439048836, "train_eval/loss_avg_len_1024": 4.29050152948963, "train_eval/perplexity_len_1024": 73.00307251356847, "train_eval/loss_avg_len_512": 4.341866313039864, "train_eval/perplexity_len_512": 76.85083329331437}
12
- {"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 18844.378502471023, "train_eval/train_update_time": 10883.880087946192, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.193009602014271, "train_eval/perplexity_len_2048": 66.22179259272077, "train_eval/loss_avg_len_1024": 4.2229134864690785, "train_eval/perplexity_len_1024": 68.23198795379636, "train_eval/loss_avg_len_512": 4.282069746723865, "train_eval/perplexity_len_512": 72.39011426182817}
13
- {"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 20578.88107217505, "train_eval/train_update_time": 11891.677284248348, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.141280513120828, "train_eval/perplexity_len_2048": 62.883292797405154, "train_eval/loss_avg_len_1024": 4.178984067427664, "train_eval/perplexity_len_1024": 65.29947963586007, "train_eval/loss_avg_len_512": 4.243167548241873, "train_eval/perplexity_len_512": 69.62805312280105}
14
- {"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 21992.386447268014, "train_eval/train_update_time": 12710.86209141434, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.095283453380916, "train_eval/perplexity_len_2048": 60.05635992330559, "train_eval/loss_avg_len_1024": 4.135403087018712, "train_eval/perplexity_len_1024": 62.5147848907405, "train_eval/loss_avg_len_512": 4.204605343623225, "train_eval/perplexity_len_512": 66.99415277872542}
15
- {"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 23741.714356134005, "train_eval/train_update_time": 13647.155335442629, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.061455503564557, "train_eval/perplexity_len_2048": 58.05875433420926, "train_eval/loss_avg_len_1024": 4.103027131199469, "train_eval/perplexity_len_1024": 60.523222308524836, "train_eval/loss_avg_len_512": 4.175240817750128, "train_eval/perplexity_len_512": 65.05550429495806}
16
- {"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 25184.550976835017, "train_eval/train_update_time": 14590.463811040623, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.038684661284387, "train_eval/perplexity_len_2048": 56.75164605324487, "train_eval/loss_avg_len_1024": 4.082819647277592, "train_eval/perplexity_len_1024": 59.31247454945333, "train_eval/loss_avg_len_512": 4.1564677283795755, "train_eval/perplexity_len_512": 63.84560381712993}
17
- {"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 26014.853294235014, "train_eval/train_update_time": 15036.594645244593, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.0144972322152075, "train_eval/perplexity_len_2048": 55.39543735868876, "train_eval/loss_avg_len_1024": 4.053417672083615, "train_eval/perplexity_len_1024": 57.593958329267856, "train_eval/loss_avg_len_512": 4.128523392552015, "train_eval/perplexity_len_512": 62.086178302728364}
18
- {"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 26688.83935572201, "train_eval/train_update_time": 15482.672627257823, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.009689314002153, "train_eval/perplexity_len_2048": 55.12973986422666, "train_eval/loss_avg_len_1024": 4.05262587939289, "train_eval/perplexity_len_1024": 57.54837390312208, "train_eval/loss_avg_len_512": 4.128599714143057, "train_eval/perplexity_len_512": 62.09091699946827}
19
- {"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 27477.10213337402, "train_eval/train_update_time": 15928.765013526718, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.997795303428329, "train_eval/perplexity_len_2048": 54.47791027374324, "train_eval/loss_avg_len_1024": 4.043998843978606, "train_eval/perplexity_len_1024": 57.054037438461634, "train_eval/loss_avg_len_512": 4.119196443598303, "train_eval/perplexity_len_512": 61.509795810391275}
 
1
+ {"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 523.9415633900207, "train_eval/train_update_time": 283.0836244261591, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.44396705981344, "train_eval/perplexity_len_2048": 4646.953208330742, "train_eval/loss_avg_len_1024": 8.445717497379519, "train_eval/perplexity_len_1024": 4655.094533156048, "train_eval/loss_avg_len_512": 8.448021737337111, "train_eval/perplexity_len_512": 4665.833355646153}
2
+ {"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1044.3968842840404, "train_eval/train_update_time": 562.4800082092406, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.903726465636282, "train_eval/perplexity_len_2048": 995.9792914347637, "train_eval/loss_avg_len_1024": 6.910232449487958, "train_eval/perplexity_len_1024": 1002.4802412277218, "train_eval/loss_avg_len_512": 6.921446407868643, "train_eval/perplexity_len_512": 1013.7852815864962}
3
+ {"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1685.596672433021, "train_eval/train_update_time": 841.9682118772762, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.346844194442092, "train_eval/perplexity_len_2048": 570.6888811028199, "train_eval/loss_avg_len_1024": 6.355865296722332, "train_eval/perplexity_len_1024": 575.8604152670682, "train_eval/loss_avg_len_512": 6.373962009596871, "train_eval/perplexity_len_512": 586.3764618043518}
4
+ {"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2205.8133425950073, "train_eval/train_update_time": 1121.4434441442718, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.020796056668696, "train_eval/perplexity_len_2048": 411.906366057056, "train_eval/loss_avg_len_1024": 6.03143457122933, "train_eval/perplexity_len_1024": 416.3118301770113, "train_eval/loss_avg_len_512": 6.050862906577386, "train_eval/perplexity_len_512": 424.47915791229906}
5
+ {"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2845.8649667550344, "train_eval/train_update_time": 1400.8440078733838, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.778140883194901, "train_eval/perplexity_len_2048": 323.1578434258851, "train_eval/loss_avg_len_1024": 5.788657015446152, "train_eval/perplexity_len_1024": 326.5741457049287, "train_eval/loss_avg_len_512": 5.809641378527304, "train_eval/perplexity_len_512": 333.4995041431057}
6
+ {"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3365.815582766023, "train_eval/train_update_time": 1680.2926608613343, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.594451416916781, "train_eval/perplexity_len_2048": 268.93007912652223, "train_eval/loss_avg_len_1024": 5.604763740769849, "train_eval/perplexity_len_1024": 271.71772203045913, "train_eval/loss_avg_len_512": 5.627147741099179, "train_eval/perplexity_len_512": 277.8684335620044}
7
+ {"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4005.8882146670367, "train_eval/train_update_time": 1959.7687862973544, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.446014417338083, "train_eval/perplexity_len_2048": 231.83233521014682, "train_eval/loss_avg_len_1024": 5.45869701535783, "train_eval/perplexity_len_1024": 234.79129551578285, "train_eval/loss_avg_len_512": 5.483045409742335, "train_eval/perplexity_len_512": 240.57825223508894}
8
+ {"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4526.608443362988, "train_eval/train_update_time": 2239.197950529342, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.335741158013298, "train_eval/perplexity_len_2048": 207.62657589630717, "train_eval/loss_avg_len_1024": 5.3480183006091835, "train_eval/perplexity_len_1024": 210.19134880139666, "train_eval/loss_avg_len_512": 5.372833612357645, "train_eval/perplexity_len_512": 215.47256920235168}
9
+ {"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5168.203005189018, "train_eval/train_update_time": 2518.7051250302466, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.2465569315206455, "train_eval/perplexity_len_2048": 189.91126400613842, "train_eval/loss_avg_len_1024": 5.2607924398454635, "train_eval/perplexity_len_1024": 192.63408175185688, "train_eval/loss_avg_len_512": 5.287922178969602, "train_eval/perplexity_len_512": 197.93173113045845}
10
+ {"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5687.226690375013, "train_eval/train_update_time": 2798.1550497164135, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.177761742937019, "train_eval/perplexity_len_2048": 177.2855559320088, "train_eval/loss_avg_len_1024": 5.191431534552903, "train_eval/perplexity_len_1024": 179.72565234595527, "train_eval/loss_avg_len_512": 5.21939211089848, "train_eval/perplexity_len_512": 184.8217987580525}
11
+ {"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6327.491173934017, "train_eval/train_update_time": 3077.6138937743963, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.119764854961449, "train_eval/perplexity_len_2048": 167.29602616505414, "train_eval/loss_avg_len_1024": 5.130968105256434, "train_eval/perplexity_len_1024": 169.18082363462895, "train_eval/loss_avg_len_512": 5.1566913579488025, "train_eval/perplexity_len_512": 173.5891600118863}
12
+ {"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6848.3877059380175, "train_eval/train_update_time": 3357.13471879164, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.074158342274458, "train_eval/perplexity_len_2048": 159.83760679341086, "train_eval/loss_avg_len_1024": 5.085221332270266, "train_eval/perplexity_len_1024": 161.6157060504098, "train_eval/loss_avg_len_512": 5.112118527772254, "train_eval/perplexity_len_512": 166.0217041617718}
13
+ {"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7489.941271601012, "train_eval/train_update_time": 3636.640745670593, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.046691670220553, "train_eval/perplexity_len_2048": 155.50714361657919, "train_eval/loss_avg_len_1024": 5.06205342964935, "train_eval/perplexity_len_1024": 157.91444983555672, "train_eval/loss_avg_len_512": 5.089124586692706, "train_eval/perplexity_len_512": 162.24776601273498}
14
+ {"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8010.996608729009, "train_eval/train_update_time": 3916.130362140422, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.012758545674861, "train_eval/perplexity_len_2048": 150.3188260910137, "train_eval/loss_avg_len_1024": 5.0276956282409815, "train_eval/perplexity_len_1024": 152.58100391545096, "train_eval/loss_avg_len_512": 5.057494392056979, "train_eval/perplexity_len_512": 157.19615054325925}
15
+ {"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8654.290324420028, "train_eval/train_update_time": 4195.576348111499, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.991214820690475, "train_eval/perplexity_len_2048": 147.11503337066168, "train_eval/loss_avg_len_1024": 5.007109796925361, "train_eval/perplexity_len_1024": 149.4721065146317, "train_eval/loss_avg_len_512": 5.036278882724801, "train_eval/perplexity_len_512": 153.8962821102358}
16
+ {"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9174.941196507018, "train_eval/train_update_time": 4474.963137161569, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.975023514186086, "train_eval/perplexity_len_2048": 144.75222883900852, "train_eval/loss_avg_len_1024": 4.991780238335013, "train_eval/perplexity_len_1024": 147.1982383268648, "train_eval/loss_avg_len_512": 5.02088207897912, "train_eval/perplexity_len_512": 151.54491944344505}
17
+ {"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9814.94877268601, "train_eval/train_update_time": 4754.397899497591, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.9583539266328085, "train_eval/perplexity_len_2048": 142.35926912432882, "train_eval/loss_avg_len_1024": 4.970177483169246, "train_eval/perplexity_len_1024": 144.05245198792426, "train_eval/loss_avg_len_512": 4.999460103386228, "train_eval/perplexity_len_512": 148.33305296699936}
18
+ {"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 10334.590985039016, "train_eval/train_update_time": 5033.849530185806, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.958045211400968, "train_eval/perplexity_len_2048": 142.31532743264023, "train_eval/loss_avg_len_1024": 4.973284861676475, "train_eval/perplexity_len_1024": 144.5007736729874, "train_eval/loss_avg_len_512": 5.002621874920951, "train_eval/perplexity_len_512": 148.80279040143105}
19
+ {"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 10974.142334740027, "train_eval/train_update_time": 5313.31503523147, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.94873155637828, "train_eval/perplexity_len_2048": 140.9960049626137, "train_eval/loss_avg_len_1024": 4.965872568360719, "train_eval/perplexity_len_1024": 143.43365135181708, "train_eval/loss_avg_len_512": 4.994268174605532, "train_eval/perplexity_len_512": 147.56491410680255}
metrics/jsonlines/val.jsonl CHANGED
@@ -1,49 +1,49 @@
1
- {"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 393.3033275610069, "val/train_update_time": 392.7191811740049, "val/loss": 8.011235007830686, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 267.4548993270146, "val/val_tokens_per_second": 153147.3160636276, "val/loss_avg_len_2048": 8.011235007830686, "val/perplexity_len_2048": 3014.637916232049, "val/loss_avg_len_1024": 8.010095824780269, "val/perplexity_len_1024": 3011.205647177279, "val/loss_avg_len_512": 8.010851014741046, "val/perplexity_len_512": 3013.4805383312287}
2
- {"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 1014.3213793600444, "val/train_update_time": 745.8892338059959, "val/loss": 7.126910854516412, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 306.0501248690416, "val/val_tokens_per_second": 133834.28618931858, "val/loss_avg_len_2048": 7.126910854516412, "val/perplexity_len_2048": 1245.0249570440558, "val/loss_avg_len_1024": 7.127682446160587, "val/perplexity_len_1024": 1245.9859786081254, "val/loss_avg_len_512": 7.131570649632625, "val/perplexity_len_512": 1250.840056321792}
3
- {"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 1633.8735543090152, "val/train_update_time": 1058.99380204099, "val/loss": 6.576425216046255, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 318.1715274940361, "val/val_tokens_per_second": 128735.59215875393, "val/loss_avg_len_2048": 6.576425216046255, "val/perplexity_len_2048": 717.9681552507965, "val/loss_avg_len_1024": 6.578036571076419, "val/perplexity_len_1024": 719.1259894397066, "val/loss_avg_len_512": 6.584267685109842, "val/perplexity_len_512": 723.6209352004278}
4
- {"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 2296.034582130029, "val/train_update_time": 1402.5742697739624, "val/loss": 6.157745288895025, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 276.36380792903947, "val/val_tokens_per_second": 148210.43430736448, "val/loss_avg_len_2048": 6.157745288895025, "val/perplexity_len_2048": 472.3618337815753, "val/loss_avg_len_1024": 6.160346911049076, "val/perplexity_len_1024": 473.59234075612176, "val/loss_avg_len_512": 6.168603237254825, "val/perplexity_len_512": 477.5186597915891}
5
- {"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 2950.672681943048, "val/train_update_time": 1780.2358716417802, "val/loss": 5.881033806435461, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 242.76836909895064, "val/val_tokens_per_second": 168720.49745205892, "val/loss_avg_len_2048": 5.881033806435461, "val/perplexity_len_2048": 358.17933847684697, "val/loss_avg_len_1024": 5.885018851229502, "val/perplexity_len_1024": 359.60954701480983, "val/loss_avg_len_512": 5.895661722176336, "val/perplexity_len_512": 363.45726407358717}
6
- {"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 3595.7228351200465, "val/train_update_time": 2181.321583448793, "val/loss": 5.645073470176547, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 225.88207412901102, "val/val_tokens_per_second": 181333.5571578202, "val/loss_avg_len_2048": 5.645073470176547, "val/perplexity_len_2048": 282.89433975637286, "val/loss_avg_len_1024": 5.6507063047187405, "val/perplexity_len_1024": 284.49233315615174, "val/loss_avg_len_512": 5.663727438149043, "val/perplexity_len_512": 288.22096863377254}
7
- {"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 4217.221132909006, "val/train_update_time": 2576.439015968761, "val/loss": 5.478189624139876, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 255.62614679103717, "val/val_tokens_per_second": 160233.9999807725, "val/loss_avg_len_2048": 5.478189624139876, "val/perplexity_len_2048": 239.4128874921015, "val/loss_avg_len_1024": 5.484750836705976, "val/perplexity_len_1024": 240.9888909309404, "val/loss_avg_len_512": 5.498833170221839, "val/perplexity_len_512": 244.40658493943135}
8
- {"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 4838.394258918008, "val/train_update_time": 2941.477261595719, "val/loss": 5.316759455512837, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 292.2664110100013, "val/val_tokens_per_second": 140146.10799254093, "val/loss_avg_len_2048": 5.316759455512837, "val/perplexity_len_2048": 203.72263890338473, "val/loss_avg_len_1024": 5.324367502401863, "val/perplexity_len_1024": 205.27848124862788, "val/loss_avg_len_512": 5.339848569961358, "val/perplexity_len_512": 208.48113759190758}
9
- {"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 5457.476125103014, "val/train_update_time": 3267.897231984767, "val/loss": 5.191145182319521, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 318.3786125089973, "val/val_tokens_per_second": 128651.85785317938, "val/loss_avg_len_2048": 5.191145182319521, "val/perplexity_len_2048": 179.6741948718429, "val/loss_avg_len_1024": 5.199742717809835, "val/perplexity_len_1024": 181.2256097548097, "val/loss_avg_len_512": 5.216735304176062, "val/perplexity_len_512": 184.3314146769773}
10
- {"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 6104.984292554029, "val/train_update_time": 3596.637184852676, "val/loss": 5.0863237302411815, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 290.45753065496683, "val/val_tokens_per_second": 141018.89494012189, "val/loss_avg_len_2048": 5.0863237302411815, "val/perplexity_len_2048": 161.79396911719607, "val/loss_avg_len_1024": 5.095744642082509, "val/perplexity_len_1024": 163.3254183339546, "val/loss_avg_len_512": 5.11419914171081, "val/perplexity_len_512": 166.3674908330965}
11
- {"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 6762.651720013004, "val/train_update_time": 3962.670169218676, "val/loss": 4.994484119823133, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 254.08249414898455, "val/val_tokens_per_second": 161207.4855341375, "val/loss_avg_len_2048": 4.994484119823133, "val/perplexity_len_2048": 147.59678348518568, "val/loss_avg_len_1024": 5.004749132925971, "val/perplexity_len_1024": 149.11966925052025, "val/loss_avg_len_512": 5.024751520049106, "val/perplexity_len_512": 152.13244955218386}
12
- {"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 7413.931084726995, "val/train_update_time": 4359.249880091811, "val/loss": 4.917526679099561, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 224.98638561402913, "val/val_tokens_per_second": 182055.4603257999, "val/loss_avg_len_2048": 4.917526679099561, "val/perplexity_len_2048": 136.66418045875457, "val/loss_avg_len_1024": 4.928665879917425, "val/perplexity_len_1024": 138.1950205479977, "val/loss_avg_len_512": 4.950037384534534, "val/perplexity_len_512": 141.18024178044544}
13
- {"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 8040.308323490026, "val/train_update_time": 4760.0058897937415, "val/loss": 4.846403618092416, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 243.83976028999314, "val/val_tokens_per_second": 167979.1677587248, "val/loss_avg_len_2048": 4.846403618092416, "val/perplexity_len_2048": 127.281811724418, "val/loss_avg_len_1024": 4.85831240278692, "val/perplexity_len_1024": 128.80664484962617, "val/loss_avg_len_512": 4.881023855017313, "val/perplexity_len_512": 131.7655036178709}
14
- {"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 8661.466409554007, "val/train_update_time": 5136.805537326669, "val/loss": 4.785894058128424, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 277.76102412998443, "val/val_tokens_per_second": 147464.89406962966, "val/loss_avg_len_2048": 4.785894058128424, "val/perplexity_len_2048": 119.80843091580816, "val/loss_avg_len_1024": 4.7990697121755685, "val/perplexity_len_1024": 121.39743042018526, "val/loss_avg_len_512": 4.823691271648556, "val/perplexity_len_512": 124.4235251872631}
15
- {"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 9281.588875080051, "val/train_update_time": 5478.766576015565, "val/loss": 4.726049356866, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 318.06440460996237, "val/val_tokens_per_second": 128778.94981750201, "val/loss_avg_len_2048": 4.726049356866, "val/perplexity_len_2048": 112.84885498484593, "val/loss_avg_len_1024": 4.740272974298615, "val/perplexity_len_1024": 114.4654435400183, "val/loss_avg_len_512": 4.766912040895224, "val/perplexity_len_512": 117.55567379454433}
16
- {"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 9914.599941182008, "val/train_update_time": 5792.780345006497, "val/loss": 4.670938681109856, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 304.96344034397043, "val/val_tokens_per_second": 134311.18154294472, "val/loss_avg_len_2048": 4.670938681109856, "val/perplexity_len_2048": 106.79794460932423, "val/loss_avg_len_1024": 4.68608565857932, "val/perplexity_len_1024": 108.42792413784692, "val/loss_avg_len_512": 4.7141645993726335, "val/perplexity_len_512": 111.51561203756256}
17
- {"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 10574.152368383016, "val/train_update_time": 6146.868059828412, "val/loss": 4.6206715288446985, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 265.6431397149572, "val/val_tokens_per_second": 154191.82307493905, "val/loss_avg_len_2048": 4.6206715288446985, "val/perplexity_len_2048": 101.56221118922583, "val/loss_avg_len_1024": 4.637006295820186, "val/perplexity_len_1024": 103.23482997037543, "val/loss_avg_len_512": 4.6670295943367295, "val/perplexity_len_512": 106.38127710196552}
18
- {"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 11226.902240763011, "val/train_update_time": 6533.353484157473, "val/loss": 4.57530604224482, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 234.03315117699094, "val/val_tokens_per_second": 175017.93995425635, "val/loss_avg_len_2048": 4.57530604224482, "val/perplexity_len_2048": 97.05773836021801, "val/loss_avg_len_1024": 4.59291932442165, "val/perplexity_len_1024": 98.78238747321159, "val/loss_avg_len_512": 4.6248253861254085, "val/perplexity_len_512": 101.98496353821467}
19
- {"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 11862.473320519028, "val/train_update_time": 6934.267906721507, "val/loss": 4.536200403668335, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 234.15100623102626, "val/val_tokens_per_second": 174929.84830305027, "val/loss_avg_len_2048": 4.536200403668335, "val/perplexity_len_2048": 93.3354883471502, "val/loss_avg_len_1024": 4.555115518093761, "val/perplexity_len_1024": 95.11774242124372, "val/loss_avg_len_512": 4.5892477317730895, "val/perplexity_len_512": 98.42036379398839}
20
- {"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 12483.889439035032, "val/train_update_time": 7321.039978560642, "val/loss": 4.493805073933746, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 265.7580173549941, "val/val_tokens_per_second": 154125.17149120086, "val/loss_avg_len_2048": 4.493805073933746, "val/perplexity_len_2048": 89.46120556473534, "val/loss_avg_len_1024": 4.514267320341524, "val/perplexity_len_1024": 91.3106400591625, "val/loss_avg_len_512": 4.5507813966460535, "val/perplexity_len_512": 94.70638265927766}
21
- {"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 13104.464458574017, "val/train_update_time": 7674.994709033519, "val/loss": 4.453331883506127, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 305.22737336700084, "val/val_tokens_per_second": 134195.04138231504, "val/loss_avg_len_2048": 4.453331883506127, "val/perplexity_len_2048": 85.91271882443378, "val/loss_avg_len_1024": 4.475549799342454, "val/perplexity_len_1024": 87.84288309500913, "val/loss_avg_len_512": 4.514732480961177, "val/perplexity_len_512": 91.35312405324778}
22
- {"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 13721.99477496103, "val/train_update_time": 7986.941047978529, "val/loss": 4.41391773877088, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 318.02968962700106, "val/val_tokens_per_second": 128793.00686687352, "val/loss_avg_len_2048": 4.41391773877088, "val/perplexity_len_2048": 82.59240596006008, "val/loss_avg_len_1024": 4.437905768002942, "val/perplexity_len_1024": 84.59758905201315, "val/loss_avg_len_512": 4.480083948131558, "val/perplexity_len_512": 88.24208012247752}
23
- {"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 14381.773025262053, "val/train_update_time": 8328.335106454615, "val/loss": 4.375631547093648, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 277.73073812999064, "val/val_tokens_per_second": 147480.9748312009, "val/loss_avg_len_2048": 4.375631547093648, "val/perplexity_len_2048": 79.4900253977448, "val/loss_avg_len_1024": 4.401483430209105, "val/perplexity_len_1024": 81.57178500740743, "val/loss_avg_len_512": 4.44655481751617, "val/perplexity_len_512": 85.3324511366961}
24
- {"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 15035.86255901301, "val/train_update_time": 8704.087580051564, "val/loss": 4.339762180510187, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 244.46324991004076, "val/val_tokens_per_second": 167550.7464417361, "val/loss_avg_len_2048": 4.339762180510187, "val/perplexity_len_2048": 76.6892989594701, "val/loss_avg_len_1024": 4.3674431190324015, "val/perplexity_len_1024": 78.84178470140286, "val/loss_avg_len_512": 4.415505284006894, "val/perplexity_len_512": 82.72362927458667}
25
- {"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 15681.829017209006, "val/train_update_time": 9104.969846359396, "val/loss": 4.306412981705158, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 225.08503117703367, "val/val_tokens_per_second": 181975.6728637551, "val/loss_avg_len_2048": 4.306412981705158, "val/perplexity_len_2048": 74.17394787427155, "val/loss_avg_len_1024": 4.335562960951775, "val/perplexity_len_1024": 76.3679389581835, "val/loss_avg_len_512": 4.386069002373144, "val/perplexity_len_512": 80.32404392703333}
26
- {"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 16303.791181104025, "val/train_update_time": 9500.859303092468, "val/loss": 4.2748331887701765, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 254.23934659303632, "val/val_tokens_per_second": 161108.0289061831, "val/loss_avg_len_2048": 4.2748331887701765, "val/perplexity_len_2048": 71.8681498952332, "val/loss_avg_len_1024": 4.306274790479569, "val/perplexity_len_1024": 74.16369839371849, "val/loss_avg_len_512": 4.360181407163851, "val/perplexity_len_512": 78.27133211563368}
27
- {"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 16924.293944418023, "val/train_update_time": 9866.63232135144, "val/loss": 4.246361440885579, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 290.5493878980051, "val/val_tokens_per_second": 140974.31179025117, "val/loss_avg_len_2048": 4.246361440885579, "val/perplexity_len_2048": 69.85079316400844, "val/loss_avg_len_1024": 4.279776382185658, "val/perplexity_len_1024": 72.22428756407818, "val/loss_avg_len_512": 4.336207040463295, "val/perplexity_len_512": 76.41714182658222}
28
- {"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 17543.088614015025, "val/train_update_time": 10194.5136101502, "val/loss": 4.216620016333461, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 317.9768462689826, "val/val_tokens_per_second": 128814.41048494192, "val/loss_avg_len_2048": 4.216620016333461, "val/perplexity_len_2048": 67.8039204033866, "val/loss_avg_len_1024": 4.251321952285059, "val/perplexity_len_1024": 70.19814964036787, "val/loss_avg_len_512": 4.309930654432904, "val/perplexity_len_512": 74.43532700140584}
29
- {"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 18186.98511047603, "val/train_update_time": 10520.074710074114, "val/loss": 4.190980965757324, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 292.99493647099007, "val/val_tokens_per_second": 139797.63777950313, "val/loss_avg_len_2048": 4.190980965757324, "val/perplexity_len_2048": 66.08758883459666, "val/loss_avg_len_1024": 4.227650031735189, "val/perplexity_len_1024": 68.55593845078425, "val/loss_avg_len_512": 4.28926362375198, "val/perplexity_len_512": 72.91275750350573}
30
- {"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 18844.378502471023, "val/train_update_time": 10883.880087946192, "val/loss": 4.169307191771875, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 256.15296956995735, "val/val_tokens_per_second": 159904.45111280863, "val/loss_avg_len_2048": 4.169307191771875, "val/perplexity_len_2048": 64.67063223727632, "val/loss_avg_len_1024": 4.206767542274017, "val/perplexity_len_1024": 67.13916416068649, "val/loss_avg_len_512": 4.269754838026688, "val/perplexity_len_512": 71.50410338310292}
31
- {"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 19496.670543703018, "val/train_update_time": 11278.836069991172, "val/loss": 4.146260214753799, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 226.11030463204952, "val/val_tokens_per_second": 181150.52326630766, "val/loss_avg_len_2048": 4.146260214753799, "val/perplexity_len_2048": 63.19721380099956, "val/loss_avg_len_1024": 4.185796775772237, "val/perplexity_len_1024": 65.74586476501416, "val/loss_avg_len_512": 4.251360247742944, "val/perplexity_len_512": 70.20083796212602}
32
- {"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 20124.23348677403, "val/train_update_time": 11679.664113470295, "val/loss": 4.126677757895738, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 242.41475300304592, "val/val_tokens_per_second": 168966.61400589487, "val/loss_avg_len_2048": 4.126677757895738, "val/perplexity_len_2048": 61.97169557951531, "val/loss_avg_len_1024": 4.167139344774094, "val/perplexity_len_1024": 64.53058805336352, "val/loss_avg_len_512": 4.234325625280757, "val/perplexity_len_512": 69.015120983761}
33
- {"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 20744.93009287404, "val/train_update_time": 12057.427525710431, "val/loss": 4.10886636920611, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 275.83039805601584, "val/val_tokens_per_second": 148497.04850761886, "val/loss_avg_len_2048": 4.10886636920611, "val/perplexity_len_2048": 60.87766564089016, "val/loss_avg_len_1024": 4.150810060911067, "val/perplexity_len_1024": 63.485406520472054, "val/loss_avg_len_512": 4.219681530099177, "val/perplexity_len_512": 68.01182112214181}
34
- {"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 21364.400057210005, "val/train_update_time": 12400.716242285387, "val/loss": 4.093701358500426, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 317.5061725229607, "val/val_tokens_per_second": 129005.36602020846, "val/loss_avg_len_2048": 4.093701358500426, "val/perplexity_len_2048": 59.961420185369775, "val/loss_avg_len_1024": 4.136613275307044, "val/perplexity_len_1024": 62.59048534792726, "val/loss_avg_len_512": 4.206974884290714, "val/perplexity_len_512": 67.15308637366292}
35
- {"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 21992.386447268014, "val/train_update_time": 12710.86209141434, "val/loss": 4.0797728683573435, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 307.7034010089701, "val/val_tokens_per_second": 133115.2007605075, "val/loss_avg_len_2048": 4.0797728683573435, "val/perplexity_len_2048": 59.13203756766548, "val/loss_avg_len_1024": 4.123041846866394, "val/perplexity_len_1024": 61.74678114027358, "val/loss_avg_len_512": 4.1944116867515255, "val/perplexity_len_512": 66.31470627859792}
36
- {"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 22653.044829952996, "val/train_update_time": 13062.762519414595, "val/loss": 4.067890912848758, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 267.4376528779976, "val/val_tokens_per_second": 153157.19218746488, "val/loss_avg_len_2048": 4.067890912848758, "val/perplexity_len_2048": 58.43359100092499, "val/loss_avg_len_1024": 4.11147258811649, "val/perplexity_len_1024": 61.036533096062215, "val/loss_avg_len_512": 4.183689289411436, "val/perplexity_len_512": 65.60745215431737}
37
- {"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 23305.888173635, "val/train_update_time": 13447.544946793583, "val/loss": 4.056387900622422, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 235.9053416660172, "val/val_tokens_per_second": 173628.96367980124, "val/loss_avg_len_2048": 4.056387900622422, "val/perplexity_len_2048": 57.765279854047876, "val/loss_avg_len_1024": 4.101134180352232, "val/perplexity_len_1024": 60.4087631904987, "val/loss_avg_len_512": 4.17459845949933, "val/perplexity_len_512": 65.01372877387615}
38
- {"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 23943.319328956015, "val/train_update_time": 13848.439878219564, "val/loss": 4.046861652540835, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 233.5193387119798, "val/val_tokens_per_second": 175403.03182564085, "val/loss_avg_len_2048": 4.046861652540835, "val/perplexity_len_2048": 57.21760624672959, "val/loss_avg_len_1024": 4.0928021227691325, "val/perplexity_len_1024": 59.90752496972356, "val/loss_avg_len_512": 4.167273688912205, "val/perplexity_len_512": 64.53925794195865}
39
- {"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 24564.46875228104, "val/train_update_time": 14235.556923599623, "val/loss": 4.038371388106723, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 264.80620891199214, "val/val_tokens_per_second": 154679.15260858924, "val/loss_avg_len_2048": 4.038371388106723, "val/perplexity_len_2048": 56.73387006926299, "val/loss_avg_len_1024": 4.084637223889167, "val/perplexity_len_1024": 59.42037754724601, "val/loss_avg_len_512": 4.159778234884888, "val/perplexity_len_512": 64.05731534674663}
40
- {"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 25184.550976835017, "val/train_update_time": 14590.463811040623, "val/loss": 4.031109951576823, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 155.41316219599685, "val/val_tokens_per_second": 263555.5407356292, "val/loss_avg_len_2048": 4.031109951576823, "val/perplexity_len_2048": 56.32339280361645, "val/loss_avg_len_1024": 4.077683951509232, "val/perplexity_len_1024": 59.00864458184025, "val/loss_avg_len_512": 4.153272050888557, "val/perplexity_len_512": 63.64189951809101}
41
- {"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 25519.29751138203, "val/train_update_time": 14768.911977301526, "val/loss": 4.024918882568856, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 113.86328060599044, "val/val_tokens_per_second": 359729.6668601788, "val/loss_avg_len_2048": 4.024918882568856, "val/perplexity_len_2048": 55.97576798697239, "val/loss_avg_len_1024": 4.0717748931922015, "val/perplexity_len_1024": 58.660987235159695, "val/loss_avg_len_512": 4.147944821586087, "val/perplexity_len_512": 63.303765983218696}
42
- {"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 25811.900586274045, "val/train_update_time": 14947.35764870455, "val/loss": 4.020128366824519, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 113.57747345400276, "val/val_tokens_per_second": 360634.8931206698, "val/loss_avg_len_2048": 4.020128366824519, "val/perplexity_len_2048": 55.708256459815, "val/loss_avg_len_1024": 4.067610871110111, "val/perplexity_len_1024": 58.41722944757159, "val/loss_avg_len_512": 4.144215765030589, "val/perplexity_len_512": 63.06814225983229}
43
- {"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 26104.19660033105, "val/train_update_time": 15125.80504047859, "val/loss": 4.015582729529799, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 113.59510378103005, "val/val_tokens_per_second": 360578.9214203805, "val/loss_avg_len_2048": 4.015582729529799, "val/perplexity_len_2048": 55.4556016053387, "val/loss_avg_len_1024": 4.063153423574707, "val/perplexity_len_1024": 58.15741719207481, "val/loss_avg_len_512": 4.140098871274945, "val/perplexity_len_512": 62.809031151203996}
44
- {"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 26396.506954086013, "val/train_update_time": 15304.24179791665, "val/loss": 4.012362551404745, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 113.62905184895499, "val/val_tokens_per_second": 360471.1940608937, "val/loss_avg_len_2048": 4.012362551404745, "val/perplexity_len_2048": 55.277311906496806, "val/loss_avg_len_1024": 4.060099166782666, "val/perplexity_len_1024": 57.980060489936854, "val/loss_avg_len_512": 4.137323217885289, "val/perplexity_len_512": 62.63493677558229}
45
- {"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 26688.83935572201, "val/train_update_time": 15482.672627257823, "val/loss": 4.0098582680169725, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 113.5941264309804, "val/val_tokens_per_second": 360582.02379757044, "val/loss_avg_len_2048": 4.0098582680169725, "val/perplexity_len_2048": 55.139055042008856, "val/loss_avg_len_1024": 4.057653654730785, "val/perplexity_len_1024": 57.83844278771777, "val/loss_avg_len_512": 4.134992124096118, "val/perplexity_len_512": 62.48909891039072}
46
- {"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 26981.709376943007, "val/train_update_time": 15661.105290838808, "val/loss": 4.0080952594994805, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 113.74540637602331, "val/val_tokens_per_second": 360102.4542880711, "val/loss_avg_len_2048": 4.0080952594994805, "val/perplexity_len_2048": 55.04193005956597, "val/loss_avg_len_1024": 4.055931827918859, "val/perplexity_len_1024": 57.73894069343542, "val/loss_avg_len_512": 4.133420608581789, "val/perplexity_len_512": 62.3909734450169}
47
- {"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 27274.170978915005, "val/train_update_time": 15839.539326993749, "val/loss": 4.006876775859995, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 113.56194219802273, "val/val_tokens_per_second": 360684.2152151319, "val/loss_avg_len_2048": 4.006876775859995, "val/perplexity_len_2048": 54.97490321215408, "val/loss_avg_len_1024": 4.054881287327689, "val/perplexity_len_1024": 57.67831544276481, "val/loss_avg_len_512": 4.132452442725096, "val/perplexity_len_512": 62.33059786626467}
48
- {"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 27566.45336975, "val/train_update_time": 16017.985979812802, "val/loss": 4.006194728950761, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 113.64657631900627, "val/val_tokens_per_second": 360415.608869951, "val/loss_avg_len_2048": 4.006194728950761, "val/perplexity_len_2048": 54.93742053325847, "val/loss_avg_len_1024": 4.05419063684349, "val/perplexity_len_1024": 57.638493639333326, "val/loss_avg_len_512": 4.131800313583575, "val/perplexity_len_512": 62.289963517849074}
49
- {"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 27858.825974697014, "val/train_update_time": 16196.423335834814, "val/loss": 4.005892366719665, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 113.76460951002082, "val/val_tokens_per_second": 360041.6700449544, "val/loss_avg_len_2048": 4.005892366719665, "val/perplexity_len_2048": 54.92081204323195, "val/loss_avg_len_1024": 4.053872466314816, "val/perplexity_len_1024": 57.62015768647472, "val/loss_avg_len_512": 4.13152233827468, "val/perplexity_len_512": 62.27265085235736}
 
1
+ {"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 115.77270240103826, "val/train_update_time": 115.42706217308296, "val/loss": 8.015673879027368, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.22489377000602, "val/val_tokens_per_second": 340694.8321231854, "val/loss_avg_len_2048": 8.015673879027368, "val/perplexity_len_2048": 3028.0492492157514, "val/loss_avg_len_1024": 8.014426561307907, "val/perplexity_len_1024": 3024.2746642745205, "val/loss_avg_len_512": 8.015268499183655, "val/perplexity_len_512": 3026.8219878546656}
2
+ {"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 348.0203448670218, "val/train_update_time": 227.1939903421444, "val/loss": 7.432892477142811, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.90724678500555, "val/val_tokens_per_second": 341597.36878490367, "val/loss_avg_len_2048": 7.432892477142811, "val/perplexity_len_2048": 1690.690792134549, "val/loss_avg_len_1024": 7.434750729179383, "val/perplexity_len_1024": 1693.8354426139251, "val/loss_avg_len_512": 7.4413878100395205, "val/perplexity_len_512": 1705.114955523971}
3
+ {"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 579.9503910699859, "val/train_update_time": 338.96602103614714, "val/loss": 7.108143065983057, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.57100503600668, "val/val_tokens_per_second": 339716.8331454807, "val/loss_avg_len_2048": 7.108143065983057, "val/perplexity_len_2048": 1221.8764941210595, "val/loss_avg_len_1024": 7.111402546131611, "val/perplexity_len_1024": 1225.8656740922706, "val/loss_avg_len_512": 7.121028823280335, "val/perplexity_len_512": 1237.7231770584187}
4
+ {"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 812.5387418420287, "val/train_update_time": 450.7242119802977, "val/loss": 6.80320226866305, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.84687927697087, "val/val_tokens_per_second": 341769.4331893267, "val/loss_avg_len_2048": 6.80320226866305, "val/perplexity_len_2048": 900.7270483130554, "val/loss_avg_len_1024": 6.8087645598977815, "val/perplexity_len_1024": 905.7511141842256, "val/loss_avg_len_512": 6.8227169809758665, "val/perplexity_len_512": 918.4771078858294}
5
+ {"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 1044.3968842840404, "val/train_update_time": 562.4800082092406, "val/loss": 6.553184749881178, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.57042892603204, "val/val_tokens_per_second": 339718.45638144226, "val/loss_avg_len_2048": 6.553184749881178, "val/perplexity_len_2048": 701.4746414709258, "val/loss_avg_len_1024": 6.560025909277051, "val/perplexity_len_1024": 706.2899938214338, "val/loss_avg_len_512": 6.576782421255111, "val/perplexity_len_512": 718.2246630258743}
6
+ {"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 1277.4677367979893, "val/train_update_time": 674.2627646423061, "val/loss": 6.368516060034558, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.46988375496585, "val/val_tokens_per_second": 342847.9103906173, "val/loss_avg_len_2048": 6.368516060034558, "val/perplexity_len_2048": 583.1917648892949, "val/loss_avg_len_1024": 6.376286262953282, "val/perplexity_len_1024": 587.7409343394978, "val/loss_avg_len_512": 6.39470073364526, "val/perplexity_len_512": 598.6641363781473}
7
+ {"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 1509.0055129660177, "val/train_update_time": 786.0731882582186, "val/loss": 6.228333079337515, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.57018486200832, "val/val_tokens_per_second": 339719.1440560402, "val/loss_avg_len_2048": 6.228333079337515, "val/perplexity_len_2048": 506.9098004279741, "val/loss_avg_len_1024": 6.2365625457014895, "val/perplexity_len_1024": 511.0986097727111, "val/loss_avg_len_512": 6.255868876511604, "val/perplexity_len_512": 521.0619165878901}
8
+ {"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 1741.6182956020348, "val/train_update_time": 897.8648126212647, "val/loss": 6.0991746725922455, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.50438052997924, "val/val_tokens_per_second": 339904.6559125701, "val/loss_avg_len_2048": 6.0991746725922455, "val/perplexity_len_2048": 445.489943254568, "val/loss_avg_len_1024": 6.108094833559171, "val/perplexity_len_1024": 449.4815617297244, "val/loss_avg_len_512": 6.1286831315368415, "val/perplexity_len_512": 458.83154187819093}
9
+ {"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 1974.1614223060315, "val/train_update_time": 1009.6508660353138, "val/loss": 5.986035494066402, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.61198412301019, "val/val_tokens_per_second": 342440.60325825145, "val/loss_avg_len_2048": 5.986035494066402, "val/perplexity_len_2048": 397.8342630532667, "val/loss_avg_len_1024": 5.995515969962067, "val/perplexity_len_1024": 401.62385638490093, "val/loss_avg_len_512": 6.017069597534835, "val/perplexity_len_512": 410.37427023615345}
10
+ {"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 2205.8133425950073, "val/train_update_time": 1121.4434441442718, "val/loss": 5.883162301799842, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.17791116697481, "val/val_tokens_per_second": 343687.84952618263, "val/loss_avg_len_2048": 5.883162301799842, "val/perplexity_len_2048": 358.9425334787863, "val/loss_avg_len_1024": 5.893183481512219, "val/perplexity_len_1024": 362.5576446991801, "val/loss_avg_len_512": 5.915859284684807, "val/perplexity_len_512": 370.8728511838533}
11
+ {"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 2437.459500588011, "val/train_update_time": 1233.201419278339, "val/loss": 5.795106819085591, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.89575185399735, "val/val_tokens_per_second": 341630.119221229, "val/loss_avg_len_2048": 5.795106819085591, "val/perplexity_len_2048": 328.6872921828014, "val/loss_avg_len_1024": 5.80581705780169, "val/perplexity_len_1024": 332.2265307581603, "val/loss_avg_len_512": 5.8295803298715505, "val/perplexity_len_512": 340.21587066949}
12
+ {"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 2669.367173767998, "val/train_update_time": 1344.9662226063083, "val/loss": 5.713464564111806, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.50630050798645, "val/val_tokens_per_second": 339899.2403495567, "val/loss_avg_len_2048": 5.713464564111806, "val/perplexity_len_2048": 302.9187337565111, "val/loss_avg_len_1024": 5.72461780461669, "val/perplexity_len_1024": 306.3161702421054, "val/loss_avg_len_512": 5.748771319710836, "val/perplexity_len_512": 313.8048574770508}
13
+ {"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 2901.8806591850007, "val/train_update_time": 1456.7321346284007, "val/loss": 5.634124744190066, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.06040904897964, "val/val_tokens_per_second": 341161.5896068622, "val/loss_avg_len_2048": 5.634124744190066, "val/perplexity_len_2048": 279.8139013573606, "val/loss_avg_len_1024": 5.645964613835513, "val/perplexity_len_1024": 283.1465516152128, "val/loss_avg_len_512": 5.671252074276097, "val/perplexity_len_512": 290.3979066084578}
14
+ {"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 3133.951200322015, "val/train_update_time": 1568.5041498313076, "val/loss": 5.5654044165277625, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.84437395603163, "val/val_tokens_per_second": 341776.5778060417, "val/loss_avg_len_2048": 5.5654044165277625, "val/perplexity_len_2048": 261.23082841471256, "val/loss_avg_len_1024": 5.577634366972465, "val/perplexity_len_1024": 264.4452847558606, "val/loss_avg_len_512": 5.603337141299248, "val/perplexity_len_512": 271.33036603867896}
15
+ {"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 3365.815582766023, "val/train_update_time": 1680.2926608613343, "val/loss": 5.505503168662952, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.83285554096801, "val/val_tokens_per_second": 341809.42960169003, "val/loss_avg_len_2048": 5.505503168662952, "val/perplexity_len_2048": 246.04222527524095, "val/loss_avg_len_1024": 5.518010230046278, "val/perplexity_len_1024": 249.13881474127635, "val/loss_avg_len_512": 5.544472198890244, "val/perplexity_len_512": 255.81952077770256}
16
+ {"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 3598.114992738003, "val/train_update_time": 1792.082390839234, "val/loss": 5.451877971532068, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.85984571097651, "val/val_tokens_per_second": 341732.4605837447, "val/loss_avg_len_2048": 5.451877971532068, "val/perplexity_len_2048": 233.1956898172289, "val/loss_avg_len_1024": 5.464605613744981, "val/perplexity_len_1024": 236.182689532032, "val/loss_avg_len_512": 5.491216098095244, "val/perplexity_len_512": 242.55199459484575}
17
+ {"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 3829.9891319620074, "val/train_update_time": 1903.884795576334, "val/loss": 5.408561823876447, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.89821732096607, "val/val_tokens_per_second": 341623.09428129846, "val/loss_avg_len_2048": 5.408561823876447, "val/perplexity_len_2048": 223.31019723410046, "val/loss_avg_len_1024": 5.421516989876121, "val/perplexity_len_1024": 226.22203887867025, "val/loss_avg_len_512": 5.448322415597644, "val/perplexity_len_512": 232.368021780364}
18
+ {"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 4061.88242925104, "val/train_update_time": 2015.6589334552991, "val/loss": 5.360722617470438, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.4022120119771, "val/val_tokens_per_second": 340193.0854553193, "val/loss_avg_len_2048": 5.360722617470438, "val/perplexity_len_2048": 212.87872076556144, "val/loss_avg_len_1024": 5.374010867381096, "val/perplexity_len_1024": 215.7263847403555, "val/loss_avg_len_512": 5.401363052465115, "val/perplexity_len_512": 221.70841053588902}
19
+ {"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 4294.288298399013, "val/train_update_time": 2127.428580871492, "val/loss": 5.324311214937968, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.29681166395312, "val/val_tokens_per_second": 340491.1521214792, "val/loss_avg_len_2048": 5.324311214937968, "val/perplexity_len_2048": 205.26692696870975, "val/loss_avg_len_1024": 5.337608720328915, "val/perplexity_len_1024": 208.01469376952264, "val/loss_avg_len_512": 5.36512916255719, "val/perplexity_len_512": 213.81885028692912}
20
+ {"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 4526.608443362988, "val/train_update_time": 2239.197950529342, "val/loss": 5.286909812269924, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.71425316296518, "val/val_tokens_per_second": 339313.7009654004, "val/loss_avg_len_2048": 5.286909812269924, "val/perplexity_len_2048": 197.73145303159654, "val/loss_avg_len_1024": 5.3004279320558885, "val/perplexity_len_1024": 200.42255886379493, "val/loss_avg_len_512": 5.3283431476617, "val/perplexity_len_512": 206.0962201122877}
21
+ {"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 4759.794200062985, "val/train_update_time": 2350.9869147563586, "val/loss": 5.25467645336233, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.73026912601199, "val/val_tokens_per_second": 339268.6879315085, "val/loss_avg_len_2048": 5.25467645336233, "val/perplexity_len_2048": 191.45952974497322, "val/loss_avg_len_1024": 5.268592918162723, "val/perplexity_len_1024": 194.14259564559134, "val/loss_avg_len_512": 5.296714650505549, "val/perplexity_len_512": 199.67971352419883}
22
+ {"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 4992.560281209007, "val/train_update_time": 2462.802181679348, "val/loss": 5.223173501159495, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.62471787503455, "val/val_tokens_per_second": 342404.1513125129, "val/loss_avg_len_2048": 5.223173501159495, "val/perplexity_len_2048": 185.5220051503669, "val/loss_avg_len_1024": 5.237343601963297, "val/perplexity_len_1024": 188.16958459763552, "val/loss_avg_len_512": 5.265995957652177, "val/perplexity_len_512": 193.63906909356123}
23
+ {"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 5224.203345166985, "val/train_update_time": 2574.5941846003407, "val/loss": 5.195035318465339, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.14392835099716, "val/val_tokens_per_second": 343785.8778613723, "val/loss_avg_len_2048": 5.195035318465339, "val/perplexity_len_2048": 180.37451323550582, "val/loss_avg_len_1024": 5.209246023547824, "val/perplexity_len_1024": 182.95606161492964, "val/loss_avg_len_512": 5.237928895593109, "val/perplexity_len_512": 188.27975129361263}
24
+ {"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 5455.381416677032, "val/train_update_time": 2686.3787583513767, "val/loss": 5.172354661972123, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.80233659001533, "val/val_tokens_per_second": 341896.50357298396, "val/loss_avg_len_2048": 5.172354661972123, "val/perplexity_len_2048": 176.32954551923112, "val/loss_avg_len_1024": 5.186509269822098, "val/perplexity_len_1024": 178.84316879680094, "val/loss_avg_len_512": 5.215404340788373, "val/perplexity_len_512": 184.08623950862088}
25
+ {"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 5687.226690375013, "val/train_update_time": 2798.1550497164135, "val/loss": 5.147608817131834, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.42907125299098, "val/val_tokens_per_second": 342965.0718226966, "val/loss_avg_len_2048": 5.147608817131834, "val/perplexity_len_2048": 172.01966765914332, "val/loss_avg_len_1024": 5.162196564418997, "val/perplexity_len_1024": 174.547439521781, "val/loss_avg_len_512": 5.191514030500082, "val/perplexity_len_512": 179.74047959546354}
26
+ {"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 5919.139419794024, "val/train_update_time": 2909.933540413331, "val/loss": 5.129140133706461, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.83275818795664, "val/val_tokens_per_second": 341809.7072901768, "val/loss_avg_len_2048": 5.129140133706461, "val/perplexity_len_2048": 168.87184838724093, "val/loss_avg_len_1024": 5.143809148180205, "val/perplexity_len_1024": 171.3672900632081, "val/loss_avg_len_512": 5.173024764407822, "val/perplexity_len_512": 176.447743975283}
27
+ {"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 6151.005747379037, "val/train_update_time": 3021.723491590412, "val/loss": 5.1072978981880475, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.4659342149971, "val/val_tokens_per_second": 340013.1353889487, "val/loss_avg_len_2048": 5.1072978981880475, "val/perplexity_len_2048": 165.22330097222542, "val/loss_avg_len_1024": 5.122188884299453, "val/perplexity_len_1024": 167.70204854687952, "val/loss_avg_len_512": 5.15172206159496, "val/perplexity_len_512": 172.72868378337427}
28
+ {"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 6383.517213010986, "val/train_update_time": 3133.5136128164013, "val/loss": 5.090300521257392, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.48005827597808, "val/val_tokens_per_second": 339973.27513051854, "val/loss_avg_len_2048": 5.090300521257392, "val/perplexity_len_2048": 162.43867099271574, "val/loss_avg_len_1024": 5.105379784501816, "val/perplexity_len_1024": 164.90668764459724, "val/loss_avg_len_512": 5.135113706233085, "val/perplexity_len_512": 169.88363560477018}
29
+ {"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 6616.055601358006, "val/train_update_time": 3245.322524867661, "val/loss": 5.074609349017905, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.29630511597497, "val/val_tokens_per_second": 340492.58587378374, "val/loss_avg_len_2048": 5.074609349017905, "val/perplexity_len_2048": 159.90971089042583, "val/loss_avg_len_1024": 5.089726012247521, "val/perplexity_len_1024": 162.34537531484884, "val/loss_avg_len_512": 5.119521015206934, "val/perplexity_len_512": 167.2552377162275}
30
+ {"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 6848.3877059380175, "val/train_update_time": 3357.13471879164, "val/loss": 5.057983288034002, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.72075532499002, "val/val_tokens_per_second": 339295.4251299403, "val/loss_avg_len_2048": 5.057983288034002, "val/perplexity_len_2048": 157.27302189838034, "val/loss_avg_len_1024": 5.073276809682278, "val/perplexity_len_1024": 159.69676682020017, "val/loss_avg_len_512": 5.1034254083821775, "val/perplexity_len_512": 164.5847126848665}
31
+ {"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 7081.627107631008, "val/train_update_time": 3468.931253584684, "val/loss": 5.044277466451659, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.07575890101725, "val/val_tokens_per_second": 341117.97730768286, "val/loss_avg_len_2048": 5.044277466451659, "val/perplexity_len_2048": 155.13217049725282, "val/loss_avg_len_1024": 5.059701495861052, "val/perplexity_len_1024": 157.54348192231643, "val/loss_avg_len_512": 5.089907858864603, "val/perplexity_len_512": 162.37489995654545}
32
+ {"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 7313.765329002985, "val/train_update_time": 3580.7367782525835, "val/loss": 5.0319846531080845, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.13104758301051, "val/val_tokens_per_second": 340960.9823946358, "val/loss_avg_len_2048": 5.0319846531080845, "val/perplexity_len_2048": 153.23683306388372, "val/loss_avg_len_1024": 5.047668003283535, "val/perplexity_len_1024": 155.6590445232944, "val/loss_avg_len_512": 5.078129063933541, "val/perplexity_len_512": 160.4735391603105}
33
+ {"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 7545.990838972037, "val/train_update_time": 3692.5671064984635, "val/loss": 5.020685842652529, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.5224018379813, "val/val_tokens_per_second": 339853.8311164979, "val/loss_avg_len_2048": 5.020685842652529, "val/perplexity_len_2048": 151.5151837428477, "val/loss_avg_len_1024": 5.03638942723212, "val/perplexity_len_1024": 153.91329543926756, "val/loss_avg_len_512": 5.066985254941543, "val/perplexity_len_512": 158.69517994303604}
34
+ {"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 7778.552717441984, "val/train_update_time": 3804.3707672305172, "val/loss": 5.009975375909938, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.43973159999587, "val/val_tokens_per_second": 340087.10793242423, "val/loss_avg_len_2048": 5.009975375909938, "val/perplexity_len_2048": 149.90104492676969, "val/loss_avg_len_1024": 5.025792396103987, "val/perplexity_len_1024": 152.29088301658857, "val/loss_avg_len_512": 5.056605560239509, "val/perplexity_len_512": 157.0564916788882}
35
+ {"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 8010.996608729009, "val/train_update_time": 3916.130362140422, "val/loss": 5.000331946871419, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.7698862789548, "val/val_tokens_per_second": 339157.3947945137, "val/loss_avg_len_2048": 5.000331946871419, "val/perplexity_len_2048": 148.4624325640513, "val/loss_avg_len_1024": 5.016125007033599, "val/perplexity_len_1024": 150.82572135347417, "val/loss_avg_len_512": 5.046963756926056, "val/perplexity_len_512": 155.5494607996784}
36
+ {"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 8244.260649731033, "val/train_update_time": 4027.906490651425, "val/loss": 4.992123881059772, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 121.04618034599116, "val/val_tokens_per_second": 338383.2507801765, "val/loss_avg_len_2048": 4.992123881059772, "val/perplexity_len_2048": 147.2488306229025, "val/loss_avg_len_1024": 5.008033923187299, "val/perplexity_len_1024": 149.61030145864262, "val/loss_avg_len_512": 5.038996019480436, "val/perplexity_len_512": 154.31500796500313}
37
+ {"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 8477.364581940987, "val/train_update_time": 4139.6967294025235, "val/loss": 4.9849577407377925, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.92127310601063, "val/val_tokens_per_second": 338732.78826704645, "val/loss_avg_len_2048": 4.9849577407377925, "val/perplexity_len_2048": 146.1973967014911, "val/loss_avg_len_1024": 5.000919279863674, "val/perplexity_len_1024": 148.54965506062427, "val/loss_avg_len_512": 5.031968218916911, "val/perplexity_len_512": 153.23431476116755}
38
+ {"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 8710.28124093404, "val/train_update_time": 4251.451832082472, "val/loss": 4.978439289365397, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.10382062901044, "val/val_tokens_per_second": 341038.27659672574, "val/loss_avg_len_2048": 4.978439289365397, "val/perplexity_len_2048": 145.24751532149992, "val/loss_avg_len_1024": 4.994448879486923, "val/perplexity_len_1024": 147.5915822165599, "val/loss_avg_len_512": 5.025531688477309, "val/perplexity_len_512": 152.25118479695178}
39
+ {"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 8942.332405220019, "val/train_update_time": 4363.189962119563, "val/loss": 4.972828367076373, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.59670256101526, "val/val_tokens_per_second": 339644.44408649154, "val/loss_avg_len_2048": 4.972828367076373, "val/perplexity_len_2048": 144.43482490386887, "val/loss_avg_len_1024": 4.988867791219574, "val/perplexity_len_1024": 146.7701549299752, "val/loss_avg_len_512": 5.020086908638769, "val/perplexity_len_512": 151.4244633161895}
40
+ {"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 9174.941196507018, "val/train_update_time": 4474.963137161569, "val/loss": 4.968042541342369, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.18285846401704, "val/val_tokens_per_second": 340813.99397122423, "val/loss_avg_len_2048": 4.968042541342369, "val/perplexity_len_2048": 143.74523644325927, "val/loss_avg_len_1024": 4.984183913346676, "val/perplexity_len_1024": 146.08430891227124, "val/loss_avg_len_512": 5.015469340964639, "val/perplexity_len_512": 150.7268624584094}
41
+ {"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 9407.59549859399, "val/train_update_time": 4586.732141316519, "val/loss": 4.963817747580077, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.84763080696575, "val/val_tokens_per_second": 341767.29005158885, "val/loss_avg_len_2048": 4.963817747580077, "val/perplexity_len_2048": 143.1392235062003, "val/loss_avg_len_1024": 4.98003164209972, "val/perplexity_len_1024": 145.4789848413279, "val/loss_avg_len_512": 5.01136137723279, "val/perplexity_len_512": 150.10895201980566}
42
+ {"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 9639.468442777987, "val/train_update_time": 4698.514196849486, "val/loss": 4.960097752921781, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.45295924000675, "val/val_tokens_per_second": 342896.4862871461, "val/loss_avg_len_2048": 4.960097752921781, "val/perplexity_len_2048": 142.6077355384751, "val/loss_avg_len_1024": 4.976341741883511, "val/perplexity_len_1024": 144.94317106134073, "val/loss_avg_len_512": 5.007765855770442, "val/perplexity_len_512": 149.57020118662368}
43
+ {"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 9870.947561467998, "val/train_update_time": 4810.2732713416335, "val/loss": 4.957366871821497, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.02587283303728, "val/val_tokens_per_second": 341259.75536106, "val/loss_avg_len_2048": 4.957366871821497, "val/perplexity_len_2048": 142.21882204868862, "val/loss_avg_len_1024": 4.973634772009083, "val/perplexity_len_1024": 144.5513448339354, "val/loss_avg_len_512": 5.005063210593537, "val/perplexity_len_512": 149.16651176413532}
44
+ {"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 10103.00234679901, "val/train_update_time": 4922.056656240777, "val/loss": 4.955103045992324, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.5372970669996, "val/val_tokens_per_second": 342654.560584905, "val/loss_avg_len_2048": 4.955103045992324, "val/perplexity_len_2048": 141.89722756024008, "val/loss_avg_len_1024": 4.971366092304205, "val/perplexity_len_1024": 144.22377584665176, "val/loss_avg_len_512": 5.0028089391355755, "val/perplexity_len_512": 148.8306286822433}
45
+ {"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 10334.590985039016, "val/train_update_time": 5033.849530185806, "val/loss": 4.953492992541195, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.82422458700603, "val/val_tokens_per_second": 341834.05017787847, "val/loss_avg_len_2048": 4.953492992541195, "val/perplexity_len_2048": 141.66894925874792, "val/loss_avg_len_1024": 4.969803081032343, "val/perplexity_len_1024": 143.99852853723146, "val/loss_avg_len_512": 5.00130308859892, "val/perplexity_len_512": 148.60668065861563}
46
+ {"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 10566.927748315036, "val/train_update_time": 5145.645859024662, "val/loss": 4.952207647149178, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.30899883102393, "val/val_tokens_per_second": 343310.23142697907, "val/loss_avg_len_2048": 4.952207647149178, "val/perplexity_len_2048": 141.4869727040432, "val/loss_avg_len_1024": 4.968534963625809, "val/perplexity_len_1024": 143.81603723133458, "val/loss_avg_len_512": 5.000053355490358, "val/perplexity_len_512": 148.42107797071182}
47
+ {"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 10798.264622143994, "val/train_update_time": 5257.438326367526, "val/loss": 4.951416557332166, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.87131762300851, "val/val_tokens_per_second": 341699.7561403129, "val/loss_avg_len_2048": 4.951416557332166, "val/perplexity_len_2048": 141.3750880619327, "val/loss_avg_len_1024": 4.967739155666985, "val/perplexity_len_1024": 143.7016328123107, "val/loss_avg_len_512": 4.999264901770174, "val/perplexity_len_512": 148.30410094117912}
48
+ {"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 11030.159157333022, "val/train_update_time": 5369.213506219559, "val/loss": 4.950958369477766, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.70726062700851, "val/val_tokens_per_second": 342168.0505046872, "val/loss_avg_len_2048": 4.950958369477766, "val/perplexity_len_2048": 141.3103265512697, "val/loss_avg_len_1024": 4.967282502107741, "val/perplexity_len_1024": 143.6360259311658, "val/loss_avg_len_512": 4.998807998296409, "val/perplexity_len_512": 148.23635575996838}
49
+ {"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 11261.909377036034, "val/train_update_time": 5481.002296196413, "val/loss": 4.950755261772388, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.94873680203455, "val/val_tokens_per_second": 341479.210969942, "val/loss_avg_len_2048": 4.950755261772388, "val/perplexity_len_2048": 141.28162824961944, "val/loss_avg_len_1024": 4.967067979426496, "val/perplexity_len_1024": 143.60521605058682, "val/loss_avg_len_512": 4.998585561082139, "val/perplexity_len_512": 148.20338614491018}
metrics/npz/train_eval/step-000000104857600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:844d2f397f2e90042ce9a56ae3b082d27ba5526d9b59ee11aeb9391e9a9e557c
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46d5eeaf619fff91f6769b36b947bb6151be71389b99d3965e5fe8d305cfc80b
3
  size 20540
metrics/npz/train_eval/step-000000209715200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d5c1df2e5ee8970ecffecaaf04b8b6b5726d2d851aef320b7759920525395c5
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bcdafb18d49f2053eb816fee2f41057351c153f6f62ebab5f0f3a1254268e41
3
  size 20540
metrics/npz/train_eval/step-000000314572800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbfad28bc5b99f2d7ebe71b0008a3aca73bcbae080b6631eac72aca6f400cda7
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:502b14d4469fdbc008fd51c6f231291e0db14eea1bf21b86d7668ac994cb2817
3
  size 20540
metrics/npz/train_eval/step-000000419430400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:833ead51c44a67bec82dcbed825eb3ea96e59829650dde8664960b14439c67d9
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f828ddbf4a9e273f5a0db9f7474f8a1186b20a34caf64667b1a93a3b351056fa
3
  size 20540
metrics/npz/train_eval/step-000000524288000.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5e5d315c0ffe7b125be2c7bcc8f11328f8a64e9b67971cf4d5d45daa446edfa5
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:792cc1830dd5ddadf8d7a660f31057469491412804fdec8574e15f508939bcca
3
  size 20540
metrics/npz/train_eval/step-000000629145600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fdbb76286ee3a4a9f67fb11efd73493026655e7d5e500cab2447955447e6f5a2
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:654395b896450d4a2d4c113bdd12d8e5a699ba194a000090d198f0f001e4b09d
3
  size 20540
metrics/npz/train_eval/step-000000734003200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:530fa73ef9b87bbc25680c482d3e7f5b1f60a7ecfbc0563486fe893e933356d4
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32907bf38b7fe93b2344baf03c17af64b78055744f7eebaf8bf3b0a70ca43d77
3
  size 20540
metrics/npz/train_eval/step-000000838860800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:75c7e9323d8a2df8d53e0cbf25babf3890f63e75a9c9c54a3b05a4bedc9ef256
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ff22b7cfe3b44873d2335db04e807858d7611e560c1ff8e94ef9900106c4e3d
3
  size 20540
metrics/npz/train_eval/step-000000943718400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:05bedb07e89bd6206336843f688617ed8f16ee0c56dd4beee0e1fdd3380eafa5
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d021f561209e579353a2c6d14f1a606e34e108583cdf3e1b68be7ce42013ebd
3
  size 20540
metrics/npz/train_eval/step-000001048576000.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e62719076437db17461984ebee33dba48ae7242486f5d3cce55e0baa638b89e
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55190da526839675bbce5fa18ed9efbbebe5a1609c5a4c3e7f1b444981a9d93f
3
  size 20540
metrics/npz/train_eval/step-000001153433600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0721df25a318512ca3a419888a41e67abe4bc092d426127257b02239c1a8d45c
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0d567b1c640f784b7b006b46ea9d4dbddc410cc0285762ebff006d5eb5c780c
3
  size 20540
metrics/npz/train_eval/step-000001258291200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cf02d5004e970106742c74bc41c8dbeb5ca028025e99d34cd1049cb85cf48dbf
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:960ac0d80bbfa4c9a71b46e1ae1549317fe62a6dd66f542043386b7df251b9c0
3
  size 20540
metrics/npz/train_eval/step-000001363148800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7035653180417693fc5ceaec527b8257ce4b13e8cccf9ce563b6645b65e214d5
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9486583dd0d5a62ab506e30cdf5110f952832d8dfeb0db666298e1d2079eb163
3
  size 20540
metrics/npz/train_eval/step-000001468006400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d90a81d7d1b1b4e6a0daa5a605e62acca4ff82867df2fc67900a4bd74831caa
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a422f4d160c9929f6e68890b9d4b80561160c9df6d2e2238434fcad6c32dcdd
3
  size 20540
metrics/npz/train_eval/step-000001572864000.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c182e1aad81cf4192d1267d3be4e5bf99282e514119651fa12d838437e02e942
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7720a0b84777d788204d20cefd26ba14427b8df1243e10d67e2dbfe56e797470
3
  size 20540
metrics/npz/train_eval/step-000001677721600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b509da8b751a74518306b5f681837a87effa21b18c2d5565b6e8d10814ccc4cb
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fff9dfa360488578ac18416a711f4db207adf4d48dfcf535f477aca3d8611d5
3
  size 20540
metrics/npz/train_eval/step-000001782579200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:913c6eb246cc628e77e941f66aac49bc1fa800724fe452a362ca03dfa384da71
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57380069d4cd91076bdeca446fed3d4f796745db9fafaed9258def379abb69af
3
  size 20540
metrics/npz/train_eval/step-000001887436800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:08e8fda6b7729b1c0b886a1e0bf34f406e4d0757d9e3dc74c795e180e64db165
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c34352c17bd384bc40993e43e8d44a9ddfd209b4bd167880f05fb0bb709ebf88
3
  size 20540
metrics/npz/train_eval/step-000001992294400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:60070d12b787f09b09e739f442a4077697e7e4c83693cdfb784949c88fcd4d00
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2949c354c795ba5a99983e2acad7fb79efdc3dcf50ff5783af2faa60a1cdcb04
3
  size 20540
metrics/npz/val/step-000000041943040.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dfebeac487303b67c2ed9dbc29a46790f4044c81f1b0fa9e756109899f4a41b4
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c33eb8d1dc48547f714dba1ec5b673736a84a48004cb857195490f33b40c9abf
3
  size 21142
metrics/npz/val/step-000000083886080.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ad428600afc0c2a19aaa28194fa8d87070c35865dbf0b59f3cfa43e1a3b73677
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0ce5f774bb97ff2d2e4856c9ebdb054d9bdf8b1ad052f46999d2b501a16dc33
3
  size 21142
metrics/npz/val/step-000000125829120.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a68d7101646328ca677fec3f43b160abb2d31bd5572cfb40e4d49d17f56abc5c
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:308d5fda9f42be72c08eb8fccb6954800855f5374a72b528cbfc348248f80507
3
  size 21142
metrics/npz/val/step-000000167772160.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:50353e56332d895a2c56996d78b31be819d1574f10e26e2d687b1f2d58dd86cd
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93fe6335aa4aef7c61d06213c7a2a60ecef7dc728def4a395836f6932fefe16e
3
  size 21142
metrics/npz/val/step-000000209715200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:59170a9f40bdee1c0d9908201065cb821e4a287d73847766e6015036a9787254
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:857e15df6aea28826696d304204e473913a5c7061f325544325008f0375f654f
3
  size 21142
metrics/npz/val/step-000000251658240.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f97c094076c407b8c0b27cc2388ca1a040db868c92e288e4b8ad6874888aef75
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f68781f2420a289ce64f25cf1c4582230b5f340ac015164a674759d456a2d07b
3
  size 21142
metrics/npz/val/step-000000293601280.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:784729bbf65e39bc30fec2da97ec718f73836b68fff543495b4f59741586490c
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38c4752df0469938251e6fb8cb69bcac6fad05876fae1e28d61e2a674e563220
3
  size 21142
metrics/npz/val/step-000000335544320.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d30329151bb3a4a09e74433abc3a882426c84b45b6e5c44331d059747cf6c4a0
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56033fd9457d915002c9e53bdf90c7dea4f2227d0800a42883eee1c5d0846c5a
3
  size 21142
metrics/npz/val/step-000000377487360.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:08c91a966b5e95af1fe60dfe2cbafed9980a61331204cb595dd1cb56822e4c49
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d98a0d33f36a94ede2c69a2a5443fa54323dc2a5a5ff513c7dea712c868e4570
3
  size 21142
metrics/npz/val/step-000000419430400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5e055c460effca7b29ca2ec367722ba86165922d36c274881db9bc53eaa2237c
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14b0f2f70cc3c2f96dcb3cdaa74e1e8bdfa9978c03f30cf9695214e2fdc560c0
3
  size 21142
metrics/npz/val/step-000000461373440.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:47c9208d1632caafe1e446cffbc7b3f811ba09d27fedee5c0cf74c0102755374
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:699afee4a320a16acf289e945afd023859070c7978788c12f5c4d9f39752b31c
3
  size 21142
metrics/npz/val/step-000000503316480.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a7e5f3774da4645dfd512ee214b767f86d135270ce006398acdd6e7f2ee294b
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf802899cd7020d3e12ed513a722405d30325561ba4fe7df8447a5637df6a37a
3
  size 21142