Lanni-ni commited on
Commit
a4c9066
·
verified ·
1 Parent(s): be7da77

add remote code + model files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. checkpoints/step-000000209715200.pt +2 -2
  2. checkpoints/step-000000419430400.pt +2 -2
  3. checkpoints/step-000000629145600.pt +2 -2
  4. checkpoints/step-000000838860800.pt +2 -2
  5. checkpoints/step-000001048576000.pt +2 -2
  6. checkpoints/step-000001258291200.pt +2 -2
  7. checkpoints/step-000001468006400.pt +2 -2
  8. checkpoints/step-000001677721600.pt +2 -2
  9. checkpoints/step-000001887436800.pt +2 -2
  10. logs/2025-10-28_04-46-01.log +258 -0
  11. metrics/jsonlines/checkpoint.jsonl +9 -9
  12. metrics/jsonlines/norm.jsonl +0 -0
  13. metrics/jsonlines/throughput.jsonl +0 -0
  14. metrics/jsonlines/train.jsonl +98 -98
  15. metrics/jsonlines/train_eval.jsonl +19 -19
  16. metrics/jsonlines/val.jsonl +49 -49
  17. metrics/npz/train_eval/step-000000104857600.npz +1 -1
  18. metrics/npz/train_eval/step-000000209715200.npz +1 -1
  19. metrics/npz/train_eval/step-000000314572800.npz +1 -1
  20. metrics/npz/train_eval/step-000000419430400.npz +1 -1
  21. metrics/npz/train_eval/step-000000524288000.npz +1 -1
  22. metrics/npz/train_eval/step-000000629145600.npz +1 -1
  23. metrics/npz/train_eval/step-000000734003200.npz +1 -1
  24. metrics/npz/train_eval/step-000000838860800.npz +1 -1
  25. metrics/npz/train_eval/step-000000943718400.npz +1 -1
  26. metrics/npz/train_eval/step-000001048576000.npz +1 -1
  27. metrics/npz/train_eval/step-000001153433600.npz +1 -1
  28. metrics/npz/train_eval/step-000001258291200.npz +1 -1
  29. metrics/npz/train_eval/step-000001363148800.npz +1 -1
  30. metrics/npz/train_eval/step-000001468006400.npz +1 -1
  31. metrics/npz/train_eval/step-000001572864000.npz +1 -1
  32. metrics/npz/train_eval/step-000001677721600.npz +1 -1
  33. metrics/npz/train_eval/step-000001782579200.npz +1 -1
  34. metrics/npz/train_eval/step-000001887436800.npz +1 -1
  35. metrics/npz/train_eval/step-000001992294400.npz +1 -1
  36. metrics/npz/val/step-000000041943040.npz +1 -1
  37. metrics/npz/val/step-000000083886080.npz +1 -1
  38. metrics/npz/val/step-000000125829120.npz +1 -1
  39. metrics/npz/val/step-000000167772160.npz +1 -1
  40. metrics/npz/val/step-000000209715200.npz +1 -1
  41. metrics/npz/val/step-000000251658240.npz +1 -1
  42. metrics/npz/val/step-000000293601280.npz +1 -1
  43. metrics/npz/val/step-000000335544320.npz +1 -1
  44. metrics/npz/val/step-000000377487360.npz +1 -1
  45. metrics/npz/val/step-000000419430400.npz +1 -1
  46. metrics/npz/val/step-000000461373440.npz +1 -1
  47. metrics/npz/val/step-000000503316480.npz +1 -1
  48. metrics/npz/val/step-000000545259520.npz +1 -1
  49. metrics/npz/val/step-000000587202560.npz +1 -1
  50. metrics/npz/val/step-000000629145600.npz +1 -1
checkpoints/step-000000209715200.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c34944a1550c6f5600f84cc7882dca2919d13ce4c4e9fae926e63dfb344f3c70
3
- size 339650826
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac7cd25dae4d940b58ffaafa680de2ae0c45fff95f04f5b6bc6b4476a7a2034a
3
+ size 339650634
checkpoints/step-000000419430400.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd341d9a4c51929b84dc8a73a22b61206c2d03a5214fa4ee13193ca5925b0e25
3
- size 339650826
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec37c0723da099092a621ff3ba55aee2e0188811a9c519bdfccad886a91e2a28
3
+ size 339650634
checkpoints/step-000000629145600.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8722648f1c59174f8099ce91704394fc8cd1823334d3d796f5c9819609f797ac
3
- size 339650826
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebefceb30583dd247c6dde4b265be6dc08351a8e715ded1816b0bfb461c90b81
3
+ size 339650634
checkpoints/step-000000838860800.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ecf7abc065199334f11b6c1bf1f08612efe39dad0be4bcd78568c180f3a1dd0c
3
- size 339650826
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd061a114516812d0e8647aec34a4d751592228aede425a7ef1bd4448e3b3988
3
+ size 339650634
checkpoints/step-000001048576000.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:df6fdc4b678b92f48e22e1d1b9f5927eed7fa23928a20d7443734f87874c425f
3
- size 339650826
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed20ede365ff3caab637f683462cb33d745b91f8e5805463520dd0d4fe343a57
3
+ size 339650634
checkpoints/step-000001258291200.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:194ca13ea0e7718dc7526b0b6cadc94d274756747a7dc69c64e09c7cbbda7591
3
- size 339650826
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59e0dafe1c8d7ec1fc8e05118a3dc7eb9c3bf7e740f72ef2ea38923e36287366
3
+ size 339650634
checkpoints/step-000001468006400.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d21e896616380f8e2813f69492bd9555e73c0b071bc701ed7cfe191c3ac8a2ea
3
- size 339650826
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c33a92e6f8f02bddbab0c570e12617e75c846b54170e6f743ffeeb2fe8e559aa
3
+ size 339650634
checkpoints/step-000001677721600.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ece9f09ed797f107b247e982651935265c541ad1a99fa2fa9688b6f0b311020f
3
- size 339650826
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:479347aabec04b87dc513c6cdb01c20472deb8549e1ce8fe69ade13b18d9cc67
3
+ size 339650634
checkpoints/step-000001887436800.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:901dc8652d0257fdd73aa03f429d6c0e89b8f942bbd65181146e6d3acd3d574d
3
- size 339650826
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e33ee7a09b18f77b4d3220a55cc5b0c9db4d86fb4c43b1ccdecd688ae2c8184f
3
+ size 339650634
logs/2025-10-28_04-46-01.log ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2025-10-28 04:46:01][train:372][INFO] All outputs will be saved to `/workspace/forgetting-transformer/alibi_3_4_256`
2
+ [2025-10-28 04:46:01][train:375][INFO] Configuration:
3
+ [2025-10-28 04:46:01][train:380][INFO] Configuration saved to /workspace/forgetting-transformer/alibi_3_4_256/config.yaml.
4
+ [2025-10-28 04:46:01][train:387][INFO] creating datamodule
5
+ [2025-10-28 04:46:01][train:419][INFO] creating model
6
+ [2025-10-28 04:46:02][train:440][INFO] creating optimizer
7
+ [2025-10-28 04:46:02][checkpoint:39][INFO] Not resuming. Deleting existing checkpoints...
8
+ [2025-10-28 04:46:02][logger:256][INFO] Setting up wandb logger...
9
+ [2025-10-28 04:46:02][logger:272][INFO] Not resuming. Creating a new wandb run.
10
+ [2025-10-28 04:46:03][logger:288][INFO] wandb initialized. Run id: 0jt2hdhs
11
+ [2025-10-28 04:46:03][logger:186][INFO] Setting up jsonlines logger...
12
+ [2025-10-28 04:46:03][logger:113][INFO] Setting up npz logger...
13
+ [2025-10-28 04:46:03][logger:171][INFO] [step: 0] [train_data_info/vocab_size: 50277] [train_data_info/global_tokens_per_batch: 2097152] [train_data_info/local_tokens_per_batch: 2097152] [train_data_info/batch_len: 2048] [train_data_info/seq_len: 2048] [train_data_info/total_tokens: 2055208960] [train_data_info/global_batch_size: 1024] [train_data_info/local_batch_size: 1024]
14
+ [2025-10-28 04:46:03][logger:171][INFO] [step: 0] [val_data_info/vocab_size: 50277] [val_data_info/global_tokens_per_batch: 2048] [val_data_info/local_tokens_per_batch: 2048] [val_data_info/batch_len: 2048] [val_data_info/seq_len: 2048] [val_data_info/total_tokens: 2147483648] [val_data_info/global_batch_size: 1] [val_data_info/local_batch_size: 1]
15
+ [2025-10-28 04:46:03][logger:171][INFO] [step: 0] [model_info/total_params: 28299520] [model_info/trainable_params: 28299520] [model_info/embedding_params: 12870912] [model_info/flops_per_token: 0] [model_info/non_embedding_params: 15428608]
16
+ [2025-10-28 04:49:38][utils:57][INFO] [P: 1.00%] [S: 20971520/2097152000] [T: 0:03:32] [ETA: 5:50:43] [loss: 9.776] [tokens/s: 100709.527] [batches/s: 0.048] [MFU: 0.000] [TFLOPS: 0.000]
17
+ [2025-10-28 04:52:17][utils:57][INFO] [P: 2.00%] [S: 41943040/2097152000] [T: 0:06:11] [ETA: 5:03:11] [loss: 8.175] [tokens/s: 115124.074] [batches/s: 0.055] [MFU: 0.000] [TFLOPS: 0.000]
18
+ [2025-10-28 04:52:17][train:194][INFO] Running validation...
19
+ [2025-10-28 04:56:45][logger:171][INFO] [step: 41943040] [val/train_token_count: 41943040] [val/train_batch_count: 20] [val/train_flop_count: 0] [val/train_total_time: 371.256] [val/train_update_time: 370.747] [val/loss: 8.075] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 268.567] [val/val_tokens_per_second: 152513.300] [val/loss_avg_len_2048: 8.075] [val/perplexity_len_2048: 3211.761] [val/loss_avg_len_1024: 8.077] [val/perplexity_len_1024: 3220.412] [val/loss_avg_len_512: 8.078] [val/perplexity_len_512: 3222.268]
20
+ [2025-10-28 05:00:04][utils:57][INFO] [P: 3.00%] [S: 62914560/2097152000] [T: 0:13:58] [ETA: 7:31:48] [loss: 7.702] [tokens/s: 74781.993] [batches/s: 0.036] [MFU: 0.000] [TFLOPS: 0.000]
21
+ [2025-10-28 05:02:36][utils:57][INFO] [P: 4.00%] [S: 83886080/2097152000] [T: 0:16:29] [ETA: 6:35:57] [loss: 7.456] [tokens/s: 84778.614] [batches/s: 0.040] [MFU: 0.000] [TFLOPS: 0.000]
22
+ [2025-10-28 05:02:36][train:194][INFO] Running validation...
23
+ [2025-10-28 05:07:21][logger:171][INFO] [step: 83886080] [val/train_token_count: 83886080] [val/train_batch_count: 40] [val/train_flop_count: 0] [val/train_total_time: 989.879] [val/train_update_time: 720.451] [val/loss: 7.447] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 285.629] [val/val_tokens_per_second: 143403.004] [val/loss_avg_len_2048: 7.447] [val/perplexity_len_2048: 1714.777] [val/loss_avg_len_1024: 7.451] [val/perplexity_len_1024: 1721.292] [val/loss_avg_len_512: 7.453] [val/perplexity_len_512: 1725.579]
24
+ [2025-10-28 05:10:20][utils:57][INFO] [P: 5.00%] [S: 104857600/2097152000] [T: 0:24:14] [ETA: 7:40:35] [loss: 7.305] [tokens/s: 71892.276] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000]
25
+ [2025-10-28 05:10:20][logger:171][INFO] [step: 104857600] [train_eval/train_token_count: 104857600] [train_eval/train_batch_count: 50] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1454.511] [train_eval/train_update_time: 899.264] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 8.403] [train_eval/perplexity_len_2048: 4459.633] [train_eval/loss_avg_len_1024: 8.407] [train_eval/perplexity_len_1024: 4479.883] [train_eval/loss_avg_len_512: 8.407] [train_eval/perplexity_len_512: 4480.109]
26
+ [2025-10-28 05:12:52][utils:57][INFO] [P: 6.00%] [S: 125829120/2097152000] [T: 0:26:46] [ETA: 6:59:23] [loss: 7.140] [tokens/s: 78261.419] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000]
27
+ [2025-10-28 05:12:52][train:194][INFO] Running validation...
28
+ [2025-10-28 05:17:57][logger:171][INFO] [step: 125829120] [val/train_token_count: 125829120] [val/train_batch_count: 60] [val/train_flop_count: 0] [val/train_total_time: 1606.152] [val/train_update_time: 1050.734] [val/loss: 7.138] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 304.819] [val/val_tokens_per_second: 134374.611] [val/loss_avg_len_2048: 7.138] [val/perplexity_len_2048: 1259.536] [val/loss_avg_len_1024: 7.144] [val/perplexity_len_1024: 1265.973] [val/loss_avg_len_512: 7.149] [val/perplexity_len_512: 1272.675]
29
+ [2025-10-28 05:20:32][utils:57][INFO] [P: 7.00%] [S: 146800640/2097152000] [T: 0:34:26] [ETA: 7:37:39] [loss: 7.048] [tokens/s: 70873.110] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000]
30
+ [2025-10-28 05:23:04][utils:57][INFO] [P: 8.00%] [S: 167772160/2097152000] [T: 0:36:58] [ETA: 7:05:12] [loss: 6.937] [tokens/s: 75536.064] [batches/s: 0.036] [MFU: 0.000] [TFLOPS: 0.000]
31
+ [2025-10-28 05:23:04][train:194][INFO] Running validation...
32
+ [2025-10-28 05:25:07][logger:171][INFO] [step: 167772160] [val/train_token_count: 167772160] [val/train_batch_count: 80] [val/train_flop_count: 0] [val/train_total_time: 2218.467] [val/train_update_time: 1357.901] [val/loss: 6.919] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 122.971] [val/val_tokens_per_second: 333085.335] [val/loss_avg_len_2048: 6.919] [val/perplexity_len_2048: 1010.819] [val/loss_avg_len_1024: 6.925] [val/perplexity_len_1024: 1017.180] [val/loss_avg_len_512: 6.932] [val/perplexity_len_512: 1024.379]
33
+ [2025-10-28 05:26:17][utils:57][INFO] [P: 9.00%] [S: 188743680/2097152000] [T: 0:40:11] [ETA: 6:46:23] [loss: 6.817] [tokens/s: 78213.305] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000]
34
+ [2025-10-28 05:27:27][utils:57][INFO] [P: 10.00%] [S: 209715200/2097152000] [T: 0:41:21] [ETA: 6:12:14] [loss: 6.740] [tokens/s: 84519.766] [batches/s: 0.040] [MFU: 0.000] [TFLOPS: 0.000]
35
+ [2025-10-28 05:27:27][logger:171][INFO] [step: 209715200] [train_eval/train_token_count: 209715200] [train_eval/train_batch_count: 100] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2481.588] [train_eval/train_update_time: 1497.782] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.995] [train_eval/perplexity_len_2048: 1091.206] [train_eval/loss_avg_len_1024: 7.002] [train_eval/perplexity_len_1024: 1098.950] [train_eval/loss_avg_len_512: 7.007] [train_eval/perplexity_len_512: 1104.537]
36
+ [2025-10-28 05:27:27][train:194][INFO] Running validation...
37
+ [2025-10-28 05:29:26][logger:171][INFO] [step: 209715200] [val/train_token_count: 209715200] [val/train_batch_count: 100] [val/train_flop_count: 0] [val/train_total_time: 2481.588] [val/train_update_time: 1497.782] [val/loss: 6.731] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.639] [val/val_tokens_per_second: 345248.821] [val/loss_avg_len_2048: 6.731] [val/perplexity_len_2048: 838.186] [val/loss_avg_len_1024: 6.739] [val/perplexity_len_1024: 844.594] [val/loss_avg_len_512: 6.748] [val/perplexity_len_512: 852.353]
38
+ [2025-10-28 05:29:26][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000209715200.pt...
39
+ [2025-10-28 05:29:26][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000209715200.pt.
40
+ [2025-10-28 05:29:26][logger:171][INFO] [step: 209715200] [checkpoint/checkpoint_time: 0.458]
41
+ [2025-10-28 05:30:36][utils:57][INFO] [P: 11.00%] [S: 230686720/2097152000] [T: 0:44:30] [ETA: 6:00:08] [loss: 6.661] [tokens/s: 85187.454] [batches/s: 0.041] [MFU: 0.000] [TFLOPS: 0.000]
42
+ [2025-10-28 05:31:46][utils:57][INFO] [P: 12.00%] [S: 251658240/2097152000] [T: 0:45:40] [ETA: 5:34:59] [loss: 6.557] [tokens/s: 99809.612] [batches/s: 0.048] [MFU: 0.000] [TFLOPS: 0.000]
43
+ [2025-10-28 05:31:46][train:194][INFO] Running validation...
44
+ [2025-10-28 05:33:45][logger:171][INFO] [step: 251658240] [val/train_token_count: 251658240] [val/train_batch_count: 120] [val/train_flop_count: 0] [val/train_total_time: 2740.807] [val/train_update_time: 1637.629] [val/loss: 6.574] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.797] [val/val_tokens_per_second: 344789.690] [val/loss_avg_len_2048: 6.574] [val/perplexity_len_2048: 716.130] [val/loss_avg_len_1024: 6.583] [val/perplexity_len_1024: 722.547] [val/loss_avg_len_512: 6.594] [val/perplexity_len_512: 730.627]
45
+ [2025-10-28 05:34:55][utils:57][INFO] [P: 13.00%] [S: 272629760/2097152000] [T: 0:48:49] [ETA: 5:26:46] [loss: 6.521] [tokens/s: 100003.213] [batches/s: 0.048] [MFU: 0.000] [TFLOPS: 0.000]
46
+ [2025-10-28 05:36:05][utils:57][INFO] [P: 14.00%] [S: 293601280/2097152000] [T: 0:49:59] [ETA: 5:07:06] [loss: 6.479] [tokens/s: 121898.441] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
47
+ [2025-10-28 05:36:05][train:194][INFO] Running validation...
48
+ [2025-10-28 05:38:04][logger:171][INFO] [step: 293601280] [val/train_token_count: 293601280] [val/train_batch_count: 140] [val/train_flop_count: 0] [val/train_total_time: 2999.716] [val/train_update_time: 1777.493] [val/loss: 6.450] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.528] [val/val_tokens_per_second: 345572.819] [val/loss_avg_len_2048: 6.450] [val/perplexity_len_2048: 632.679] [val/loss_avg_len_1024: 6.460] [val/perplexity_len_1024: 638.988] [val/loss_avg_len_512: 6.473] [val/perplexity_len_512: 647.175]
49
+ [2025-10-28 05:39:14][utils:57][INFO] [P: 15.00%] [S: 314572800/2097152000] [T: 0:53:08] [ETA: 5:01:06] [loss: 6.371] [tokens/s: 120805.809] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
50
+ [2025-10-28 05:39:14][logger:171][INFO] [step: 314572800] [train_eval/train_token_count: 314572800] [train_eval/train_batch_count: 150] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3188.291] [train_eval/train_update_time: 1847.413] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.550] [train_eval/perplexity_len_2048: 699.420] [train_eval/loss_avg_len_1024: 6.559] [train_eval/perplexity_len_1024: 705.273] [train_eval/loss_avg_len_512: 6.569] [train_eval/perplexity_len_512: 712.794]
51
+ [2025-10-28 05:40:24][utils:57][INFO] [P: 16.00%] [S: 335544320/2097152000] [T: 0:54:18] [ETA: 4:45:06] [loss: 6.371] [tokens/s: 156363.365] [batches/s: 0.075] [MFU: 0.000] [TFLOPS: 0.000]
52
+ [2025-10-28 05:40:24][train:194][INFO] Running validation...
53
+ [2025-10-28 05:42:23][logger:171][INFO] [step: 335544320] [val/train_token_count: 335544320] [val/train_batch_count: 160] [val/train_flop_count: 0] [val/train_total_time: 3258.343] [val/train_update_time: 1917.340] [val/loss: 6.350] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.164] [val/val_tokens_per_second: 343728.122] [val/loss_avg_len_2048: 6.350] [val/perplexity_len_2048: 572.748] [val/loss_avg_len_1024: 6.361] [val/perplexity_len_1024: 578.982] [val/loss_avg_len_512: 6.375] [val/perplexity_len_512: 587.102]
54
+ [2025-10-28 05:43:33][utils:57][INFO] [P: 17.00%] [S: 356515840/2097152000] [T: 0:57:27] [ETA: 4:40:32] [loss: 6.268] [tokens/s: 152047.726] [batches/s: 0.073] [MFU: 0.000] [TFLOPS: 0.000]
55
+ [2025-10-28 05:44:43][utils:57][INFO] [P: 18.00%] [S: 377487360/2097152000] [T: 0:58:37] [ETA: 4:27:04] [loss: 6.241] [tokens/s: 177579.011] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
56
+ [2025-10-28 05:44:43][train:194][INFO] Running validation...
57
+ [2025-10-28 05:46:42][logger:171][INFO] [step: 377487360] [val/train_token_count: 377487360] [val/train_batch_count: 180] [val/train_flop_count: 0] [val/train_total_time: 3517.635] [val/train_update_time: 2057.194] [val/loss: 6.245] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.162] [val/val_tokens_per_second: 343734.292] [val/loss_avg_len_2048: 6.245] [val/perplexity_len_2048: 515.461] [val/loss_avg_len_1024: 6.257] [val/perplexity_len_1024: 521.559] [val/loss_avg_len_512: 6.273] [val/perplexity_len_512: 529.853]
58
+ [2025-10-28 05:47:52][utils:57][INFO] [P: 19.00%] [S: 398458880/2097152000] [T: 1:01:46] [ETA: 4:23:22] [loss: 6.249] [tokens/s: 161153.155] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
59
+ [2025-10-28 05:49:03][utils:57][INFO] [P: 20.00%] [S: 419430400/2097152000] [T: 1:02:56] [ETA: 4:11:47] [loss: 6.173] [tokens/s: 177572.226] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
60
+ [2025-10-28 05:49:03][logger:171][INFO] [step: 419430400] [train_eval/train_token_count: 419430400] [train_eval/train_batch_count: 200] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3776.912] [train_eval/train_update_time: 2197.046] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.273] [train_eval/perplexity_len_2048: 530.052] [train_eval/loss_avg_len_1024: 6.284] [train_eval/perplexity_len_1024: 536.160] [train_eval/loss_avg_len_512: 6.299] [train_eval/perplexity_len_512: 543.777]
61
+ [2025-10-28 05:49:03][train:194][INFO] Running validation...
62
+ [2025-10-28 05:51:02][logger:171][INFO] [step: 419430400] [val/train_token_count: 419430400] [val/train_batch_count: 200] [val/train_flop_count: 0] [val/train_total_time: 3776.912] [val/train_update_time: 2197.046] [val/loss: 6.165] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.318] [val/val_tokens_per_second: 343285.098] [val/loss_avg_len_2048: 6.165] [val/perplexity_len_2048: 475.950] [val/loss_avg_len_1024: 6.178] [val/perplexity_len_1024: 481.995] [val/loss_avg_len_512: 6.195] [val/perplexity_len_512: 490.469]
63
+ [2025-10-28 05:51:02][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000419430400.pt...
64
+ [2025-10-28 05:51:02][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000419430400.pt.
65
+ [2025-10-28 05:51:02][logger:171][INFO] [step: 419430400] [checkpoint/checkpoint_time: 0.445]
66
+ [2025-10-28 05:52:12][utils:57][INFO] [P: 21.00%] [S: 440401920/2097152000] [T: 1:06:06] [ETA: 4:08:42] [loss: 6.121] [tokens/s: 161071.815] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
67
+ [2025-10-28 05:53:22][utils:57][INFO] [P: 22.00%] [S: 461373440/2097152000] [T: 1:07:16] [ETA: 3:58:32] [loss: 6.109] [tokens/s: 177424.503] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
68
+ [2025-10-28 05:53:22][train:194][INFO] Running validation...
69
+ [2025-10-28 05:55:22][logger:171][INFO] [step: 461373440] [val/train_token_count: 461373440] [val/train_batch_count: 220] [val/train_flop_count: 0] [val/train_total_time: 4036.800] [val/train_update_time: 2336.909] [val/loss: 6.096] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.275] [val/val_tokens_per_second: 343408.828] [val/loss_avg_len_2048: 6.096] [val/perplexity_len_2048: 444.103] [val/loss_avg_len_1024: 6.110] [val/perplexity_len_1024: 450.165] [val/loss_avg_len_512: 6.129] [val/perplexity_len_512: 458.791]
70
+ [2025-10-28 05:56:32][utils:57][INFO] [P: 23.00%] [S: 482344960/2097152000] [T: 1:10:26] [ETA: 3:55:48] [loss: 6.068] [tokens/s: 161014.445] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
71
+ [2025-10-28 05:57:42][utils:57][INFO] [P: 24.00%] [S: 503316480/2097152000] [T: 1:11:36] [ETA: 3:46:44] [loss: 6.020] [tokens/s: 177313.805] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
72
+ [2025-10-28 05:57:42][train:194][INFO] Running validation...
73
+ [2025-10-28 05:59:41][logger:171][INFO] [step: 503316480] [val/train_token_count: 503316480] [val/train_batch_count: 240] [val/train_flop_count: 0] [val/train_total_time: 4296.169] [val/train_update_time: 2476.763] [val/loss: 6.035] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.149] [val/val_tokens_per_second: 343770.380] [val/loss_avg_len_2048: 6.035] [val/perplexity_len_2048: 417.991] [val/loss_avg_len_1024: 6.050] [val/perplexity_len_1024: 424.000] [val/loss_avg_len_512: 6.070] [val/perplexity_len_512: 432.644]
74
+ [2025-10-28 06:00:51][utils:57][INFO] [P: 25.00%] [S: 524288000/2097152000] [T: 1:14:45] [ETA: 3:44:16] [loss: 6.010] [tokens/s: 160932.666] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
75
+ [2025-10-28 06:00:51][logger:171][INFO] [step: 524288000] [train_eval/train_token_count: 524288000] [train_eval/train_batch_count: 250] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4485.400] [train_eval/train_update_time: 2546.702] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.082] [train_eval/perplexity_len_2048: 438.072] [train_eval/loss_avg_len_1024: 6.093] [train_eval/perplexity_len_1024: 442.860] [train_eval/loss_avg_len_512: 6.110] [train_eval/perplexity_len_512: 450.263]
76
+ [2025-10-28 06:02:01][utils:57][INFO] [P: 26.00%] [S: 545259520/2097152000] [T: 1:15:55] [ETA: 3:36:05] [loss: 5.976] [tokens/s: 177309.095] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
77
+ [2025-10-28 06:02:01][train:194][INFO] Running validation...
78
+ [2025-10-28 06:04:03][logger:171][INFO] [step: 545259520] [val/train_token_count: 545259520] [val/train_batch_count: 260] [val/train_flop_count: 0] [val/train_total_time: 4555.463] [val/train_update_time: 2616.634] [val/loss: 5.979] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 121.943] [val/val_tokens_per_second: 335894.395] [val/loss_avg_len_2048: 5.979] [val/perplexity_len_2048: 395.024] [val/loss_avg_len_1024: 5.994] [val/perplexity_len_1024: 401.092] [val/loss_avg_len_512: 6.016] [val/perplexity_len_512: 409.896]
79
+ [2025-10-28 06:05:13][utils:57][INFO] [P: 27.00%] [S: 566231040/2097152000] [T: 1:19:07] [ETA: 3:33:55] [loss: 5.971] [tokens/s: 160581.929] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
80
+ [2025-10-28 06:06:23][utils:57][INFO] [P: 28.00%] [S: 587202560/2097152000] [T: 1:20:17] [ETA: 3:26:28] [loss: 5.917] [tokens/s: 176879.876] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
81
+ [2025-10-28 06:06:23][train:194][INFO] Running validation...
82
+ [2025-10-28 06:08:23][logger:171][INFO] [step: 587202560] [val/train_token_count: 587202560] [val/train_batch_count: 280] [val/train_flop_count: 0] [val/train_total_time: 4817.595] [val/train_update_time: 2756.546] [val/loss: 5.932] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.646] [val/val_tokens_per_second: 342343.152] [val/loss_avg_len_2048: 5.932] [val/perplexity_len_2048: 376.835] [val/loss_avg_len_1024: 5.948] [val/perplexity_len_1024: 382.882] [val/loss_avg_len_512: 5.970] [val/perplexity_len_512: 391.653]
83
+ [2025-10-28 06:09:33][utils:57][INFO] [P: 29.00%] [S: 608174080/2097152000] [T: 1:23:27] [ETA: 3:24:19] [loss: 5.883] [tokens/s: 160515.361] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
84
+ [2025-10-28 06:10:43][utils:57][INFO] [P: 30.00%] [S: 629145600/2097152000] [T: 1:24:37] [ETA: 3:17:27] [loss: 5.869] [tokens/s: 176891.777] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
85
+ [2025-10-28 06:10:43][logger:171][INFO] [step: 629145600] [train_eval/train_token_count: 629145600] [train_eval/train_batch_count: 300] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5077.398] [train_eval/train_update_time: 2896.437] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.951] [train_eval/perplexity_len_2048: 383.983] [train_eval/loss_avg_len_1024: 5.962] [train_eval/perplexity_len_1024: 388.223] [train_eval/loss_avg_len_512: 5.980] [train_eval/perplexity_len_512: 395.637]
86
+ [2025-10-28 06:10:43][train:194][INFO] Running validation...
87
+ [2025-10-28 06:12:42][logger:171][INFO] [step: 629145600] [val/train_token_count: 629145600] [val/train_batch_count: 300] [val/train_flop_count: 0] [val/train_total_time: 5077.398] [val/train_update_time: 2896.437] [val/loss: 5.892] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.627] [val/val_tokens_per_second: 345285.072] [val/loss_avg_len_2048: 5.892] [val/perplexity_len_2048: 362.147] [val/loss_avg_len_1024: 5.908] [val/perplexity_len_1024: 368.146] [val/loss_avg_len_512: 5.932] [val/perplexity_len_512: 376.982]
88
+ [2025-10-28 06:12:42][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000629145600.pt...
89
+ [2025-10-28 06:12:42][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000629145600.pt.
90
+ [2025-10-28 06:12:42][logger:171][INFO] [step: 629145600] [checkpoint/checkpoint_time: 0.544]
91
+ [2025-10-28 06:13:52][utils:57][INFO] [P: 31.00%] [S: 650117120/2097152000] [T: 1:27:46] [ETA: 3:15:22] [loss: 5.913] [tokens/s: 160585.573] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
92
+ [2025-10-28 06:15:02][utils:57][INFO] [P: 32.00%] [S: 671088640/2097152000] [T: 1:28:56] [ETA: 3:09:00] [loss: 5.830] [tokens/s: 176905.674] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
93
+ [2025-10-28 06:15:02][train:194][INFO] Running validation...
94
+ [2025-10-28 06:17:01][logger:171][INFO] [step: 671088640] [val/train_token_count: 671088640] [val/train_batch_count: 320] [val/train_flop_count: 0] [val/train_total_time: 5336.697] [val/train_update_time: 3036.310] [val/loss: 5.856] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.724] [val/val_tokens_per_second: 345001.836] [val/loss_avg_len_2048: 5.856] [val/perplexity_len_2048: 349.308] [val/loss_avg_len_1024: 5.873] [val/perplexity_len_1024: 355.337] [val/loss_avg_len_512: 5.898] [val/perplexity_len_512: 364.266]
95
+ [2025-10-28 06:18:11][utils:57][INFO] [P: 33.00%] [S: 692060160/2097152000] [T: 1:32:05] [ETA: 3:06:58] [loss: 5.871] [tokens/s: 160653.230] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
96
+ [2025-10-28 06:19:21][utils:57][INFO] [P: 34.00%] [S: 713031680/2097152000] [T: 1:33:15] [ETA: 3:01:01] [loss: 5.832] [tokens/s: 176970.345] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
97
+ [2025-10-28 06:19:21][train:194][INFO] Running validation...
98
+ [2025-10-28 06:21:20][logger:171][INFO] [step: 713031680] [val/train_token_count: 713031680] [val/train_batch_count: 340] [val/train_flop_count: 0] [val/train_total_time: 5595.524] [val/train_update_time: 3176.177] [val/loss: 5.820] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.232] [val/val_tokens_per_second: 343530.836] [val/loss_avg_len_2048: 5.820] [val/perplexity_len_2048: 337.058] [val/loss_avg_len_1024: 5.838] [val/perplexity_len_1024: 343.055] [val/loss_avg_len_512: 5.864] [val/perplexity_len_512: 352.054]
99
+ [2025-10-28 06:22:30][utils:57][INFO] [P: 35.00%] [S: 734003200/2097152000] [T: 1:36:24] [ETA: 2:59:03] [loss: 5.805] [tokens/s: 160645.682] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
100
+ [2025-10-28 06:22:30][logger:171][INFO] [step: 734003200] [train_eval/train_token_count: 734003200] [train_eval/train_batch_count: 350] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5784.817] [train_eval/train_update_time: 3246.117] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.849] [train_eval/perplexity_len_2048: 346.889] [train_eval/loss_avg_len_1024: 5.864] [train_eval/perplexity_len_1024: 352.065] [train_eval/loss_avg_len_512: 5.886] [train_eval/perplexity_len_512: 360.116]
101
+ [2025-10-28 06:23:40][utils:57][INFO] [P: 36.00%] [S: 754974720/2097152000] [T: 1:37:34] [ETA: 2:53:28] [loss: 5.737] [tokens/s: 177387.786] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
102
+ [2025-10-28 06:23:40][train:194][INFO] Running validation...
103
+ [2025-10-28 06:25:39][logger:171][INFO] [step: 754974720] [val/train_token_count: 754974720] [val/train_batch_count: 360] [val/train_flop_count: 0] [val/train_total_time: 5854.861] [val/train_update_time: 3316.049] [val/loss: 5.786] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.498] [val/val_tokens_per_second: 345660.914] [val/loss_avg_len_2048: 5.786] [val/perplexity_len_2048: 325.726] [val/loss_avg_len_1024: 5.804] [val/perplexity_len_1024: 331.746] [val/loss_avg_len_512: 5.831] [val/perplexity_len_512: 340.795]
104
+ [2025-10-28 06:26:49][utils:57][INFO] [P: 37.00%] [S: 775946240/2097152000] [T: 1:40:43] [ETA: 2:51:30] [loss: 5.770] [tokens/s: 161081.300] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
105
+ [2025-10-28 06:27:59][utils:57][INFO] [P: 38.00%] [S: 796917760/2097152000] [T: 1:41:53] [ETA: 2:46:14] [loss: 5.743] [tokens/s: 177572.913] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
106
+ [2025-10-28 06:27:59][train:194][INFO] Running validation...
107
+ [2025-10-28 06:29:58][logger:171][INFO] [step: 796917760] [val/train_token_count: 796917760] [val/train_batch_count: 380] [val/train_flop_count: 0] [val/train_total_time: 6113.463] [val/train_update_time: 3455.915] [val/loss: 5.760] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.623] [val/val_tokens_per_second: 345295.402] [val/loss_avg_len_2048: 5.760] [val/perplexity_len_2048: 317.312] [val/loss_avg_len_1024: 5.779] [val/perplexity_len_1024: 323.292] [val/loss_avg_len_512: 5.806] [val/perplexity_len_512: 332.440]
108
+ [2025-10-28 06:31:08][utils:57][INFO] [P: 39.00%] [S: 817889280/2097152000] [T: 1:45:02] [ETA: 2:44:17] [loss: 5.759] [tokens/s: 161218.799] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
109
+ [2025-10-28 06:32:18][utils:57][INFO] [P: 40.00%] [S: 838860800/2097152000] [T: 1:46:12] [ETA: 2:39:18] [loss: 5.662] [tokens/s: 177665.244] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
110
+ [2025-10-28 06:32:18][logger:171][INFO] [step: 838860800] [train_eval/train_token_count: 838860800] [train_eval/train_batch_count: 400] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6372.185] [train_eval/train_update_time: 3595.786] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.767] [train_eval/perplexity_len_2048: 319.631] [train_eval/loss_avg_len_1024: 5.781] [train_eval/perplexity_len_1024: 324.122] [train_eval/loss_avg_len_512: 5.805] [train_eval/perplexity_len_512: 332.034]
111
+ [2025-10-28 06:32:18][train:194][INFO] Running validation...
112
+ [2025-10-28 06:34:16][logger:171][INFO] [step: 838860800] [val/train_token_count: 838860800] [val/train_batch_count: 400] [val/train_flop_count: 0] [val/train_total_time: 6372.185] [val/train_update_time: 3595.786] [val/loss: 5.730] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.578] [val/val_tokens_per_second: 345425.448] [val/loss_avg_len_2048: 5.730] [val/perplexity_len_2048: 307.986] [val/loss_avg_len_1024: 5.749] [val/perplexity_len_1024: 314.022] [val/loss_avg_len_512: 5.778] [val/perplexity_len_512: 323.269]
113
+ [2025-10-28 06:34:16][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000838860800.pt...
114
+ [2025-10-28 06:34:17][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000838860800.pt.
115
+ [2025-10-28 06:34:17][logger:171][INFO] [step: 838860800] [checkpoint/checkpoint_time: 0.604]
116
+ [2025-10-28 06:35:27][utils:57][INFO] [P: 41.00%] [S: 859832320/2097152000] [T: 1:49:21] [ETA: 2:37:22] [loss: 5.682] [tokens/s: 161223.493] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
117
+ [2025-10-28 06:36:37][utils:57][INFO] [P: 42.00%] [S: 880803840/2097152000] [T: 1:50:31] [ETA: 2:32:37] [loss: 5.698] [tokens/s: 177600.917] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
118
+ [2025-10-28 06:36:37][train:194][INFO] Running validation...
119
+ [2025-10-28 06:38:36][logger:171][INFO] [step: 880803840] [val/train_token_count: 880803840] [val/train_batch_count: 420] [val/train_flop_count: 0] [val/train_total_time: 6631.452] [val/train_update_time: 3735.645] [val/loss: 5.707] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.904] [val/val_tokens_per_second: 344480.790] [val/loss_avg_len_2048: 5.707] [val/perplexity_len_2048: 301.034] [val/loss_avg_len_1024: 5.727] [val/perplexity_len_1024: 307.110] [val/loss_avg_len_512: 5.757] [val/perplexity_len_512: 316.405]
120
+ [2025-10-28 06:39:46][utils:57][INFO] [P: 43.00%] [S: 901775360/2097152000] [T: 1:53:40] [ETA: 2:30:41] [loss: 5.693] [tokens/s: 161204.037] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
121
+ [2025-10-28 06:40:56][utils:57][INFO] [P: 44.00%] [S: 922746880/2097152000] [T: 1:54:50] [ETA: 2:26:09] [loss: 5.727] [tokens/s: 177655.013] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
122
+ [2025-10-28 06:40:56][train:194][INFO] Running validation...
123
+ [2025-10-28 06:42:54][logger:171][INFO] [step: 922746880] [val/train_token_count: 922746880] [val/train_batch_count: 440] [val/train_flop_count: 0] [val/train_total_time: 6890.436] [val/train_update_time: 3875.508] [val/loss: 5.684] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.320] [val/val_tokens_per_second: 346178.621] [val/loss_avg_len_2048: 5.684] [val/perplexity_len_2048: 294.055] [val/loss_avg_len_1024: 5.704] [val/perplexity_len_1024: 300.147] [val/loss_avg_len_512: 5.735] [val/perplexity_len_512: 309.552]
124
+ [2025-10-28 06:44:04][utils:57][INFO] [P: 45.00%] [S: 943718400/2097152000] [T: 1:57:58] [ETA: 2:24:11] [loss: 5.670] [tokens/s: 161322.240] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
125
+ [2025-10-28 06:44:04][logger:171][INFO] [step: 943718400] [train_eval/train_token_count: 943718400] [train_eval/train_batch_count: 450] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7078.807] [train_eval/train_update_time: 3945.447] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.700] [train_eval/perplexity_len_2048: 298.773] [train_eval/loss_avg_len_1024: 5.718] [train_eval/perplexity_len_1024: 304.258] [train_eval/loss_avg_len_512: 5.748] [train_eval/perplexity_len_512: 313.427]
126
+ [2025-10-28 06:45:14][utils:57][INFO] [P: 46.00%] [S: 964689920/2097152000] [T: 1:59:08] [ETA: 2:19:52] [loss: 5.644] [tokens/s: 177682.841] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
127
+ [2025-10-28 06:45:14][train:194][INFO] Running validation...
128
+ [2025-10-28 06:47:13][logger:171][INFO] [step: 964689920] [val/train_token_count: 964689920] [val/train_batch_count: 460] [val/train_flop_count: 0] [val/train_total_time: 7148.851] [val/train_update_time: 4015.378] [val/loss: 5.663] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.155] [val/val_tokens_per_second: 346662.196] [val/loss_avg_len_2048: 5.663] [val/perplexity_len_2048: 288.120] [val/loss_avg_len_1024: 5.684] [val/perplexity_len_1024: 294.203] [val/loss_avg_len_512: 5.716] [val/perplexity_len_512: 303.632]
129
+ [2025-10-28 06:48:23][utils:57][INFO] [P: 47.00%] [S: 985661440/2097152000] [T: 2:02:17] [ETA: 2:17:53] [loss: 5.657] [tokens/s: 161364.601] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
130
+ [2025-10-28 06:49:33][utils:57][INFO] [P: 48.00%] [S: 1006632960/2097152000] [T: 2:03:27] [ETA: 2:13:44] [loss: 5.642] [tokens/s: 177744.973] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
131
+ [2025-10-28 06:49:33][train:194][INFO] Running validation...
132
+ [2025-10-28 06:51:31][logger:171][INFO] [step: 1006632960] [val/train_token_count: 1006632960] [val/train_batch_count: 480] [val/train_flop_count: 0] [val/train_total_time: 7407.170] [val/train_update_time: 4155.290] [val/loss: 5.643] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.502] [val/val_tokens_per_second: 345647.934] [val/loss_avg_len_2048: 5.643] [val/perplexity_len_2048: 282.384] [val/loss_avg_len_1024: 5.665] [val/perplexity_len_1024: 288.474] [val/loss_avg_len_512: 5.697] [val/perplexity_len_512: 297.980]
133
+ [2025-10-28 06:52:41][utils:57][INFO] [P: 49.00%] [S: 1027604480/2097152000] [T: 2:06:35] [ETA: 2:11:45] [loss: 5.637] [tokens/s: 161371.491] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
134
+ [2025-10-28 06:53:51][utils:57][INFO] [P: 50.00%] [S: 1048576000/2097152000] [T: 2:07:45] [ETA: 2:07:45] [loss: 5.624] [tokens/s: 177846.101] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
135
+ [2025-10-28 06:53:51][logger:171][INFO] [step: 1048576000] [train_eval/train_token_count: 1048576000] [train_eval/train_batch_count: 500] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7665.790] [train_eval/train_update_time: 4295.186] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.647] [train_eval/perplexity_len_2048: 283.438] [train_eval/loss_avg_len_1024: 5.664] [train_eval/perplexity_len_1024: 288.382] [train_eval/loss_avg_len_512: 5.696] [train_eval/perplexity_len_512: 297.565]
136
+ [2025-10-28 06:53:51][train:194][INFO] Running validation...
137
+ [2025-10-28 06:55:50][logger:171][INFO] [step: 1048576000] [val/train_token_count: 1048576000] [val/train_batch_count: 500] [val/train_flop_count: 0] [val/train_total_time: 7665.790] [val/train_update_time: 4295.186] [val/loss: 5.623] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.537] [val/val_tokens_per_second: 345545.927] [val/loss_avg_len_2048: 5.623] [val/perplexity_len_2048: 276.635] [val/loss_avg_len_1024: 5.645] [val/perplexity_len_1024: 282.750] [val/loss_avg_len_512: 5.678] [val/perplexity_len_512: 292.337]
138
+ [2025-10-28 06:55:50][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001048576000.pt...
139
+ [2025-10-28 06:55:50][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001048576000.pt.
140
+ [2025-10-28 06:55:50][logger:171][INFO] [step: 1048576000] [checkpoint/checkpoint_time: 0.543]
141
+ [2025-10-28 06:57:01][utils:57][INFO] [P: 51.00%] [S: 1069547520/2097152000] [T: 2:10:54] [ETA: 2:05:46] [loss: 5.619] [tokens/s: 161381.450] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
142
+ [2025-10-28 06:58:11][utils:57][INFO] [P: 52.00%] [S: 1090519040/2097152000] [T: 2:12:04] [ETA: 2:01:55] [loss: 5.616] [tokens/s: 177814.118] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
143
+ [2025-10-28 06:58:11][train:194][INFO] Running validation...
144
+ [2025-10-28 07:00:09][logger:171][INFO] [step: 1090519040] [val/train_token_count: 1090519040] [val/train_batch_count: 520] [val/train_flop_count: 0] [val/train_total_time: 7924.985] [val/train_update_time: 4435.058] [val/loss: 5.605] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.866] [val/val_tokens_per_second: 344589.614] [val/loss_avg_len_2048: 5.605] [val/perplexity_len_2048: 271.704] [val/loss_avg_len_1024: 5.627] [val/perplexity_len_1024: 277.840] [val/loss_avg_len_512: 5.661] [val/perplexity_len_512: 287.461]
145
+ [2025-10-28 07:01:20][utils:57][INFO] [P: 53.00%] [S: 1111490560/2097152000] [T: 2:15:13] [ETA: 1:59:55] [loss: 5.587] [tokens/s: 161381.358] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
146
+ [2025-10-28 07:02:30][utils:57][INFO] [P: 54.00%] [S: 1132462080/2097152000] [T: 2:16:23] [ETA: 1:56:11] [loss: 5.596] [tokens/s: 177722.933] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
147
+ [2025-10-28 07:02:30][train:194][INFO] Running validation...
148
+ [2025-10-28 07:04:28][logger:171][INFO] [step: 1132462080] [val/train_token_count: 1132462080] [val/train_batch_count: 540] [val/train_flop_count: 0] [val/train_total_time: 8183.985] [val/train_update_time: 4574.958] [val/loss: 5.590] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.373] [val/val_tokens_per_second: 346026.041] [val/loss_avg_len_2048: 5.590] [val/perplexity_len_2048: 267.758] [val/loss_avg_len_1024: 5.613] [val/perplexity_len_1024: 273.949] [val/loss_avg_len_512: 5.648] [val/perplexity_len_512: 283.660]
149
+ [2025-10-28 07:05:38][utils:57][INFO] [P: 55.00%] [S: 1153433600/2097152000] [T: 2:19:32] [ETA: 1:54:10] [loss: 5.539] [tokens/s: 161368.454] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
150
+ [2025-10-28 07:05:38][logger:171][INFO] [step: 1153433600] [train_eval/train_token_count: 1153433600] [train_eval/train_batch_count: 550] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 8372.430] [train_eval/train_update_time: 4644.910] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.597] [train_eval/perplexity_len_2048: 269.687] [train_eval/loss_avg_len_1024: 5.614] [train_eval/perplexity_len_1024: 274.126] [train_eval/loss_avg_len_512: 5.644] [train_eval/perplexity_len_512: 282.581]
151
+ [2025-10-28 07:06:48][utils:57][INFO] [P: 56.00%] [S: 1174405120/2097152000] [T: 2:20:42] [ETA: 1:50:33] [loss: 5.579] [tokens/s: 177684.218] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
152
+ [2025-10-28 07:06:48][train:194][INFO] Running validation...
153
+ [2025-10-28 07:08:47][logger:171][INFO] [step: 1174405120] [val/train_token_count: 1174405120] [val/train_batch_count: 560] [val/train_flop_count: 0] [val/train_total_time: 8442.486] [val/train_update_time: 4714.853] [val/loss: 5.574] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.529] [val/val_tokens_per_second: 345569.440] [val/loss_avg_len_2048: 5.574] [val/perplexity_len_2048: 263.571] [val/loss_avg_len_1024: 5.598] [val/perplexity_len_1024: 269.753] [val/loss_avg_len_512: 5.633] [val/perplexity_len_512: 279.508]
154
+ [2025-10-28 07:09:57][utils:57][INFO] [P: 57.00%] [S: 1195376640/2097152000] [T: 2:23:51] [ETA: 1:48:31] [loss: 5.533] [tokens/s: 161319.510] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
155
+ [2025-10-28 07:11:07][utils:57][INFO] [P: 58.00%] [S: 1216348160/2097152000] [T: 2:25:01] [ETA: 1:45:00] [loss: 5.593] [tokens/s: 177683.161] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
156
+ [2025-10-28 07:11:07][train:194][INFO] Running validation...
157
+ [2025-10-28 07:13:05][logger:171][INFO] [step: 1216348160] [val/train_token_count: 1216348160] [val/train_batch_count: 580] [val/train_flop_count: 0] [val/train_total_time: 8701.162] [val/train_update_time: 4854.753] [val/loss: 5.560] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.597] [val/val_tokens_per_second: 345370.152] [val/loss_avg_len_2048: 5.560] [val/perplexity_len_2048: 259.857] [val/loss_avg_len_1024: 5.584] [val/perplexity_len_1024: 266.071] [val/loss_avg_len_512: 5.620] [val/perplexity_len_512: 275.967]
158
+ [2025-10-28 07:14:15][utils:57][INFO] [P: 59.00%] [S: 1237319680/2097152000] [T: 2:28:09] [ETA: 1:42:57] [loss: 5.593] [tokens/s: 161307.065] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
159
+ [2025-10-28 07:15:26][utils:57][INFO] [P: 60.00%] [S: 1258291200/2097152000] [T: 2:29:19] [ETA: 1:39:33] [loss: 5.584] [tokens/s: 177749.840] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
160
+ [2025-10-28 07:15:26][logger:171][INFO] [step: 1258291200] [train_eval/train_token_count: 1258291200] [train_eval/train_batch_count: 600] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 8959.925] [train_eval/train_update_time: 4994.663] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.558] [train_eval/perplexity_len_2048: 259.341] [train_eval/loss_avg_len_1024: 5.575] [train_eval/perplexity_len_1024: 263.874] [train_eval/loss_avg_len_512: 5.609] [train_eval/perplexity_len_512: 272.868]
161
+ [2025-10-28 07:15:26][train:194][INFO] Running validation...
162
+ [2025-10-28 07:17:25][logger:171][INFO] [step: 1258291200] [val/train_token_count: 1258291200] [val/train_batch_count: 600] [val/train_flop_count: 0] [val/train_total_time: 8959.925] [val/train_update_time: 4994.663] [val/loss: 5.547] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.351] [val/val_tokens_per_second: 343189.523] [val/loss_avg_len_2048: 5.547] [val/perplexity_len_2048: 256.395] [val/loss_avg_len_1024: 5.571] [val/perplexity_len_1024: 262.651] [val/loss_avg_len_512: 5.608] [val/perplexity_len_512: 272.576]
163
+ [2025-10-28 07:17:25][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001258291200.pt...
164
+ [2025-10-28 07:17:25][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001258291200.pt.
165
+ [2025-10-28 07:17:25][logger:171][INFO] [step: 1258291200] [checkpoint/checkpoint_time: 0.559]
166
+ [2025-10-28 07:18:36][utils:57][INFO] [P: 61.00%] [S: 1279262720/2097152000] [T: 2:32:29] [ETA: 1:37:29] [loss: 5.559] [tokens/s: 161196.762] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
167
+ [2025-10-28 07:19:46][utils:57][INFO] [P: 62.00%] [S: 1300234240/2097152000] [T: 2:33:40] [ETA: 1:34:10] [loss: 5.528] [tokens/s: 177582.347] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
168
+ [2025-10-28 07:19:46][train:194][INFO] Running validation...
169
+ [2025-10-28 07:21:44][logger:171][INFO] [step: 1300234240] [val/train_token_count: 1300234240] [val/train_batch_count: 620] [val/train_flop_count: 0] [val/train_total_time: 9220.005] [val/train_update_time: 5134.569] [val/loss: 5.535] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.741] [val/val_tokens_per_second: 344952.514] [val/loss_avg_len_2048: 5.535] [val/perplexity_len_2048: 253.336] [val/loss_avg_len_1024: 5.559] [val/perplexity_len_1024: 259.605] [val/loss_avg_len_512: 5.597] [val/perplexity_len_512: 269.570]
170
+ [2025-10-28 07:22:54][utils:57][INFO] [P: 63.00%] [S: 1321205760/2097152000] [T: 2:36:48] [ETA: 1:32:05] [loss: 5.550] [tokens/s: 161206.714] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
171
+ [2025-10-28 07:24:05][utils:57][INFO] [P: 64.00%] [S: 1342177280/2097152000] [T: 2:37:58] [ETA: 1:28:51] [loss: 5.538] [tokens/s: 177521.301] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
172
+ [2025-10-28 07:24:05][train:194][INFO] Running validation...
173
+ [2025-10-28 07:26:03][logger:171][INFO] [step: 1342177280] [val/train_token_count: 1342177280] [val/train_batch_count: 640] [val/train_flop_count: 0] [val/train_total_time: 9478.915] [val/train_update_time: 5274.478] [val/loss: 5.524] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.794] [val/val_tokens_per_second: 344799.429] [val/loss_avg_len_2048: 5.524] [val/perplexity_len_2048: 250.611] [val/loss_avg_len_1024: 5.549] [val/perplexity_len_1024: 256.925] [val/loss_avg_len_512: 5.587] [val/perplexity_len_512: 266.969]
174
+ [2025-10-28 07:27:13][utils:57][INFO] [P: 65.00%] [S: 1363148800/2097152000] [T: 2:41:07] [ETA: 1:26:45] [loss: 5.505] [tokens/s: 161149.799] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
175
+ [2025-10-28 07:27:13][logger:171][INFO] [step: 1363148800] [train_eval/train_token_count: 1363148800] [train_eval/train_batch_count: 650] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 9667.798] [train_eval/train_update_time: 5344.432] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.535] [train_eval/perplexity_len_2048: 253.349] [train_eval/loss_avg_len_1024: 5.558] [train_eval/perplexity_len_1024: 259.206] [train_eval/loss_avg_len_512: 5.592] [train_eval/perplexity_len_512: 268.353]
176
+ [2025-10-28 07:28:23][utils:57][INFO] [P: 66.00%] [S: 1384120320/2097152000] [T: 2:42:17] [ETA: 1:23:36] [loss: 5.532] [tokens/s: 177473.598] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
177
+ [2025-10-28 07:28:23][train:194][INFO] Running validation...
178
+ [2025-10-28 07:30:22][logger:171][INFO] [step: 1384120320] [val/train_token_count: 1384120320] [val/train_batch_count: 660] [val/train_flop_count: 0] [val/train_total_time: 9737.888] [val/train_update_time: 5414.394] [val/loss: 5.514] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.890] [val/val_tokens_per_second: 344518.785] [val/loss_avg_len_2048: 5.514] [val/perplexity_len_2048: 248.078] [val/loss_avg_len_1024: 5.539] [val/perplexity_len_1024: 254.381] [val/loss_avg_len_512: 5.578] [val/perplexity_len_512: 264.476]
179
+ [2025-10-28 07:31:32][utils:57][INFO] [P: 67.00%] [S: 1405091840/2097152000] [T: 2:45:26] [ETA: 1:21:29] [loss: 5.486] [tokens/s: 161097.977] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
180
+ [2025-10-28 07:32:43][utils:57][INFO] [P: 68.00%] [S: 1426063360/2097152000] [T: 2:46:36] [ETA: 1:18:24] [loss: 5.502] [tokens/s: 177423.386] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
181
+ [2025-10-28 07:32:43][train:194][INFO] Running validation...
182
+ [2025-10-28 07:34:41][logger:171][INFO] [step: 1426063360] [val/train_token_count: 1426063360] [val/train_batch_count: 680] [val/train_flop_count: 0] [val/train_total_time: 9996.966] [val/train_update_time: 5554.316] [val/loss: 5.504] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.394] [val/val_tokens_per_second: 345963.792] [val/loss_avg_len_2048: 5.504] [val/perplexity_len_2048: 245.566] [val/loss_avg_len_1024: 5.529] [val/perplexity_len_1024: 251.922] [val/loss_avg_len_512: 5.569] [val/perplexity_len_512: 262.109]
183
+ [2025-10-28 07:35:51][utils:57][INFO] [P: 69.00%] [S: 1447034880/2097152000] [T: 2:49:45] [ETA: 1:16:16] [loss: 5.512] [tokens/s: 161120.440] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
184
+ [2025-10-28 07:37:01][utils:57][INFO] [P: 70.00%] [S: 1468006400/2097152000] [T: 2:50:55] [ETA: 1:13:15] [loss: 5.508] [tokens/s: 177649.813] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
185
+ [2025-10-28 07:37:01][logger:171][INFO] [step: 1468006400] [train_eval/train_token_count: 1468006400] [train_eval/train_batch_count: 700] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 10255.549] [train_eval/train_update_time: 5694.240] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.504] [train_eval/perplexity_len_2048: 245.735] [train_eval/loss_avg_len_1024: 5.527] [train_eval/perplexity_len_1024: 251.494] [train_eval/loss_avg_len_512: 5.567] [train_eval/perplexity_len_512: 261.543]
186
+ [2025-10-28 07:37:01][train:194][INFO] Running validation...
187
+ [2025-10-28 07:39:00][logger:171][INFO] [step: 1468006400] [val/train_token_count: 1468006400] [val/train_batch_count: 700] [val/train_flop_count: 0] [val/train_total_time: 10255.549] [val/train_update_time: 5694.240] [val/loss: 5.495] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.524] [val/val_tokens_per_second: 345583.861] [val/loss_avg_len_2048: 5.495] [val/perplexity_len_2048: 243.565] [val/loss_avg_len_1024: 5.521] [val/perplexity_len_1024: 249.950] [val/loss_avg_len_512: 5.561] [val/perplexity_len_512: 260.184]
188
+ [2025-10-28 07:39:00][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001468006400.pt...
189
+ [2025-10-28 07:39:00][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001468006400.pt.
190
+ [2025-10-28 07:39:00][logger:171][INFO] [step: 1468006400] [checkpoint/checkpoint_time: 0.553]
191
+ [2025-10-28 07:40:10][utils:57][INFO] [P: 71.00%] [S: 1488977920/2097152000] [T: 2:54:04] [ETA: 1:11:06] [loss: 5.497] [tokens/s: 161223.245] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
192
+ [2025-10-28 07:41:20][utils:57][INFO] [P: 72.00%] [S: 1509949440/2097152000] [T: 2:55:14] [ETA: 1:08:09] [loss: 5.489] [tokens/s: 177600.962] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
193
+ [2025-10-28 07:41:20][train:194][INFO] Running validation...
194
+ [2025-10-28 07:43:19][logger:171][INFO] [step: 1509949440] [val/train_token_count: 1509949440] [val/train_batch_count: 720] [val/train_flop_count: 0] [val/train_total_time: 10514.781] [val/train_update_time: 5834.131] [val/loss: 5.487] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.976] [val/val_tokens_per_second: 344270.742] [val/loss_avg_len_2048: 5.487] [val/perplexity_len_2048: 241.540] [val/loss_avg_len_1024: 5.513] [val/perplexity_len_1024: 247.922] [val/loss_avg_len_512: 5.554] [val/perplexity_len_512: 258.172]
195
+ [2025-10-28 07:44:29][utils:57][INFO] [P: 73.00%] [S: 1530920960/2097152000] [T: 2:58:23] [ETA: 1:05:58] [loss: 5.506] [tokens/s: 161195.843] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
196
+ [2025-10-28 07:45:40][utils:57][INFO] [P: 74.00%] [S: 1551892480/2097152000] [T: 2:59:33] [ETA: 1:03:05] [loss: 5.501] [tokens/s: 177576.339] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
197
+ [2025-10-28 07:45:40][train:194][INFO] Running validation...
198
+ [2025-10-28 07:47:39][logger:171][INFO] [step: 1551892480] [val/train_token_count: 1551892480] [val/train_batch_count: 740] [val/train_flop_count: 0] [val/train_total_time: 10773.905] [val/train_update_time: 5974.028] [val/loss: 5.480] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.072] [val/val_tokens_per_second: 343994.445] [val/loss_avg_len_2048: 5.480] [val/perplexity_len_2048: 239.927] [val/loss_avg_len_1024: 5.507] [val/perplexity_len_1024: 246.330] [val/loss_avg_len_512: 5.548] [val/perplexity_len_512: 256.613]
199
+ [2025-10-28 07:48:49][utils:57][INFO] [P: 75.00%] [S: 1572864000/2097152000] [T: 3:02:43] [ETA: 1:00:54] [loss: 5.496] [tokens/s: 161165.619] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
200
+ [2025-10-28 07:48:49][logger:171][INFO] [step: 1572864000] [train_eval/train_token_count: 1572864000] [train_eval/train_batch_count: 750] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 10963.046] [train_eval/train_update_time: 6043.979] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.485] [train_eval/perplexity_len_2048: 241.052] [train_eval/loss_avg_len_1024: 5.509] [train_eval/perplexity_len_1024: 246.929] [train_eval/loss_avg_len_512: 5.548] [train_eval/perplexity_len_512: 256.828]
201
+ [2025-10-28 07:49:59][utils:57][INFO] [P: 76.00%] [S: 1593835520/2097152000] [T: 3:03:53] [ETA: 0:58:04] [loss: 5.435] [tokens/s: 177556.099] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
202
+ [2025-10-28 07:49:59][train:194][INFO] Running validation...
203
+ [2025-10-28 07:51:58][logger:171][INFO] [step: 1593835520] [val/train_token_count: 1593835520] [val/train_batch_count: 760] [val/train_flop_count: 0] [val/train_total_time: 11033.109] [val/train_update_time: 6113.922] [val/loss: 5.474] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.073] [val/val_tokens_per_second: 343991.270] [val/loss_avg_len_2048: 5.474] [val/perplexity_len_2048: 238.456] [val/loss_avg_len_1024: 5.501] [val/perplexity_len_1024: 244.888] [val/loss_avg_len_512: 5.542] [val/perplexity_len_512: 255.240]
204
+ [2025-10-28 07:53:08][utils:57][INFO] [P: 77.00%] [S: 1614807040/2097152000] [T: 3:07:02] [ETA: 0:55:52] [loss: 5.515] [tokens/s: 161147.907] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
205
+ [2025-10-28 07:54:18][utils:57][INFO] [P: 78.00%] [S: 1635778560/2097152000] [T: 3:08:12] [ETA: 0:53:05] [loss: 5.426] [tokens/s: 177461.509] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
206
+ [2025-10-28 07:54:18][train:194][INFO] Running validation...
207
+ [2025-10-28 07:56:16][logger:171][INFO] [step: 1635778560] [val/train_token_count: 1635778560] [val/train_batch_count: 780] [val/train_flop_count: 0] [val/train_total_time: 11292.313] [val/train_update_time: 6253.814] [val/loss: 5.468] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.412] [val/val_tokens_per_second: 345911.180] [val/loss_avg_len_2048: 5.468] [val/perplexity_len_2048: 237.079] [val/loss_avg_len_1024: 5.495] [val/perplexity_len_1024: 243.509] [val/loss_avg_len_512: 5.537] [val/perplexity_len_512: 253.897]
208
+ [2025-10-28 07:57:26][utils:57][INFO] [P: 79.00%] [S: 1656750080/2097152000] [T: 3:11:20] [ETA: 0:50:51] [loss: 5.461] [tokens/s: 161153.387] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
209
+ [2025-10-28 07:58:36][utils:57][INFO] [P: 80.00%] [S: 1677721600/2097152000] [T: 3:12:30] [ETA: 0:48:07] [loss: 5.436] [tokens/s: 177570.720] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
210
+ [2025-10-28 07:58:36][logger:171][INFO] [step: 1677721600] [train_eval/train_token_count: 1677721600] [train_eval/train_batch_count: 800] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 11550.860] [train_eval/train_update_time: 6393.711] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.469] [train_eval/perplexity_len_2048: 237.322] [train_eval/loss_avg_len_1024: 5.496] [train_eval/perplexity_len_1024: 243.612] [train_eval/loss_avg_len_512: 5.535] [train_eval/perplexity_len_512: 253.363]
211
+ [2025-10-28 07:58:36][train:194][INFO] Running validation...
212
+ [2025-10-28 08:00:35][logger:171][INFO] [step: 1677721600] [val/train_token_count: 1677721600] [val/train_batch_count: 800] [val/train_flop_count: 0] [val/train_total_time: 11550.860] [val/train_update_time: 6393.711] [val/loss: 5.464] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.559] [val/val_tokens_per_second: 345481.948] [val/loss_avg_len_2048: 5.464] [val/perplexity_len_2048: 235.953] [val/loss_avg_len_1024: 5.491] [val/perplexity_len_1024: 242.389] [val/loss_avg_len_512: 5.533] [val/perplexity_len_512: 252.785]
213
+ [2025-10-28 08:00:35][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001677721600.pt...
214
+ [2025-10-28 08:00:36][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001677721600.pt.
215
+ [2025-10-28 08:00:36][logger:171][INFO] [step: 1677721600] [checkpoint/checkpoint_time: 0.549]
216
+ [2025-10-28 08:01:46][utils:57][INFO] [P: 81.00%] [S: 1698693120/2097152000] [T: 3:15:40] [ETA: 0:45:53] [loss: 5.406] [tokens/s: 161154.567] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
217
+ [2025-10-28 08:02:56][utils:57][INFO] [P: 82.00%] [S: 1719664640/2097152000] [T: 3:16:50] [ETA: 0:43:12] [loss: 5.442] [tokens/s: 177553.144] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
218
+ [2025-10-28 08:02:56][train:194][INFO] Running validation...
219
+ [2025-10-28 08:04:54][logger:171][INFO] [step: 1719664640] [val/train_token_count: 1719664640] [val/train_batch_count: 820] [val/train_flop_count: 0] [val/train_total_time: 11810.108] [val/train_update_time: 6533.614] [val/loss: 5.459] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.309] [val/val_tokens_per_second: 346211.923] [val/loss_avg_len_2048: 5.459] [val/perplexity_len_2048: 234.963] [val/loss_avg_len_1024: 5.487] [val/perplexity_len_1024: 241.416] [val/loss_avg_len_512: 5.529] [val/perplexity_len_512: 251.811]
220
+ [2025-10-28 08:06:04][utils:57][INFO] [P: 83.00%] [S: 1740636160/2097152000] [T: 3:19:58] [ETA: 0:40:57] [loss: 5.469] [tokens/s: 161239.906] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
221
+ [2025-10-28 08:07:14][utils:57][INFO] [P: 84.00%] [S: 1761607680/2097152000] [T: 3:21:08] [ETA: 0:38:18] [loss: 5.416] [tokens/s: 177669.766] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
222
+ [2025-10-28 08:07:14][train:194][INFO] Running validation...
223
+ [2025-10-28 08:09:13][logger:171][INFO] [step: 1761607680] [val/train_token_count: 1761607680] [val/train_batch_count: 840] [val/train_flop_count: 0] [val/train_total_time: 12068.559] [val/train_update_time: 6673.514] [val/loss: 5.456] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.485] [val/val_tokens_per_second: 345697.539] [val/loss_avg_len_2048: 5.456] [val/perplexity_len_2048: 234.114] [val/loss_avg_len_1024: 5.483] [val/perplexity_len_1024: 240.583] [val/loss_avg_len_512: 5.526] [val/perplexity_len_512: 251.032]
224
+ [2025-10-28 08:10:23][utils:57][INFO] [P: 85.00%] [S: 1782579200/2097152000] [T: 3:24:17] [ETA: 0:36:03] [loss: 5.501] [tokens/s: 161313.112] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
225
+ [2025-10-28 08:10:23][logger:171][INFO] [step: 1782579200] [train_eval/train_token_count: 1782579200] [train_eval/train_batch_count: 850] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 12257.112] [train_eval/train_update_time: 6743.462] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.451] [train_eval/perplexity_len_2048: 232.879] [train_eval/loss_avg_len_1024: 5.470] [train_eval/perplexity_len_1024: 237.552] [train_eval/loss_avg_len_512: 5.510] [train_eval/perplexity_len_512: 247.272]
226
+ [2025-10-28 08:11:33][utils:57][INFO] [P: 86.00%] [S: 1803550720/2097152000] [T: 3:25:27] [ETA: 0:33:26] [loss: 5.469] [tokens/s: 177757.939] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
227
+ [2025-10-28 08:11:33][train:194][INFO] Running validation...
228
+ [2025-10-28 08:13:31][logger:171][INFO] [step: 1803550720] [val/train_token_count: 1803550720] [val/train_batch_count: 860] [val/train_flop_count: 0] [val/train_total_time: 12327.183] [val/train_update_time: 6813.415] [val/loss: 5.453] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.348] [val/val_tokens_per_second: 346097.247] [val/loss_avg_len_2048: 5.453] [val/perplexity_len_2048: 233.464] [val/loss_avg_len_1024: 5.480] [val/perplexity_len_1024: 239.935] [val/loss_avg_len_512: 5.523] [val/perplexity_len_512: 250.383]
229
+ [2025-10-28 08:14:41][utils:57][INFO] [P: 87.00%] [S: 1824522240/2097152000] [T: 3:28:35] [ETA: 0:31:10] [loss: 5.406] [tokens/s: 161403.516] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
230
+ [2025-10-28 08:15:51][utils:57][INFO] [P: 88.00%] [S: 1845493760/2097152000] [T: 3:29:45] [ETA: 0:28:36] [loss: 5.437] [tokens/s: 177768.473] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
231
+ [2025-10-28 08:15:51][train:194][INFO] Running validation...
232
+ [2025-10-28 08:17:50][logger:171][INFO] [step: 1845493760] [val/train_token_count: 1845493760] [val/train_batch_count: 880] [val/train_flop_count: 0] [val/train_total_time: 12585.657] [val/train_update_time: 6953.311] [val/loss: 5.451] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.068] [val/val_tokens_per_second: 344004.024] [val/loss_avg_len_2048: 5.451] [val/perplexity_len_2048: 232.918] [val/loss_avg_len_1024: 5.478] [val/perplexity_len_1024: 239.395] [val/loss_avg_len_512: 5.521] [val/perplexity_len_512: 249.859]
233
+ [2025-10-28 08:19:00][utils:57][INFO] [P: 89.00%] [S: 1866465280/2097152000] [T: 3:32:54] [ETA: 0:26:18] [loss: 5.503] [tokens/s: 161318.513] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
234
+ [2025-10-28 08:20:10][utils:57][INFO] [P: 90.00%] [S: 1887436800/2097152000] [T: 3:34:04] [ETA: 0:23:47] [loss: 5.398] [tokens/s: 177769.956] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
235
+ [2025-10-28 08:20:10][logger:171][INFO] [step: 1887436800] [train_eval/train_token_count: 1887436800] [train_eval/train_batch_count: 900] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 12844.892] [train_eval/train_update_time: 7093.223] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.450] [train_eval/perplexity_len_2048: 232.751] [train_eval/loss_avg_len_1024: 5.474] [train_eval/perplexity_len_1024: 238.384] [train_eval/loss_avg_len_512: 5.515] [train_eval/perplexity_len_512: 248.290]
236
+ [2025-10-28 08:20:11][train:194][INFO] Running validation...
237
+ [2025-10-28 08:22:10][logger:171][INFO] [step: 1887436800] [val/train_token_count: 1887436800] [val/train_batch_count: 900] [val/train_flop_count: 0] [val/train_total_time: 12844.892] [val/train_update_time: 7093.223] [val/loss: 5.449] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.423] [val/val_tokens_per_second: 342982.810] [val/loss_avg_len_2048: 5.449] [val/perplexity_len_2048: 232.502] [val/loss_avg_len_1024: 5.476] [val/perplexity_len_1024: 238.977] [val/loss_avg_len_512: 5.519] [val/perplexity_len_512: 249.452]
238
+ [2025-10-28 08:22:10][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001887436800.pt...
239
+ [2025-10-28 08:22:10][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001887436800.pt.
240
+ [2025-10-28 08:22:10][logger:171][INFO] [step: 1887436800] [checkpoint/checkpoint_time: 0.558]
241
+ [2025-10-28 08:23:21][utils:57][INFO] [P: 91.00%] [S: 1908408320/2097152000] [T: 3:37:14] [ETA: 0:21:29] [loss: 5.444] [tokens/s: 161203.830] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
242
+ [2025-10-28 08:24:31][utils:57][INFO] [P: 92.00%] [S: 1929379840/2097152000] [T: 3:38:25] [ETA: 0:18:59] [loss: 5.479] [tokens/s: 177510.209] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
243
+ [2025-10-28 08:24:31][train:194][INFO] Running validation...
244
+ [2025-10-28 08:26:30][logger:171][INFO] [step: 1929379840] [val/train_token_count: 1929379840] [val/train_batch_count: 920] [val/train_flop_count: 0] [val/train_total_time: 13105.045] [val/train_update_time: 7233.126] [val/loss: 5.448] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.240] [val/val_tokens_per_second: 343509.294] [val/loss_avg_len_2048: 5.448] [val/perplexity_len_2048: 232.214] [val/loss_avg_len_1024: 5.475] [val/perplexity_len_1024: 238.695] [val/loss_avg_len_512: 5.518] [val/perplexity_len_512: 249.166]
245
+ [2025-10-28 08:27:40][utils:57][INFO] [P: 93.00%] [S: 1950351360/2097152000] [T: 3:41:34] [ETA: 0:16:40] [loss: 5.456] [tokens/s: 161084.541] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
246
+ [2025-10-28 08:28:50][utils:57][INFO] [P: 94.00%] [S: 1971322880/2097152000] [T: 3:42:44] [ETA: 0:14:13] [loss: 5.403] [tokens/s: 177392.839] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
247
+ [2025-10-28 08:28:50][train:194][INFO] Running validation...
248
+ [2025-10-28 08:30:49][logger:171][INFO] [step: 1971322880] [val/train_token_count: 1971322880] [val/train_batch_count: 940] [val/train_flop_count: 0] [val/train_total_time: 13364.449] [val/train_update_time: 7373.020] [val/loss: 5.447] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.281] [val/val_tokens_per_second: 343392.107] [val/loss_avg_len_2048: 5.447] [val/perplexity_len_2048: 232.003] [val/loss_avg_len_1024: 5.474] [val/perplexity_len_1024: 238.482] [val/loss_avg_len_512: 5.517] [val/perplexity_len_512: 248.962]
249
+ [2025-10-28 08:31:59][utils:57][INFO] [P: 95.00%] [S: 1992294400/2097152000] [T: 3:45:53] [ETA: 0:11:53] [loss: 5.440] [tokens/s: 160983.051] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
250
+ [2025-10-28 08:31:59][logger:171][INFO] [step: 1992294400] [train_eval/train_token_count: 1992294400] [train_eval/train_batch_count: 950] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 13553.814] [train_eval/train_update_time: 7442.970] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.440] [train_eval/perplexity_len_2048: 230.442] [train_eval/loss_avg_len_1024: 5.467] [train_eval/perplexity_len_1024: 236.693] [train_eval/loss_avg_len_512: 5.507] [train_eval/perplexity_len_512: 246.395]
251
+ [2025-10-28 08:33:10][utils:57][INFO] [P: 96.00%] [S: 2013265920/2097152000] [T: 3:47:03] [ETA: 0:09:27] [loss: 5.426] [tokens/s: 177246.769] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
252
+ [2025-10-28 08:33:10][train:194][INFO] Running validation...
253
+ [2025-10-28 08:35:08][logger:171][INFO] [step: 2013265920] [val/train_token_count: 2013265920] [val/train_batch_count: 960] [val/train_flop_count: 0] [val/train_total_time: 13623.900] [val/train_update_time: 7512.918] [val/loss: 5.446] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.658] [val/val_tokens_per_second: 345194.412] [val/loss_avg_len_2048: 5.446] [val/perplexity_len_2048: 231.892] [val/loss_avg_len_1024: 5.474] [val/perplexity_len_1024: 238.373] [val/loss_avg_len_512: 5.517] [val/perplexity_len_512: 248.862]
254
+ [2025-10-28 08:36:18][utils:57][INFO] [P: 97.00%] [S: 2034237440/2097152000] [T: 3:50:12] [ETA: 0:07:07] [loss: 5.463] [tokens/s: 160942.421] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
255
+ [2025-10-28 08:37:28][utils:57][INFO] [P: 98.00%] [S: 2055208960/2097152000] [T: 3:51:22] [ETA: 0:04:43] [loss: 5.444] [tokens/s: 177308.988] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
256
+ [2025-10-28 08:37:28][train:194][INFO] Running validation...
257
+ [2025-10-28 08:39:28][logger:171][INFO] [step: 2055208960] [val/train_token_count: 2055208960] [val/train_batch_count: 980] [val/train_flop_count: 0] [val/train_total_time: 13882.690] [val/train_update_time: 7652.812] [val/loss: 5.446] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.432] [val/val_tokens_per_second: 342955.416] [val/loss_avg_len_2048: 5.446] [val/perplexity_len_2048: 231.842] [val/loss_avg_len_1024: 5.474] [val/perplexity_len_1024: 238.322] [val/loss_avg_len_512: 5.517] [val/perplexity_len_512: 248.809]
258
+ [2025-10-28 08:39:28][train:854][INFO] Training finished with 2055208960 tokens!
metrics/jsonlines/checkpoint.jsonl CHANGED
@@ -1,9 +1,9 @@
1
- {"step": 209715200, "checkpoint/checkpoint_time": 0.45521786995232105}
2
- {"step": 419430400, "checkpoint/checkpoint_time": 0.4552681630011648}
3
- {"step": 629145600, "checkpoint/checkpoint_time": 0.45940230099949986}
4
- {"step": 838860800, "checkpoint/checkpoint_time": 0.45928425301099196}
5
- {"step": 1048576000, "checkpoint/checkpoint_time": 0.4539535099756904}
6
- {"step": 1258291200, "checkpoint/checkpoint_time": 0.461745353997685}
7
- {"step": 1468006400, "checkpoint/checkpoint_time": 0.459751385031268}
8
- {"step": 1677721600, "checkpoint/checkpoint_time": 0.4552595349960029}
9
- {"step": 1887436800, "checkpoint/checkpoint_time": 0.45486548100598156}
 
1
+ {"step": 209715200, "checkpoint/checkpoint_time": 0.4577103740302846}
2
+ {"step": 419430400, "checkpoint/checkpoint_time": 0.4446265029255301}
3
+ {"step": 629145600, "checkpoint/checkpoint_time": 0.5438607160467654}
4
+ {"step": 838860800, "checkpoint/checkpoint_time": 0.6043127420125529}
5
+ {"step": 1048576000, "checkpoint/checkpoint_time": 0.5425283389631659}
6
+ {"step": 1258291200, "checkpoint/checkpoint_time": 0.5587757179746404}
7
+ {"step": 1468006400, "checkpoint/checkpoint_time": 0.5533553039422259}
8
+ {"step": 1677721600, "checkpoint/checkpoint_time": 0.5485877189785242}
9
+ {"step": 1887436800, "checkpoint/checkpoint_time": 0.5579239659709856}
metrics/jsonlines/norm.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
metrics/jsonlines/throughput.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
metrics/jsonlines/train.jsonl CHANGED
@@ -1,98 +1,98 @@
1
- {"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 59.740394167019986, "train/update_time": 59.53266526403604, "train/lr": 0.0009000000000000001, "train/loss": 9.925932884216309, "train/global_grad_norm": 1.0933681726455688}
2
- {"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 115.77270240103826, "train/update_time": 115.42706217308296, "train/lr": 0.0009997960964140947, "train/loss": 8.114099502563477, "train/global_grad_norm": 0.7164918780326843}
3
- {"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 292.0277106779977, "train/update_time": 171.32016484509222, "train/lr": 0.0009990914580222257, "train/loss": 7.684152603149414, "train/global_grad_norm": 0.3018423914909363}
4
- {"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 348.0203448670218, "train/update_time": 227.1939903421444, "train/lr": 0.0009978842768382998, "train/loss": 7.448395729064941, "train/global_grad_norm": 0.2099468857049942}
5
- {"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 523.9415633900207, "train/update_time": 283.0836244261591, "train/lr": 0.0009961757683914405, "train/loss": 7.294851779937744, "train/global_grad_norm": 0.23000206053256989}
6
- {"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 579.9503910699859, "train/update_time": 338.96602103614714, "train/lr": 0.00099396765300483, "train/loss": 7.111999988555908, "train/global_grad_norm": 0.2716911733150482}
7
- {"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 756.5241781410296, "train/update_time": 394.8385707241832, "train/lr": 0.0009912621540634887, "train/loss": 6.983415126800537, "train/global_grad_norm": 0.21857409179210663}
8
- {"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 812.5387418420287, "train/update_time": 450.7242119802977, "train/lr": 0.000988061995775515, "train/loss": 6.815765380859375, "train/global_grad_norm": 0.18543004989624023}
9
- {"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 988.3924220129848, "train/update_time": 506.5995626472868, "train/lr": 0.0009843704004290394, "train/loss": 6.659385681152344, "train/global_grad_norm": 0.20697954297065735}
10
- {"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 1044.3968842840404, "train/update_time": 562.4800082092406, "train/lr": 0.0009801910851476522, "train/loss": 6.568438529968262, "train/global_grad_norm": 0.31236547231674194}
11
- {"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 1221.4379238740075, "train/update_time": 618.3656786112697, "train/lr": 0.0009755282581475768, "train/loss": 6.480067729949951, "train/global_grad_norm": 0.3563300371170044}
12
- {"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 1277.4677367979893, "train/update_time": 674.2627646423061, "train/lr": 0.0009703866145003512, "train/loss": 6.361207008361816, "train/global_grad_norm": 0.28948819637298584}
13
- {"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 1452.9732034530025, "train/update_time": 730.170422351337, "train/lr": 0.0009647713314052896, "train/loss": 6.323390483856201, "train/global_grad_norm": 0.4304763078689575}
14
- {"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 1509.0055129660177, "train/update_time": 786.0731882582186, "train/lr": 0.0009586880629764817, "train/loss": 6.252005577087402, "train/global_grad_norm": 0.22148849070072174}
15
- {"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 1685.596672433021, "train/update_time": 841.9682118772762, "train/lr": 0.0009521429345495787, "train/loss": 6.138335704803467, "train/global_grad_norm": 0.29878920316696167}
16
- {"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 1741.6182956020348, "train/update_time": 897.8648126212647, "train/lr": 0.0009451425365140996, "train/loss": 6.105388641357422, "train/global_grad_norm": 0.24416939914226532}
17
- {"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 1918.145444536989, "train/update_time": 953.7598241912201, "train/lr": 0.000937693917677468, "train/loss": 6.023434162139893, "train/global_grad_norm": 0.3608187139034271}
18
- {"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 1974.1614223060315, "train/update_time": 1009.6508660353138, "train/lr": 0.0009298045781674596, "train/loss": 5.995698928833008, "train/global_grad_norm": 0.2951180636882782}
19
- {"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 2149.7959963100147, "train/update_time": 1065.5460091893328, "train/lr": 0.0009214824618802108, "train/loss": 5.970148086547852, "train/global_grad_norm": 0.5581636428833008}
20
- {"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 2205.8133425950073, "train/update_time": 1121.4434441442718, "train/lr": 0.000912735948481387, "train/loss": 5.88069486618042, "train/global_grad_norm": 0.35760968923568726}
21
- {"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 2381.4569776499993, "train/update_time": 1177.318929851288, "train/lr": 0.0009035738449685707, "train/loss": 5.837252616882324, "train/global_grad_norm": 0.32113462686538696}
22
- {"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 2437.459500588011, "train/update_time": 1233.201419278339, "train/lr": 0.0008940053768033609, "train/loss": 5.823047637939453, "train/global_grad_norm": 0.532181441783905}
23
- {"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 2613.3590940189897, "train/update_time": 1289.0799032752984, "train/lr": 0.0008840401786221159, "train/loss": 5.743273735046387, "train/global_grad_norm": 0.28300613164901733}
24
- {"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 2669.367173767998, "train/update_time": 1344.9662226063083, "train/lr": 0.0008736882845346905, "train/loss": 5.686018466949463, "train/global_grad_norm": 0.4552343785762787}
25
- {"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 2845.8649667550344, "train/update_time": 1400.8440078733838, "train/lr": 0.0008629601180209381, "train/loss": 5.690680980682373, "train/global_grad_norm": 0.40284600853919983}
26
- {"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 2901.8806591850007, "train/update_time": 1456.7321346284007, "train/lr": 0.0008518664814351503, "train/loss": 5.6336669921875, "train/global_grad_norm": 0.35505881905555725}
27
- {"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 3077.9571933770203, "train/update_time": 1512.6155867513735, "train/lr": 0.0008404185451290017, "train/loss": 5.614615440368652, "train/global_grad_norm": 0.42291319370269775}
28
- {"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 3133.951200322015, "train/update_time": 1568.5041498313076, "train/lr": 0.0008286278362039527, "train/loss": 5.560991287231445, "train/global_grad_norm": 0.3632428050041199}
29
- {"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 3309.8123759650043, "train/update_time": 1624.3993004413205, "train/lr": 0.0008165062269044352, "train/loss": 5.52689266204834, "train/global_grad_norm": 0.5304650068283081}
30
- {"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 3365.815582766023, "train/update_time": 1680.2926608613343, "train/lr": 0.0008040659226635089, "train/loss": 5.491298198699951, "train/global_grad_norm": 0.2989009618759155}
31
- {"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 3542.1138300999883, "train/update_time": 1736.1879310192307, "train/lr": 0.0007913194498130252, "train/loss": 5.506411552429199, "train/global_grad_norm": 0.3645610511302948}
32
- {"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 3598.114992738003, "train/update_time": 1792.082390839234, "train/lr": 0.000778279642970672, "train/loss": 5.436527729034424, "train/global_grad_norm": 0.3181352913379669}
33
- {"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 3773.9975122280302, "train/update_time": 1847.994289233291, "train/lr": 0.0007649596321166025, "train/loss": 5.447015762329102, "train/global_grad_norm": 0.5075618624687195}
34
- {"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 3829.9891319620074, "train/update_time": 1903.884795576334, "train/lr": 0.0007513728293726579, "train/loss": 5.411071300506592, "train/global_grad_norm": 0.485222190618515}
35
- {"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 4005.8882146670367, "train/update_time": 1959.7687862973544, "train/lr": 0.0007375329154974975, "train/loss": 5.39218282699585, "train/global_grad_norm": 0.42199867963790894}
36
- {"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 4061.88242925104, "train/update_time": 2015.6589334552991, "train/lr": 0.0007234538261112341, "train/loss": 5.3130316734313965, "train/global_grad_norm": 0.35227981209754944}
37
- {"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 4238.28049745399, "train/update_time": 2071.5370760894148, "train/lr": 0.0007091497376634464, "train/loss": 5.341176509857178, "train/global_grad_norm": 0.4442596137523651}
38
- {"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 4294.288298399013, "train/update_time": 2127.428580871492, "train/lr": 0.0006946350531586958, "train/loss": 5.3172173500061035, "train/global_grad_norm": 0.3961981534957886}
39
- {"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 4470.58636719099, "train/update_time": 2183.3099067094154, "train/lr": 0.0006799243876539214, "train/loss": 5.318506240844727, "train/global_grad_norm": 0.3649199604988098}
40
- {"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 4526.608443362988, "train/update_time": 2239.197950529342, "train/lr": 0.0006650325535423166, "train/loss": 5.243042945861816, "train/global_grad_norm": 0.5602830052375793}
41
- {"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 4703.79108641902, "train/update_time": 2295.093229125312, "train/lr": 0.0006499745456385053, "train/loss": 5.247615814208984, "train/global_grad_norm": 0.434451699256897}
42
- {"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 4759.794200062985, "train/update_time": 2350.9869147563586, "train/lr": 0.0006347655260800339, "train/loss": 5.253141403198242, "train/global_grad_norm": 0.6098817586898804}
43
- {"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 4936.540683488012, "train/update_time": 2406.895653022337, "train/lr": 0.0006194208090603844, "train/loss": 5.238434791564941, "train/global_grad_norm": 0.3986305594444275}
44
- {"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 4992.560281209007, "train/update_time": 2462.802181679348, "train/lr": 0.0006039558454088796, "train/loss": 5.259368419647217, "train/global_grad_norm": 0.45497700572013855}
45
- {"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 5168.203005189018, "train/update_time": 2518.7051250302466, "train/lr": 0.0005883862070330078, "train/loss": 5.205867767333984, "train/global_grad_norm": 0.6130543947219849}
46
- {"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 5224.203345166985, "train/update_time": 2574.5941846003407, "train/lr": 0.0005727275712388317, "train/loss": 5.177216053009033, "train/global_grad_norm": 0.372676819562912}
47
- {"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 5399.371034484007, "train/update_time": 2630.4843858853565, "train/lr": 0.0005569957049452703, "train/loss": 5.195985317230225, "train/global_grad_norm": 0.3776322305202484}
48
- {"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 5455.381416677032, "train/update_time": 2686.3787583513767, "train/lr": 0.0005412064488081482, "train/loss": 5.169455528259277, "train/global_grad_norm": 0.48483920097351074}
49
- {"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 5631.206694298016, "train/update_time": 2742.264746093366, "train/lr": 0.0005253757012699972, "train/loss": 5.1734089851379395, "train/global_grad_norm": 0.3823016583919525}
50
- {"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 5687.226690375013, "train/update_time": 2798.1550497164135, "train/lr": 0.0005095194025516734, "train/loss": 5.148708343505859, "train/global_grad_norm": 0.33509281277656555}
51
- {"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 5863.125213610998, "train/update_time": 2854.0412449103314, "train/lr": 0.0004936535186019053, "train/loss": 5.145965576171875, "train/global_grad_norm": 0.39432910084724426}
52
- {"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 5919.139419794024, "train/update_time": 2909.933540413331, "train/lr": 0.00047779402502093696, "train/loss": 5.137781143188477, "train/global_grad_norm": 0.510867178440094}
53
- {"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 6094.99239180499, "train/update_time": 2965.8256602562615, "train/lr": 0.0004619568909744525, "train/loss": 5.095966815948486, "train/global_grad_norm": 0.4091125428676605}
54
- {"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 6151.005747379037, "train/update_time": 3021.723491590412, "train/lr": 0.00044615806311398067, "train/loss": 5.110124588012695, "train/global_grad_norm": 0.30919671058654785}
55
- {"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 6327.491173934017, "train/update_time": 3077.6138937743963, "train/lr": 0.0004304134495199673, "train/loss": 5.058600425720215, "train/global_grad_norm": 0.3384024202823639}
56
- {"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 6383.517213010986, "train/update_time": 3133.5136128164013, "train/lr": 0.0004147389036836882, "train/loss": 5.096649646759033, "train/global_grad_norm": 0.43362703919410706}
57
- {"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 6560.036292392993, "train/update_time": 3189.418762697489, "train/lr": 0.0003991502085441259, "train/loss": 5.065718173980713, "train/global_grad_norm": 0.35753506422042847}
58
- {"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 6616.055601358006, "train/update_time": 3245.322524867661, "train/lr": 0.0003836630605958888, "train/loss": 5.097409725189209, "train/global_grad_norm": 0.2560317814350128}
59
- {"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 6792.385478265991, "train/update_time": 3301.2410182636813, "train/lr": 0.00036829305408417155, "train/loss": 5.073716640472412, "train/global_grad_norm": 0.33256691694259644}
60
- {"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 6848.3877059380175, "train/update_time": 3357.13471879164, "train/lr": 0.000353055665302672, "train/loss": 5.076870918273926, "train/global_grad_norm": 0.3488343060016632}
61
- {"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 7025.611038778035, "train/update_time": 3413.0350541696534, "train/lr": 0.0003379662370102746, "train/loss": 5.060288429260254, "train/global_grad_norm": 0.2855152189731598}
62
- {"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 7081.627107631008, "train/update_time": 3468.931253584684, "train/lr": 0.00032303996298219405, "train/loss": 5.036261081695557, "train/global_grad_norm": 0.35105201601982117}
63
- {"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 7257.753306786006, "train/update_time": 3524.844808488677, "train/lr": 0.00030829187271113034, "train/loss": 5.049121379852295, "train/global_grad_norm": 0.3490675091743469}
64
- {"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 7313.765329002985, "train/update_time": 3580.7367782525835, "train/lr": 0.0002937368162738445, "train/loss": 5.05193567276001, "train/global_grad_norm": 0.3178498148918152}
65
- {"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 7489.941271601012, "train/update_time": 3636.640745670593, "train/lr": 0.0002793894493783894, "train/loss": 5.022533416748047, "train/global_grad_norm": 0.29690152406692505}
66
- {"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 7545.990838972037, "train/update_time": 3692.5671064984635, "train/lr": 0.00026526421860705474, "train/loss": 5.044463157653809, "train/global_grad_norm": 0.2601265013217926}
67
- {"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 7722.533125129994, "train/update_time": 3748.472306053387, "train/lr": 0.0002513753468698824, "train/loss": 4.998659133911133, "train/global_grad_norm": 0.28301775455474854}
68
- {"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 7778.552717441984, "train/update_time": 3804.3707672305172, "train/lr": 0.00023773681908340283, "train/loss": 5.004274368286133, "train/global_grad_norm": 0.22069884836673737}
69
- {"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 7954.998239959998, "train/update_time": 3860.2436746915337, "train/lr": 0.00022436236808900823, "train/loss": 5.016230583190918, "train/global_grad_norm": 0.2676955461502075}
70
- {"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 8010.996608729009, "train/update_time": 3916.130362140422, "train/lr": 0.00021126546082514682, "train/loss": 5.002098560333252, "train/global_grad_norm": 0.29775163531303406}
71
- {"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 8188.256527994003, "train/update_time": 3972.0219007354463, "train/lr": 0.00019845928476725522, "train/loss": 4.991973400115967, "train/global_grad_norm": 0.325958788394928}
72
- {"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 8244.260649731033, "train/update_time": 4027.906490651425, "train/lr": 0.0001859567346490913, "train/loss": 4.999189376831055, "train/global_grad_norm": 0.22328363358974457}
73
- {"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 8421.350605542015, "train/update_time": 4083.8140410054475, "train/lr": 0.00017377039947882782, "train/loss": 5.008745193481445, "train/global_grad_norm": 0.24172931909561157}
74
- {"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 8477.364581940987, "train/update_time": 4139.6967294025235, "train/lr": 0.00016191254986299043, "train/loss": 5.006008625030518, "train/global_grad_norm": 0.24032117426395416}
75
- {"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 8654.290324420028, "train/update_time": 4195.576348111499, "train/lr": 0.00015039512565099468, "train/loss": 4.995713233947754, "train/global_grad_norm": 0.20805297791957855}
76
- {"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 8710.28124093404, "train/update_time": 4251.451832082472, "train/lr": 0.00013922972391273224, "train/loss": 4.950077056884766, "train/global_grad_norm": 0.2368009090423584}
77
- {"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 8886.364251475025, "train/update_time": 4307.327748217445, "train/lr": 0.00012842758726130281, "train/loss": 5.011436939239502, "train/global_grad_norm": 0.24461600184440613}
78
- {"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 8942.332405220019, "train/update_time": 4363.189962119563, "train/lr": 0.00011799959253265679, "train/loss": 4.935585021972656, "train/global_grad_norm": 0.21502895653247833}
79
- {"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 9118.950192205026, "train/update_time": 4419.082554180524, "train/lr": 0.00010795623983354214, "train/loss": 4.970619201660156, "train/global_grad_norm": 0.18588073551654816}
80
- {"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 9174.941196507018, "train/update_time": 4474.963137161569, "train/lr": 9.830764196878872e-05, "train/loss": 4.93653678894043, "train/global_grad_norm": 0.21770106256008148}
81
- {"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 9351.592722550035, "train/update_time": 4530.845779778552, "train/lr": 8.906351425856951e-05, "train/loss": 4.917532920837402, "train/global_grad_norm": 0.20093698799610138}
82
- {"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 9407.59549859399, "train/update_time": 4586.732141316519, "train/lr": 8.02331647558977e-05, "train/loss": 4.933161735534668, "train/global_grad_norm": 0.1962786465883255}
83
- {"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 9583.465221386985, "train/update_time": 4642.629586387542, "train/lr": 7.182548487420554e-05, "train/loss": 4.9604034423828125, "train/global_grad_norm": 0.16423940658569336}
84
- {"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 9639.468442777987, "train/update_time": 4698.514196849486, "train/lr": 6.384894043444556e-05, "train/loss": 4.935092926025391, "train/global_grad_norm": 0.17330293357372284}
85
- {"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 9814.94877268601, "train/update_time": 4754.397899497591, "train/lr": 5.6311563140726166e-05, "train/loss": 4.997830390930176, "train/global_grad_norm": 0.16490045189857483}
86
- {"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 9870.947561467998, "train/update_time": 4810.2732713416335, "train/lr": 4.922094249306547e-05, "train/loss": 4.9721550941467285, "train/global_grad_norm": 0.16293860971927643}
87
- {"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 10047.000041833031, "train/update_time": 4866.164181352709, "train/lr": 4.2584218145409916e-05, "train/loss": 4.91515588760376, "train/global_grad_norm": 0.1622975766658783}
88
- {"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 10103.00234679901, "train/update_time": 4922.056656240777, "train/lr": 3.6408072716606236e-05, "train/loss": 4.9515910148620605, "train/global_grad_norm": 0.16384997963905334}
89
- {"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 10278.560454143037, "train/update_time": 4977.952506872767, "train/lr": 3.069872506157217e-05, "train/loss": 4.990846157073975, "train/global_grad_norm": 0.15534919500350952}
90
- {"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 10334.590985039016, "train/update_time": 5033.849530185806, "train/lr": 2.5461924009435368e-05, "train/loss": 4.921456336975098, "train/global_grad_norm": 0.16249564290046692}
91
- {"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 10510.921276927984, "train/update_time": 5089.760406139714, "train/lr": 2.0702942574950812e-05, "train/loss": 4.945437431335449, "train/global_grad_norm": 0.15427514910697937}
92
- {"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 10566.927748315036, "train/update_time": 5145.645859024662, "train/lr": 1.642657264902142e-05, "train/loss": 4.986889839172363, "train/global_grad_norm": 0.14873239398002625}
93
- {"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 10742.247933529026, "train/update_time": 5201.534603161563, "train/lr": 1.2637120173670358e-05, "train/loss": 4.958912372589111, "train/global_grad_norm": 0.135355144739151}
94
- {"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 10798.264622143994, "train/update_time": 5257.438326367526, "train/lr": 9.338400806321978e-06, "train/loss": 4.918151378631592, "train/global_grad_norm": 0.13301731646060944}
95
- {"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 10974.142334740027, "train/update_time": 5313.31503523147, "train/lr": 6.533736077758867e-06, "train/loss": 4.9513726234436035, "train/global_grad_norm": 0.13848066329956055}
96
- {"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 11030.159157333022, "train/update_time": 5369.213506219559, "train/lr": 4.2259500476214406e-06, "train/loss": 4.957179069519043, "train/global_grad_norm": 0.14191794395446777}
97
- {"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 11205.89884431701, "train/update_time": 5425.1117221594905, "train/lr": 2.417366460819359e-06, "train/loss": 4.969225883483887, "train/global_grad_norm": 0.14331136643886566}
98
- {"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 11261.909377036034, "train/update_time": 5481.002296196413, "train/lr": 1.1098064077174619e-06, "train/loss": 4.953543663024902, "train/global_grad_norm": 0.12667855620384216}
 
1
+ {"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 212.5578491949709, "train/update_time": 212.33471676090267, "train/lr": 0.0009000000000000001, "train/loss": 9.77643871307373, "train/global_grad_norm": 1.2205885648727417}
2
+ {"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 371.2564815880032, "train/update_time": 370.74722005974036, "train/lr": 0.0009997960964140947, "train/loss": 8.174598693847656, "train/global_grad_norm": 0.9826189875602722}
3
+ {"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 838.406655976898, "train/update_time": 569.1370019397, "train/lr": 0.0009990914580222257, "train/loss": 7.701515197753906, "train/global_grad_norm": 0.39929941296577454}
4
+ {"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 989.879293700913, "train/update_time": 720.4508380297339, "train/lr": 0.0009978842768382998, "train/loss": 7.45575475692749, "train/global_grad_norm": 0.1761881709098816}
5
+ {"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 1454.5109360769857, "train/update_time": 899.2642310586525, "train/lr": 0.0009961757683914405, "train/loss": 7.304540157318115, "train/global_grad_norm": 0.28032752871513367}
6
+ {"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 1606.152420823928, "train/update_time": 1050.7343851425685, "train/lr": 0.00099396765300483, "train/loss": 7.139791488647461, "train/global_grad_norm": 0.18078835308551788}
7
+ {"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 2066.870162793901, "train/update_time": 1206.4585170837818, "train/lr": 0.0009912621540634887, "train/loss": 7.047938346862793, "train/global_grad_norm": 0.3235696852207184}
8
+ {"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 2218.4671919340035, "train/update_time": 1357.900893958984, "train/lr": 0.000988061995775515, "train/loss": 6.9367804527282715, "train/global_grad_norm": 0.257102906703949}
9
+ {"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 2411.522138212924, "train/update_time": 1427.838058966794, "train/lr": 0.0009843704004290394, "train/loss": 6.816991329193115, "train/global_grad_norm": 0.4563130736351013}
10
+ {"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 2481.5876440349966, "train/update_time": 1497.7818741976516, "train/lr": 0.0009801910851476522, "train/loss": 6.740283012390137, "train/global_grad_norm": 0.30055639147758484}
11
+ {"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 2670.7504822599003, "train/update_time": 1567.7054013966117, "train/lr": 0.0009755282581475768, "train/loss": 6.661011695861816, "train/global_grad_norm": 0.5079889893531799}
12
+ {"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 2740.80689570494, "train/update_time": 1637.629011878511, "train/lr": 0.0009703866145003512, "train/loss": 6.556557655334473, "train/global_grad_norm": 0.3374536633491516}
13
+ {"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 2929.6757837759797, "train/update_time": 1707.567629359779, "train/lr": 0.0009647713314052896, "train/loss": 6.5208539962768555, "train/global_grad_norm": 0.22820784151554108}
14
+ {"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 2999.716264599003, "train/update_time": 1777.492755148909, "train/lr": 0.0009586880629764817, "train/loss": 6.479208946228027, "train/global_grad_norm": 0.46022579073905945}
15
+ {"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 3188.29143246694, "train/update_time": 1847.4134167138254, "train/lr": 0.0009521429345495787, "train/loss": 6.371499538421631, "train/global_grad_norm": 0.29645684361457825}
16
+ {"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 3258.342541063903, "train/update_time": 1917.3395090608392, "train/lr": 0.0009451425365140996, "train/loss": 6.371063232421875, "train/global_grad_norm": 0.9106816053390503}
17
+ {"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 3447.5834293199005, "train/update_time": 1987.271274490864, "train/lr": 0.000937693917677468, "train/loss": 6.2678022384643555, "train/global_grad_norm": 0.36450543999671936}
18
+ {"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 3517.6349648769246, "train/update_time": 2057.1937958986964, "train/lr": 0.0009298045781674596, "train/loss": 6.241225242614746, "train/global_grad_norm": 0.30290958285331726}
19
+ {"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 3706.861982131959, "train/update_time": 2127.122993543744, "train/lr": 0.0009214824618802108, "train/loss": 6.248615264892578, "train/global_grad_norm": 0.4511157274246216}
20
+ {"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 3776.9122951189056, "train/update_time": 2197.046115837642, "train/lr": 0.000912735948481387, "train/loss": 6.1730451583862305, "train/global_grad_norm": 0.5304612517356873}
21
+ {"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 3966.7439867819194, "train/update_time": 2266.9816121864133, "train/lr": 0.0009035738449685707, "train/loss": 6.1212158203125, "train/global_grad_norm": 0.3544110357761383}
22
+ {"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 4036.799753597006, "train/update_time": 2336.9093162972713, "train/lr": 0.0008940053768033609, "train/loss": 6.1092681884765625, "train/global_grad_norm": 0.4344483017921448}
23
+ {"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 4226.125323860906, "train/update_time": 2406.836813273141, "train/lr": 0.0008840401786221159, "train/loss": 6.067781448364258, "train/global_grad_norm": 0.3917176425457001}
24
+ {"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 4296.169194732909, "train/update_time": 2476.7631880298723, "train/lr": 0.0008736882845346905, "train/loss": 6.019988536834717, "train/global_grad_norm": 0.422320157289505}
25
+ {"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 4485.399678221904, "train/update_time": 2546.7019683559192, "train/lr": 0.0008629601180209381, "train/loss": 6.009823322296143, "train/global_grad_norm": 0.37123939394950867}
26
+ {"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 4555.462784307892, "train/update_time": 2616.633614034741, "train/lr": 0.0008518664814351503, "train/loss": 5.976399898529053, "train/global_grad_norm": 0.415222704410553}
27
+ {"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 4747.504722302896, "train/update_time": 2686.5944050095277, "train/lr": 0.0008404185451290017, "train/loss": 5.970578670501709, "train/global_grad_norm": 0.41021960973739624}
28
+ {"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 4817.594658994931, "train/update_time": 2756.5458030648297, "train/lr": 0.0008286278362039527, "train/loss": 5.917147636413574, "train/global_grad_norm": 0.392586886882782}
29
+ {"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 5007.321647593984, "train/update_time": 2826.489245740697, "train/lr": 0.0008165062269044352, "train/loss": 5.883110046386719, "train/global_grad_norm": 0.40099892020225525}
30
+ {"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 5077.3983478549635, "train/update_time": 2896.4371623727493, "train/lr": 0.0008040659226635089, "train/loss": 5.869299411773682, "train/global_grad_norm": 0.3332286477088928}
31
+ {"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 5266.6355534009635, "train/update_time": 2966.368956397753, "train/lr": 0.0007913194498130252, "train/loss": 5.913437843322754, "train/global_grad_norm": 0.3435675799846649}
32
+ {"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 5336.696560251992, "train/update_time": 3036.3103124498157, "train/lr": 0.000778279642970672, "train/loss": 5.829973220825195, "train/global_grad_norm": 0.4446646273136139}
33
+ {"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 5525.4722395399585, "train/update_time": 3106.2408556328155, "train/lr": 0.0007649596321166025, "train/loss": 5.8708319664001465, "train/global_grad_norm": 0.37984520196914673}
34
+ {"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 5595.524489760981, "train/update_time": 3176.177008557832, "train/lr": 0.0007513728293726579, "train/loss": 5.832010746002197, "train/global_grad_norm": 0.45653313398361206}
35
+ {"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 5784.816746937926, "train/update_time": 3246.116620109882, "train/lr": 0.0007375329154974975, "train/loss": 5.804886817932129, "train/global_grad_norm": 0.3863084316253662}
36
+ {"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 5854.861399552901, "train/update_time": 3316.0491548541468, "train/lr": 0.0007234538261112341, "train/loss": 5.737117767333984, "train/global_grad_norm": 0.4480031728744507}
37
+ {"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 6043.423031610902, "train/update_time": 3385.989506291109, "train/lr": 0.0007091497376634464, "train/loss": 5.769798755645752, "train/global_grad_norm": 0.39961710572242737}
38
+ {"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 6113.463035934954, "train/update_time": 3455.9149778021965, "train/lr": 0.0006946350531586958, "train/loss": 5.74267578125, "train/global_grad_norm": 0.4710085391998291}
39
+ {"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 6302.139852438006, "train/update_time": 3525.8512225542217, "train/lr": 0.0006799243876539214, "train/loss": 5.759049415588379, "train/global_grad_norm": 0.44018295407295227}
40
+ {"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 6372.185268521891, "train/update_time": 3595.7855843111174, "train/lr": 0.0006650325535423166, "train/loss": 5.66196346282959, "train/global_grad_norm": 0.3946307301521301}
41
+ {"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 6561.41529466596, "train/update_time": 3665.7160565470112, "train/lr": 0.0006499745456385053, "train/loss": 5.682401657104492, "train/global_grad_norm": 0.3963249623775482}
42
+ {"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 6631.451806944911, "train/update_time": 3735.644923887099, "train/lr": 0.0006347655260800339, "train/loss": 5.6978607177734375, "train/global_grad_norm": 0.5203631520271301}
43
+ {"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 6820.405654589995, "train/update_time": 3805.5842581341276, "train/lr": 0.0006194208090603844, "train/loss": 5.692924976348877, "train/global_grad_norm": 0.3886179029941559}
44
+ {"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 6890.435664905934, "train/update_time": 3875.508329823031, "train/lr": 0.0006039558454088796, "train/loss": 5.726889133453369, "train/global_grad_norm": 0.4617585837841034}
45
+ {"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 7078.806805928936, "train/update_time": 3945.447064547101, "train/lr": 0.0005883862070330078, "train/loss": 5.670354843139648, "train/global_grad_norm": 0.42168015241622925}
46
+ {"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 7148.850932181929, "train/update_time": 4015.3776673960965, "train/lr": 0.0005727275712388317, "train/loss": 5.643796443939209, "train/global_grad_norm": 0.478407084941864}
47
+ {"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 7337.074188055005, "train/update_time": 4085.330211903318, "train/lr": 0.0005569957049452703, "train/loss": 5.656505107879639, "train/global_grad_norm": 0.3445208668708801}
48
+ {"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 7407.169766063918, "train/update_time": 4155.28995017719, "train/lr": 0.0005412064488081482, "train/loss": 5.64232063293457, "train/global_grad_norm": 0.5094632506370544}
49
+ {"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 7595.736356125912, "train/update_time": 4225.240399566013, "train/lr": 0.0005253757012699972, "train/loss": 5.636730670928955, "train/global_grad_norm": 0.37454232573509216}
50
+ {"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 7665.789839876001, "train/update_time": 4295.186003304552, "train/lr": 0.0005095194025516734, "train/loss": 5.6238884925842285, "train/global_grad_norm": 0.3839475214481354}
51
+ {"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 7854.928577505983, "train/update_time": 4365.119778911234, "train/lr": 0.0004936535186019053, "train/loss": 5.618503093719482, "train/global_grad_norm": 0.4643455743789673}
52
+ {"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 7924.9846870569745, "train/update_time": 4435.057907043141, "train/lr": 0.00047779402502093696, "train/loss": 5.616159915924072, "train/global_grad_norm": 0.39497897028923035}
53
+ {"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 8113.922514467966, "train/update_time": 4505.009558844264, "train/lr": 0.0004619568909744525, "train/loss": 5.587278842926025, "train/global_grad_norm": 0.514954686164856}
54
+ {"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 8183.984758203966, "train/update_time": 4574.958235046361, "train/lr": 0.00044615806311398067, "train/loss": 5.59569787979126, "train/global_grad_norm": 0.432810515165329}
55
+ {"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 8372.430396109005, "train/update_time": 4644.909644760657, "train/lr": 0.0004304134495199673, "train/loss": 5.539417266845703, "train/global_grad_norm": 0.3852292597293854}
56
+ {"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 8442.485554668936, "train/update_time": 4714.8530240497785, "train/lr": 0.0004147389036836882, "train/loss": 5.578530311584473, "train/global_grad_norm": 0.3600296974182129}
57
+ {"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 8631.0834149539, "train/update_time": 4784.797394883935, "train/lr": 0.0003991502085441259, "train/loss": 5.533158302307129, "train/global_grad_norm": 0.48202306032180786}
58
+ {"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 8701.162048408994, "train/update_time": 4854.753052946762, "train/lr": 0.0003836630605958888, "train/loss": 5.592592239379883, "train/global_grad_norm": 0.4211496114730835}
59
+ {"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 8889.847440016922, "train/update_time": 4924.708595188917, "train/lr": 0.00036829305408417155, "train/loss": 5.592873573303223, "train/global_grad_norm": 0.3531559407711029}
60
+ {"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 8959.925166409928, "train/update_time": 4994.663271273952, "train/lr": 0.000353055665302672, "train/loss": 5.583993911743164, "train/global_grad_norm": 0.379597932100296}
61
+ {"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 9149.922014020965, "train/update_time": 5064.615168323857, "train/lr": 0.0003379662370102746, "train/loss": 5.558979511260986, "train/global_grad_norm": 0.3696227967739105}
62
+ {"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 9220.005188893992, "train/update_time": 5134.568920494756, "train/lr": 0.00032303996298219405, "train/loss": 5.527538299560547, "train/global_grad_norm": 0.3672289252281189}
63
+ {"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 9408.833663397934, "train/update_time": 5204.52180329978, "train/lr": 0.00030829187271113034, "train/loss": 5.5501933097839355, "train/global_grad_norm": 0.4155080318450928}
64
+ {"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 9478.914532593917, "train/update_time": 5274.477632154711, "train/lr": 0.0002937368162738445, "train/loss": 5.538198471069336, "train/global_grad_norm": 0.4198897182941437}
65
+ {"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 9667.797854268923, "train/update_time": 5344.431819725665, "train/lr": 0.0002793894493783894, "train/loss": 5.504896640777588, "train/global_grad_norm": 0.4164905846118927}
66
+ {"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 9737.887706990936, "train/update_time": 5414.394234810607, "train/lr": 0.00026526421860705474, "train/loss": 5.531781196594238, "train/global_grad_norm": 0.35902249813079834}
67
+ {"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 9926.87084120093, "train/update_time": 5484.349449058645, "train/lr": 0.0002513753468698824, "train/loss": 5.486477375030518, "train/global_grad_norm": 0.3695092499256134}
68
+ {"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 9996.965703879949, "train/update_time": 5554.316489073681, "train/lr": 0.00023773681908340283, "train/loss": 5.501741886138916, "train/global_grad_norm": 0.38276293873786926}
69
+ {"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 10185.451445175917, "train/update_time": 5624.270565029001, "train/lr": 0.00022436236808900823, "train/loss": 5.51205587387085, "train/global_grad_norm": 0.34062203764915466}
70
+ {"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 10255.549305356923, "train/update_time": 5694.239998900797, "train/lr": 0.00021126546082514682, "train/loss": 5.508440971374512, "train/global_grad_norm": 0.33368465304374695}
71
+ {"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 10444.704105362995, "train/update_time": 5764.182670753566, "train/lr": 0.00019845928476725522, "train/loss": 5.496548175811768, "train/global_grad_norm": 0.30512407422065735}
72
+ {"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 10514.781046306947, "train/update_time": 5834.130611701286, "train/lr": 0.0001859567346490913, "train/loss": 5.488737106323242, "train/global_grad_norm": 0.3369419574737549}
73
+ {"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 10703.836294994922, "train/update_time": 5904.080027965596, "train/lr": 0.00017377039947882782, "train/loss": 5.5062255859375, "train/global_grad_norm": 0.3015817403793335}
74
+ {"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 10773.90538297291, "train/update_time": 5974.028263681685, "train/lr": 0.00016191254986299043, "train/loss": 5.500577926635742, "train/global_grad_norm": 0.3552543520927429}
75
+ {"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 10963.04556359991, "train/update_time": 6043.978755204822, "train/lr": 0.00015039512565099468, "train/loss": 5.495687007904053, "train/global_grad_norm": 0.3207867443561554}
76
+ {"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 11033.109079251997, "train/update_time": 6113.921902889037, "train/lr": 0.00013922972391273224, "train/loss": 5.435054779052734, "train/global_grad_norm": 0.3032216727733612}
77
+ {"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 11222.254815177992, "train/update_time": 6183.873032106203, "train/lr": 0.00012842758726130281, "train/loss": 5.514736652374268, "train/global_grad_norm": 0.2982519865036011}
78
+ {"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 11292.312823130982, "train/update_time": 6253.814012841205, "train/lr": 0.00011799959253265679, "train/loss": 5.426453113555908, "train/global_grad_norm": 0.2868562340736389}
79
+ {"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 11480.794340396998, "train/update_time": 6323.765302975196, "train/lr": 0.00010795623983354214, "train/loss": 5.461309432983398, "train/global_grad_norm": 0.28381142020225525}
80
+ {"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 11550.859713669983, "train/update_time": 6393.710524166352, "train/lr": 9.830764196878872e-05, "train/loss": 5.43646240234375, "train/global_grad_norm": 0.2892729640007019}
81
+ {"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 11740.033289006911, "train/update_time": 6463.652797408402, "train/lr": 8.906351425856951e-05, "train/loss": 5.4056525230407715, "train/global_grad_norm": 0.2663191556930542}
82
+ {"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 11810.107688057935, "train/update_time": 6533.614025922609, "train/lr": 8.02331647558977e-05, "train/loss": 5.44175386428833, "train/global_grad_norm": 0.27445027232170105}
83
+ {"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 11998.483695015893, "train/update_time": 6603.55498173763, "train/lr": 7.182548487420554e-05, "train/loss": 5.469080448150635, "train/global_grad_norm": 0.25448915362358093}
84
+ {"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 12068.55913664191, "train/update_time": 6673.514490786707, "train/lr": 6.384894043444556e-05, "train/loss": 5.415605545043945, "train/global_grad_norm": 0.2630438804626465}
85
+ {"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 12257.111987237004, "train/update_time": 6743.462062919862, "train/lr": 5.6311563140726166e-05, "train/loss": 5.500694751739502, "train/global_grad_norm": 0.2526546120643616}
86
+ {"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 12327.182909888914, "train/update_time": 6813.415315568796, "train/lr": 4.922094249306547e-05, "train/loss": 5.468751430511475, "train/global_grad_norm": 0.23347875475883484}
87
+ {"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 12515.601697899983, "train/update_time": 6883.366675395635, "train/lr": 4.2584218145409916e-05, "train/loss": 5.40645694732666, "train/global_grad_norm": 0.23998965322971344}
88
+ {"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 12585.657294680947, "train/update_time": 6953.310590816545, "train/lr": 3.6408072716606236e-05, "train/loss": 5.436767578125, "train/global_grad_norm": 0.23755612969398499}
89
+ {"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 12774.815754264942, "train/update_time": 7023.26507879165, "train/lr": 3.069872506157217e-05, "train/loss": 5.502988338470459, "train/global_grad_norm": 0.22962626814842224}
90
+ {"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 12844.891541385907, "train/update_time": 7093.223455801839, "train/lr": 2.5461924009435368e-05, "train/loss": 5.398458957672119, "train/global_grad_norm": 0.23118726909160614}
91
+ {"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 13034.968514320906, "train/update_time": 7163.176251192926, "train/lr": 2.0702942574950812e-05, "train/loss": 5.4442338943481445, "train/global_grad_norm": 0.22224336862564087}
92
+ {"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 13105.045270575909, "train/update_time": 7233.1264379567, "train/lr": 1.642657264902142e-05, "train/loss": 5.47895622253418, "train/global_grad_norm": 0.21664276719093323}
93
+ {"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 13294.375740742893, "train/update_time": 7303.080043341848, "train/lr": 1.2637120173670358e-05, "train/loss": 5.455996036529541, "train/global_grad_norm": 0.20131263136863708}
94
+ {"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 13364.448509541922, "train/update_time": 7373.0204275009455, "train/lr": 9.338400806321978e-06, "train/loss": 5.40268611907959, "train/global_grad_norm": 0.20658168196678162}
95
+ {"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 13553.81382910593, "train/update_time": 7442.970160528901, "train/lr": 6.533736077758867e-06, "train/loss": 5.440489768981934, "train/global_grad_norm": 0.19446603953838348}
96
+ {"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 13623.899586633896, "train/update_time": 7512.917904903879, "train/lr": 4.2259500476214406e-06, "train/loss": 5.4255051612854, "train/global_grad_norm": 0.1936364620923996}
97
+ {"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 13812.626531647984, "train/update_time": 7582.864485563943, "train/lr": 2.417366460819359e-06, "train/loss": 5.462730407714844, "train/global_grad_norm": 0.2014123499393463}
98
+ {"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 13882.689546601963, "train/update_time": 7652.812096997048, "train/lr": 1.1098064077174619e-06, "train/loss": 5.443710803985596, "train/global_grad_norm": 0.18211401998996735}
metrics/jsonlines/train_eval.jsonl CHANGED
@@ -1,19 +1,19 @@
1
- {"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 523.9415633900207, "train_eval/train_update_time": 283.0836244261591, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.44396705981344, "train_eval/perplexity_len_2048": 4646.953208330742, "train_eval/loss_avg_len_1024": 8.445717497379519, "train_eval/perplexity_len_1024": 4655.094533156048, "train_eval/loss_avg_len_512": 8.448021737337111, "train_eval/perplexity_len_512": 4665.833355646153}
2
- {"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1044.3968842840404, "train_eval/train_update_time": 562.4800082092406, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.903726465636282, "train_eval/perplexity_len_2048": 995.9792914347637, "train_eval/loss_avg_len_1024": 6.910232449487958, "train_eval/perplexity_len_1024": 1002.4802412277218, "train_eval/loss_avg_len_512": 6.921446407868643, "train_eval/perplexity_len_512": 1013.7852815864962}
3
- {"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1685.596672433021, "train_eval/train_update_time": 841.9682118772762, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.346844194442092, "train_eval/perplexity_len_2048": 570.6888811028199, "train_eval/loss_avg_len_1024": 6.355865296722332, "train_eval/perplexity_len_1024": 575.8604152670682, "train_eval/loss_avg_len_512": 6.373962009596871, "train_eval/perplexity_len_512": 586.3764618043518}
4
- {"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2205.8133425950073, "train_eval/train_update_time": 1121.4434441442718, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.020796056668696, "train_eval/perplexity_len_2048": 411.906366057056, "train_eval/loss_avg_len_1024": 6.03143457122933, "train_eval/perplexity_len_1024": 416.3118301770113, "train_eval/loss_avg_len_512": 6.050862906577386, "train_eval/perplexity_len_512": 424.47915791229906}
5
- {"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2845.8649667550344, "train_eval/train_update_time": 1400.8440078733838, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.778140883194901, "train_eval/perplexity_len_2048": 323.1578434258851, "train_eval/loss_avg_len_1024": 5.788657015446152, "train_eval/perplexity_len_1024": 326.5741457049287, "train_eval/loss_avg_len_512": 5.809641378527304, "train_eval/perplexity_len_512": 333.4995041431057}
6
- {"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3365.815582766023, "train_eval/train_update_time": 1680.2926608613343, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.594451416916781, "train_eval/perplexity_len_2048": 268.93007912652223, "train_eval/loss_avg_len_1024": 5.604763740769849, "train_eval/perplexity_len_1024": 271.71772203045913, "train_eval/loss_avg_len_512": 5.627147741099179, "train_eval/perplexity_len_512": 277.8684335620044}
7
- {"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4005.8882146670367, "train_eval/train_update_time": 1959.7687862973544, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.446014417338083, "train_eval/perplexity_len_2048": 231.83233521014682, "train_eval/loss_avg_len_1024": 5.45869701535783, "train_eval/perplexity_len_1024": 234.79129551578285, "train_eval/loss_avg_len_512": 5.483045409742335, "train_eval/perplexity_len_512": 240.57825223508894}
8
- {"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4526.608443362988, "train_eval/train_update_time": 2239.197950529342, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.335741158013298, "train_eval/perplexity_len_2048": 207.62657589630717, "train_eval/loss_avg_len_1024": 5.3480183006091835, "train_eval/perplexity_len_1024": 210.19134880139666, "train_eval/loss_avg_len_512": 5.372833612357645, "train_eval/perplexity_len_512": 215.47256920235168}
9
- {"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5168.203005189018, "train_eval/train_update_time": 2518.7051250302466, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.2465569315206455, "train_eval/perplexity_len_2048": 189.91126400613842, "train_eval/loss_avg_len_1024": 5.2607924398454635, "train_eval/perplexity_len_1024": 192.63408175185688, "train_eval/loss_avg_len_512": 5.287922178969602, "train_eval/perplexity_len_512": 197.93173113045845}
10
- {"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5687.226690375013, "train_eval/train_update_time": 2798.1550497164135, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.177761742937019, "train_eval/perplexity_len_2048": 177.2855559320088, "train_eval/loss_avg_len_1024": 5.191431534552903, "train_eval/perplexity_len_1024": 179.72565234595527, "train_eval/loss_avg_len_512": 5.21939211089848, "train_eval/perplexity_len_512": 184.8217987580525}
11
- {"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6327.491173934017, "train_eval/train_update_time": 3077.6138937743963, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.119764854961449, "train_eval/perplexity_len_2048": 167.29602616505414, "train_eval/loss_avg_len_1024": 5.130968105256434, "train_eval/perplexity_len_1024": 169.18082363462895, "train_eval/loss_avg_len_512": 5.1566913579488025, "train_eval/perplexity_len_512": 173.5891600118863}
12
- {"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6848.3877059380175, "train_eval/train_update_time": 3357.13471879164, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.074158342274458, "train_eval/perplexity_len_2048": 159.83760679341086, "train_eval/loss_avg_len_1024": 5.085221332270266, "train_eval/perplexity_len_1024": 161.6157060504098, "train_eval/loss_avg_len_512": 5.112118527772254, "train_eval/perplexity_len_512": 166.0217041617718}
13
- {"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7489.941271601012, "train_eval/train_update_time": 3636.640745670593, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.046691670220553, "train_eval/perplexity_len_2048": 155.50714361657919, "train_eval/loss_avg_len_1024": 5.06205342964935, "train_eval/perplexity_len_1024": 157.91444983555672, "train_eval/loss_avg_len_512": 5.089124586692706, "train_eval/perplexity_len_512": 162.24776601273498}
14
- {"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8010.996608729009, "train_eval/train_update_time": 3916.130362140422, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.012758545674861, "train_eval/perplexity_len_2048": 150.3188260910137, "train_eval/loss_avg_len_1024": 5.0276956282409815, "train_eval/perplexity_len_1024": 152.58100391545096, "train_eval/loss_avg_len_512": 5.057494392056979, "train_eval/perplexity_len_512": 157.19615054325925}
15
- {"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8654.290324420028, "train_eval/train_update_time": 4195.576348111499, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.991214820690475, "train_eval/perplexity_len_2048": 147.11503337066168, "train_eval/loss_avg_len_1024": 5.007109796925361, "train_eval/perplexity_len_1024": 149.4721065146317, "train_eval/loss_avg_len_512": 5.036278882724801, "train_eval/perplexity_len_512": 153.8962821102358}
16
- {"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9174.941196507018, "train_eval/train_update_time": 4474.963137161569, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.975023514186086, "train_eval/perplexity_len_2048": 144.75222883900852, "train_eval/loss_avg_len_1024": 4.991780238335013, "train_eval/perplexity_len_1024": 147.1982383268648, "train_eval/loss_avg_len_512": 5.02088207897912, "train_eval/perplexity_len_512": 151.54491944344505}
17
- {"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9814.94877268601, "train_eval/train_update_time": 4754.397899497591, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.9583539266328085, "train_eval/perplexity_len_2048": 142.35926912432882, "train_eval/loss_avg_len_1024": 4.970177483169246, "train_eval/perplexity_len_1024": 144.05245198792426, "train_eval/loss_avg_len_512": 4.999460103386228, "train_eval/perplexity_len_512": 148.33305296699936}
18
- {"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 10334.590985039016, "train_eval/train_update_time": 5033.849530185806, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.958045211400968, "train_eval/perplexity_len_2048": 142.31532743264023, "train_eval/loss_avg_len_1024": 4.973284861676475, "train_eval/perplexity_len_1024": 144.5007736729874, "train_eval/loss_avg_len_512": 5.002621874920951, "train_eval/perplexity_len_512": 148.80279040143105}
19
- {"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 10974.142334740027, "train_eval/train_update_time": 5313.31503523147, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.94873155637828, "train_eval/perplexity_len_2048": 140.9960049626137, "train_eval/loss_avg_len_1024": 4.965872568360719, "train_eval/perplexity_len_1024": 143.43365135181708, "train_eval/loss_avg_len_512": 4.994268174605532, "train_eval/perplexity_len_512": 147.56491410680255}
 
1
+ {"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1454.5109360769857, "train_eval/train_update_time": 899.2642310586525, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.402821731921286, "train_eval/perplexity_len_2048": 4459.632898714833, "train_eval/loss_avg_len_1024": 8.407352303462105, "train_eval/perplexity_len_1024": 4479.883423194555, "train_eval/loss_avg_len_512": 8.407402739301325, "train_eval/perplexity_len_512": 4480.109375572615}
2
+ {"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2481.5876440349966, "train_eval/train_update_time": 1497.7818741976516, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.995038702964084, "train_eval/perplexity_len_2048": 1091.2059098186799, "train_eval/loss_avg_len_1024": 7.002110293316655, "train_eval/perplexity_len_1024": 1098.9498196110778, "train_eval/loss_avg_len_512": 7.007181799188256, "train_eval/perplexity_len_512": 1104.5373065844053}
3
+ {"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3188.29143246694, "train_eval/train_update_time": 1847.4134167138254, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.550251329710736, "train_eval/perplexity_len_2048": 699.4199367381216, "train_eval/loss_avg_len_1024": 6.5585846149979625, "train_eval/perplexity_len_1024": 705.2727553400579, "train_eval/loss_avg_len_512": 6.569192992824828, "train_eval/perplexity_len_512": 712.7943807832622}
4
+ {"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3776.9122951189056, "train_eval/train_update_time": 2197.046115837642, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.272974732764633, "train_eval/perplexity_len_2048": 530.0517974267286, "train_eval/loss_avg_len_1024": 6.284432089452021, "train_eval/perplexity_len_1024": 536.1597134027296, "train_eval/loss_avg_len_512": 6.298539984831004, "train_eval/perplexity_len_512": 543.7774070109691}
5
+ {"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4485.399678221904, "train_eval/train_update_time": 2546.7019683559192, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.0823833193690735, "train_eval/perplexity_len_2048": 438.0720170587346, "train_eval/loss_avg_len_1024": 6.09325327728664, "train_eval/perplexity_len_1024": 442.85981589278384, "train_eval/loss_avg_len_512": 6.109832875879656, "train_eval/perplexity_len_512": 450.2634589947275}
6
+ {"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5077.3983478549635, "train_eval/train_update_time": 2896.4371623727493, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.950598046767519, "train_eval/perplexity_len_2048": 383.98291014534203, "train_eval/loss_avg_len_1024": 5.961579375055372, "train_eval/perplexity_len_1024": 388.2227896861234, "train_eval/loss_avg_len_512": 5.980496873812372, "train_eval/perplexity_len_512": 395.63690094056886}
7
+ {"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5784.816746937926, "train_eval/train_update_time": 3246.116620109882, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.849003767906725, "train_eval/perplexity_len_2048": 346.888626699115, "train_eval/loss_avg_len_1024": 5.863815515086607, "train_eval/perplexity_len_1024": 352.0648934809872, "train_eval/loss_avg_len_512": 5.8864257618402185, "train_eval/perplexity_len_512": 360.1158415743008}
8
+ {"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6372.185268521891, "train_eval/train_update_time": 3595.7855843111174, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.767166235738806, "train_eval/perplexity_len_2048": 319.6306900556352, "train_eval/loss_avg_len_1024": 5.7811188802827385, "train_eval/perplexity_len_1024": 324.1216409233059, "train_eval/loss_avg_len_512": 5.805235980615107, "train_eval/perplexity_len_512": 332.03353757775665}
9
+ {"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7078.806805928936, "train_eval/train_update_time": 3945.447064547101, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.699684085610935, "train_eval/perplexity_len_2048": 298.77299936686927, "train_eval/loss_avg_len_1024": 5.717875908720817, "train_eval/perplexity_len_1024": 304.2579644118213, "train_eval/loss_avg_len_512": 5.747566951586341, "train_eval/perplexity_len_512": 313.42714840543255}
10
+ {"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7665.789839876001, "train_eval/train_update_time": 4295.186003304552, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.646992232510201, "train_eval/perplexity_len_2048": 283.4376678519485, "train_eval/loss_avg_len_1024": 5.66428641970735, "train_eval/perplexity_len_1024": 288.3821238771646, "train_eval/loss_avg_len_512": 5.695631070602831, "train_eval/perplexity_len_512": 297.5645185661324}
11
+ {"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8372.430396109005, "train_eval/train_update_time": 4644.909644760657, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.59726160423892, "train_eval/perplexity_len_2048": 269.68688591179045, "train_eval/loss_avg_len_1024": 5.613586209967179, "train_eval/perplexity_len_1024": 274.1255491357345, "train_eval/loss_avg_len_512": 5.643964380473335, "train_eval/perplexity_len_512": 282.5807584840765}
12
+ {"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8959.925166409928, "train_eval/train_update_time": 4994.663271273952, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.558144126648113, "train_eval/perplexity_len_2048": 259.3410852175999, "train_eval/loss_avg_len_1024": 5.575471486838069, "train_eval/perplexity_len_1024": 263.87393940131795, "train_eval/loss_avg_len_512": 5.6089897783461495, "train_eval/perplexity_len_512": 272.8684411123516}
13
+ {"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9667.797854268923, "train_eval/train_update_time": 5344.431819725665, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.5347687173953455, "train_eval/perplexity_len_2048": 253.3491856010016, "train_eval/loss_avg_len_1024": 5.557624726276845, "train_eval/perplexity_len_1024": 259.20641833769395, "train_eval/loss_avg_len_512": 5.592303494717344, "train_eval/perplexity_len_512": 268.3530581593862}
14
+ {"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 10255.549305356923, "train_eval/train_update_time": 5694.239998900797, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.504252203528103, "train_eval/perplexity_len_2048": 245.7346274669015, "train_eval/loss_avg_len_1024": 5.52741995374381, "train_eval/perplexity_len_1024": 251.49420656403856, "train_eval/loss_avg_len_512": 5.566598175862454, "train_eval/perplexity_len_512": 261.5428613637092}
15
+ {"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 10963.04556359991, "train_eval/train_update_time": 6043.978755204822, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.4850125069963545, "train_eval/perplexity_len_2048": 241.0519588151443, "train_eval/loss_avg_len_1024": 5.509102286305788, "train_eval/perplexity_len_1024": 246.92935567481706, "train_eval/loss_avg_len_512": 5.548407077461597, "train_eval/perplexity_len_512": 256.82812258960223}
16
+ {"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 11550.859713669983, "train_eval/train_update_time": 6393.710524166352, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.46941686344535, "train_eval/perplexity_len_2048": 237.32176140860722, "train_eval/loss_avg_len_1024": 5.495577427410535, "train_eval/perplexity_len_1024": 243.61215389080553, "train_eval/loss_avg_len_512": 5.534824562655558, "train_eval/perplexity_len_512": 253.36333434726268}
17
+ {"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 12257.111987237004, "train_eval/train_update_time": 6743.462062919862, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.450519698281187, "train_eval/perplexity_len_2048": 232.87916136425875, "train_eval/loss_avg_len_1024": 5.47038564050632, "train_eval/perplexity_len_1024": 237.55178468977783, "train_eval/loss_avg_len_512": 5.510487729436864, "train_eval/perplexity_len_512": 247.27169934858398}
18
+ {"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 12844.891541385907, "train_eval/train_update_time": 7093.223455801839, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.449968136312364, "train_eval/perplexity_len_2048": 232.7507494923267, "train_eval/loss_avg_len_1024": 5.473881935751124, "train_eval/perplexity_len_1024": 238.3837894846464, "train_eval/loss_avg_len_512": 5.514598703888769, "train_eval/perplexity_len_512": 248.29031931304468}
19
+ {"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 13553.81382910593, "train_eval/train_update_time": 7442.970160528901, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.439999618460788, "train_eval/perplexity_len_2048": 230.4420955379297, "train_eval/loss_avg_len_1024": 5.46676403799378, "train_eval/perplexity_len_1024": 236.69302253570154, "train_eval/loss_avg_len_512": 5.506935359557975, "train_eval/perplexity_len_512": 246.39485716788857}
metrics/jsonlines/val.jsonl CHANGED
@@ -1,49 +1,49 @@
1
- {"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 115.77270240103826, "val/train_update_time": 115.42706217308296, "val/loss": 8.015673879027368, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.22489377000602, "val/val_tokens_per_second": 340694.8321231854, "val/loss_avg_len_2048": 8.015673879027368, "val/perplexity_len_2048": 3028.0492492157514, "val/loss_avg_len_1024": 8.014426561307907, "val/perplexity_len_1024": 3024.2746642745205, "val/loss_avg_len_512": 8.015268499183655, "val/perplexity_len_512": 3026.8219878546656}
2
- {"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 348.0203448670218, "val/train_update_time": 227.1939903421444, "val/loss": 7.432892477142811, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.90724678500555, "val/val_tokens_per_second": 341597.36878490367, "val/loss_avg_len_2048": 7.432892477142811, "val/perplexity_len_2048": 1690.690792134549, "val/loss_avg_len_1024": 7.434750729179383, "val/perplexity_len_1024": 1693.8354426139251, "val/loss_avg_len_512": 7.4413878100395205, "val/perplexity_len_512": 1705.114955523971}
3
- {"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 579.9503910699859, "val/train_update_time": 338.96602103614714, "val/loss": 7.108143065983057, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.57100503600668, "val/val_tokens_per_second": 339716.8331454807, "val/loss_avg_len_2048": 7.108143065983057, "val/perplexity_len_2048": 1221.8764941210595, "val/loss_avg_len_1024": 7.111402546131611, "val/perplexity_len_1024": 1225.8656740922706, "val/loss_avg_len_512": 7.121028823280335, "val/perplexity_len_512": 1237.7231770584187}
4
- {"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 812.5387418420287, "val/train_update_time": 450.7242119802977, "val/loss": 6.80320226866305, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.84687927697087, "val/val_tokens_per_second": 341769.4331893267, "val/loss_avg_len_2048": 6.80320226866305, "val/perplexity_len_2048": 900.7270483130554, "val/loss_avg_len_1024": 6.8087645598977815, "val/perplexity_len_1024": 905.7511141842256, "val/loss_avg_len_512": 6.8227169809758665, "val/perplexity_len_512": 918.4771078858294}
5
- {"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 1044.3968842840404, "val/train_update_time": 562.4800082092406, "val/loss": 6.553184749881178, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.57042892603204, "val/val_tokens_per_second": 339718.45638144226, "val/loss_avg_len_2048": 6.553184749881178, "val/perplexity_len_2048": 701.4746414709258, "val/loss_avg_len_1024": 6.560025909277051, "val/perplexity_len_1024": 706.2899938214338, "val/loss_avg_len_512": 6.576782421255111, "val/perplexity_len_512": 718.2246630258743}
6
- {"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 1277.4677367979893, "val/train_update_time": 674.2627646423061, "val/loss": 6.368516060034558, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.46988375496585, "val/val_tokens_per_second": 342847.9103906173, "val/loss_avg_len_2048": 6.368516060034558, "val/perplexity_len_2048": 583.1917648892949, "val/loss_avg_len_1024": 6.376286262953282, "val/perplexity_len_1024": 587.7409343394978, "val/loss_avg_len_512": 6.39470073364526, "val/perplexity_len_512": 598.6641363781473}
7
- {"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 1509.0055129660177, "val/train_update_time": 786.0731882582186, "val/loss": 6.228333079337515, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.57018486200832, "val/val_tokens_per_second": 339719.1440560402, "val/loss_avg_len_2048": 6.228333079337515, "val/perplexity_len_2048": 506.9098004279741, "val/loss_avg_len_1024": 6.2365625457014895, "val/perplexity_len_1024": 511.0986097727111, "val/loss_avg_len_512": 6.255868876511604, "val/perplexity_len_512": 521.0619165878901}
8
- {"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 1741.6182956020348, "val/train_update_time": 897.8648126212647, "val/loss": 6.0991746725922455, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.50438052997924, "val/val_tokens_per_second": 339904.6559125701, "val/loss_avg_len_2048": 6.0991746725922455, "val/perplexity_len_2048": 445.489943254568, "val/loss_avg_len_1024": 6.108094833559171, "val/perplexity_len_1024": 449.4815617297244, "val/loss_avg_len_512": 6.1286831315368415, "val/perplexity_len_512": 458.83154187819093}
9
- {"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 1974.1614223060315, "val/train_update_time": 1009.6508660353138, "val/loss": 5.986035494066402, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.61198412301019, "val/val_tokens_per_second": 342440.60325825145, "val/loss_avg_len_2048": 5.986035494066402, "val/perplexity_len_2048": 397.8342630532667, "val/loss_avg_len_1024": 5.995515969962067, "val/perplexity_len_1024": 401.62385638490093, "val/loss_avg_len_512": 6.017069597534835, "val/perplexity_len_512": 410.37427023615345}
10
- {"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 2205.8133425950073, "val/train_update_time": 1121.4434441442718, "val/loss": 5.883162301799842, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.17791116697481, "val/val_tokens_per_second": 343687.84952618263, "val/loss_avg_len_2048": 5.883162301799842, "val/perplexity_len_2048": 358.9425334787863, "val/loss_avg_len_1024": 5.893183481512219, "val/perplexity_len_1024": 362.5576446991801, "val/loss_avg_len_512": 5.915859284684807, "val/perplexity_len_512": 370.8728511838533}
11
- {"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 2437.459500588011, "val/train_update_time": 1233.201419278339, "val/loss": 5.795106819085591, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.89575185399735, "val/val_tokens_per_second": 341630.119221229, "val/loss_avg_len_2048": 5.795106819085591, "val/perplexity_len_2048": 328.6872921828014, "val/loss_avg_len_1024": 5.80581705780169, "val/perplexity_len_1024": 332.2265307581603, "val/loss_avg_len_512": 5.8295803298715505, "val/perplexity_len_512": 340.21587066949}
12
- {"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 2669.367173767998, "val/train_update_time": 1344.9662226063083, "val/loss": 5.713464564111806, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.50630050798645, "val/val_tokens_per_second": 339899.2403495567, "val/loss_avg_len_2048": 5.713464564111806, "val/perplexity_len_2048": 302.9187337565111, "val/loss_avg_len_1024": 5.72461780461669, "val/perplexity_len_1024": 306.3161702421054, "val/loss_avg_len_512": 5.748771319710836, "val/perplexity_len_512": 313.8048574770508}
13
- {"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 2901.8806591850007, "val/train_update_time": 1456.7321346284007, "val/loss": 5.634124744190066, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.06040904897964, "val/val_tokens_per_second": 341161.5896068622, "val/loss_avg_len_2048": 5.634124744190066, "val/perplexity_len_2048": 279.8139013573606, "val/loss_avg_len_1024": 5.645964613835513, "val/perplexity_len_1024": 283.1465516152128, "val/loss_avg_len_512": 5.671252074276097, "val/perplexity_len_512": 290.3979066084578}
14
- {"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 3133.951200322015, "val/train_update_time": 1568.5041498313076, "val/loss": 5.5654044165277625, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.84437395603163, "val/val_tokens_per_second": 341776.5778060417, "val/loss_avg_len_2048": 5.5654044165277625, "val/perplexity_len_2048": 261.23082841471256, "val/loss_avg_len_1024": 5.577634366972465, "val/perplexity_len_1024": 264.4452847558606, "val/loss_avg_len_512": 5.603337141299248, "val/perplexity_len_512": 271.33036603867896}
15
- {"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 3365.815582766023, "val/train_update_time": 1680.2926608613343, "val/loss": 5.505503168662952, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.83285554096801, "val/val_tokens_per_second": 341809.42960169003, "val/loss_avg_len_2048": 5.505503168662952, "val/perplexity_len_2048": 246.04222527524095, "val/loss_avg_len_1024": 5.518010230046278, "val/perplexity_len_1024": 249.13881474127635, "val/loss_avg_len_512": 5.544472198890244, "val/perplexity_len_512": 255.81952077770256}
16
- {"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 3598.114992738003, "val/train_update_time": 1792.082390839234, "val/loss": 5.451877971532068, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.85984571097651, "val/val_tokens_per_second": 341732.4605837447, "val/loss_avg_len_2048": 5.451877971532068, "val/perplexity_len_2048": 233.1956898172289, "val/loss_avg_len_1024": 5.464605613744981, "val/perplexity_len_1024": 236.182689532032, "val/loss_avg_len_512": 5.491216098095244, "val/perplexity_len_512": 242.55199459484575}
17
- {"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 3829.9891319620074, "val/train_update_time": 1903.884795576334, "val/loss": 5.408561823876447, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.89821732096607, "val/val_tokens_per_second": 341623.09428129846, "val/loss_avg_len_2048": 5.408561823876447, "val/perplexity_len_2048": 223.31019723410046, "val/loss_avg_len_1024": 5.421516989876121, "val/perplexity_len_1024": 226.22203887867025, "val/loss_avg_len_512": 5.448322415597644, "val/perplexity_len_512": 232.368021780364}
18
- {"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 4061.88242925104, "val/train_update_time": 2015.6589334552991, "val/loss": 5.360722617470438, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.4022120119771, "val/val_tokens_per_second": 340193.0854553193, "val/loss_avg_len_2048": 5.360722617470438, "val/perplexity_len_2048": 212.87872076556144, "val/loss_avg_len_1024": 5.374010867381096, "val/perplexity_len_1024": 215.7263847403555, "val/loss_avg_len_512": 5.401363052465115, "val/perplexity_len_512": 221.70841053588902}
19
- {"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 4294.288298399013, "val/train_update_time": 2127.428580871492, "val/loss": 5.324311214937968, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.29681166395312, "val/val_tokens_per_second": 340491.1521214792, "val/loss_avg_len_2048": 5.324311214937968, "val/perplexity_len_2048": 205.26692696870975, "val/loss_avg_len_1024": 5.337608720328915, "val/perplexity_len_1024": 208.01469376952264, "val/loss_avg_len_512": 5.36512916255719, "val/perplexity_len_512": 213.81885028692912}
20
- {"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 4526.608443362988, "val/train_update_time": 2239.197950529342, "val/loss": 5.286909812269924, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.71425316296518, "val/val_tokens_per_second": 339313.7009654004, "val/loss_avg_len_2048": 5.286909812269924, "val/perplexity_len_2048": 197.73145303159654, "val/loss_avg_len_1024": 5.3004279320558885, "val/perplexity_len_1024": 200.42255886379493, "val/loss_avg_len_512": 5.3283431476617, "val/perplexity_len_512": 206.0962201122877}
21
- {"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 4759.794200062985, "val/train_update_time": 2350.9869147563586, "val/loss": 5.25467645336233, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.73026912601199, "val/val_tokens_per_second": 339268.6879315085, "val/loss_avg_len_2048": 5.25467645336233, "val/perplexity_len_2048": 191.45952974497322, "val/loss_avg_len_1024": 5.268592918162723, "val/perplexity_len_1024": 194.14259564559134, "val/loss_avg_len_512": 5.296714650505549, "val/perplexity_len_512": 199.67971352419883}
22
- {"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 4992.560281209007, "val/train_update_time": 2462.802181679348, "val/loss": 5.223173501159495, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.62471787503455, "val/val_tokens_per_second": 342404.1513125129, "val/loss_avg_len_2048": 5.223173501159495, "val/perplexity_len_2048": 185.5220051503669, "val/loss_avg_len_1024": 5.237343601963297, "val/perplexity_len_1024": 188.16958459763552, "val/loss_avg_len_512": 5.265995957652177, "val/perplexity_len_512": 193.63906909356123}
23
- {"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 5224.203345166985, "val/train_update_time": 2574.5941846003407, "val/loss": 5.195035318465339, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.14392835099716, "val/val_tokens_per_second": 343785.8778613723, "val/loss_avg_len_2048": 5.195035318465339, "val/perplexity_len_2048": 180.37451323550582, "val/loss_avg_len_1024": 5.209246023547824, "val/perplexity_len_1024": 182.95606161492964, "val/loss_avg_len_512": 5.237928895593109, "val/perplexity_len_512": 188.27975129361263}
24
- {"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 5455.381416677032, "val/train_update_time": 2686.3787583513767, "val/loss": 5.172354661972123, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.80233659001533, "val/val_tokens_per_second": 341896.50357298396, "val/loss_avg_len_2048": 5.172354661972123, "val/perplexity_len_2048": 176.32954551923112, "val/loss_avg_len_1024": 5.186509269822098, "val/perplexity_len_1024": 178.84316879680094, "val/loss_avg_len_512": 5.215404340788373, "val/perplexity_len_512": 184.08623950862088}
25
- {"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 5687.226690375013, "val/train_update_time": 2798.1550497164135, "val/loss": 5.147608817131834, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.42907125299098, "val/val_tokens_per_second": 342965.0718226966, "val/loss_avg_len_2048": 5.147608817131834, "val/perplexity_len_2048": 172.01966765914332, "val/loss_avg_len_1024": 5.162196564418997, "val/perplexity_len_1024": 174.547439521781, "val/loss_avg_len_512": 5.191514030500082, "val/perplexity_len_512": 179.74047959546354}
26
- {"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 5919.139419794024, "val/train_update_time": 2909.933540413331, "val/loss": 5.129140133706461, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.83275818795664, "val/val_tokens_per_second": 341809.7072901768, "val/loss_avg_len_2048": 5.129140133706461, "val/perplexity_len_2048": 168.87184838724093, "val/loss_avg_len_1024": 5.143809148180205, "val/perplexity_len_1024": 171.3672900632081, "val/loss_avg_len_512": 5.173024764407822, "val/perplexity_len_512": 176.447743975283}
27
- {"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 6151.005747379037, "val/train_update_time": 3021.723491590412, "val/loss": 5.1072978981880475, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.4659342149971, "val/val_tokens_per_second": 340013.1353889487, "val/loss_avg_len_2048": 5.1072978981880475, "val/perplexity_len_2048": 165.22330097222542, "val/loss_avg_len_1024": 5.122188884299453, "val/perplexity_len_1024": 167.70204854687952, "val/loss_avg_len_512": 5.15172206159496, "val/perplexity_len_512": 172.72868378337427}
28
- {"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 6383.517213010986, "val/train_update_time": 3133.5136128164013, "val/loss": 5.090300521257392, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.48005827597808, "val/val_tokens_per_second": 339973.27513051854, "val/loss_avg_len_2048": 5.090300521257392, "val/perplexity_len_2048": 162.43867099271574, "val/loss_avg_len_1024": 5.105379784501816, "val/perplexity_len_1024": 164.90668764459724, "val/loss_avg_len_512": 5.135113706233085, "val/perplexity_len_512": 169.88363560477018}
29
- {"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 6616.055601358006, "val/train_update_time": 3245.322524867661, "val/loss": 5.074609349017905, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.29630511597497, "val/val_tokens_per_second": 340492.58587378374, "val/loss_avg_len_2048": 5.074609349017905, "val/perplexity_len_2048": 159.90971089042583, "val/loss_avg_len_1024": 5.089726012247521, "val/perplexity_len_1024": 162.34537531484884, "val/loss_avg_len_512": 5.119521015206934, "val/perplexity_len_512": 167.2552377162275}
30
- {"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 6848.3877059380175, "val/train_update_time": 3357.13471879164, "val/loss": 5.057983288034002, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.72075532499002, "val/val_tokens_per_second": 339295.4251299403, "val/loss_avg_len_2048": 5.057983288034002, "val/perplexity_len_2048": 157.27302189838034, "val/loss_avg_len_1024": 5.073276809682278, "val/perplexity_len_1024": 159.69676682020017, "val/loss_avg_len_512": 5.1034254083821775, "val/perplexity_len_512": 164.5847126848665}
31
- {"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 7081.627107631008, "val/train_update_time": 3468.931253584684, "val/loss": 5.044277466451659, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.07575890101725, "val/val_tokens_per_second": 341117.97730768286, "val/loss_avg_len_2048": 5.044277466451659, "val/perplexity_len_2048": 155.13217049725282, "val/loss_avg_len_1024": 5.059701495861052, "val/perplexity_len_1024": 157.54348192231643, "val/loss_avg_len_512": 5.089907858864603, "val/perplexity_len_512": 162.37489995654545}
32
- {"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 7313.765329002985, "val/train_update_time": 3580.7367782525835, "val/loss": 5.0319846531080845, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.13104758301051, "val/val_tokens_per_second": 340960.9823946358, "val/loss_avg_len_2048": 5.0319846531080845, "val/perplexity_len_2048": 153.23683306388372, "val/loss_avg_len_1024": 5.047668003283535, "val/perplexity_len_1024": 155.6590445232944, "val/loss_avg_len_512": 5.078129063933541, "val/perplexity_len_512": 160.4735391603105}
33
- {"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 7545.990838972037, "val/train_update_time": 3692.5671064984635, "val/loss": 5.020685842652529, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.5224018379813, "val/val_tokens_per_second": 339853.8311164979, "val/loss_avg_len_2048": 5.020685842652529, "val/perplexity_len_2048": 151.5151837428477, "val/loss_avg_len_1024": 5.03638942723212, "val/perplexity_len_1024": 153.91329543926756, "val/loss_avg_len_512": 5.066985254941543, "val/perplexity_len_512": 158.69517994303604}
34
- {"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 7778.552717441984, "val/train_update_time": 3804.3707672305172, "val/loss": 5.009975375909938, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.43973159999587, "val/val_tokens_per_second": 340087.10793242423, "val/loss_avg_len_2048": 5.009975375909938, "val/perplexity_len_2048": 149.90104492676969, "val/loss_avg_len_1024": 5.025792396103987, "val/perplexity_len_1024": 152.29088301658857, "val/loss_avg_len_512": 5.056605560239509, "val/perplexity_len_512": 157.0564916788882}
35
- {"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 8010.996608729009, "val/train_update_time": 3916.130362140422, "val/loss": 5.000331946871419, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.7698862789548, "val/val_tokens_per_second": 339157.3947945137, "val/loss_avg_len_2048": 5.000331946871419, "val/perplexity_len_2048": 148.4624325640513, "val/loss_avg_len_1024": 5.016125007033599, "val/perplexity_len_1024": 150.82572135347417, "val/loss_avg_len_512": 5.046963756926056, "val/perplexity_len_512": 155.5494607996784}
36
- {"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 8244.260649731033, "val/train_update_time": 4027.906490651425, "val/loss": 4.992123881059772, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 121.04618034599116, "val/val_tokens_per_second": 338383.2507801765, "val/loss_avg_len_2048": 4.992123881059772, "val/perplexity_len_2048": 147.2488306229025, "val/loss_avg_len_1024": 5.008033923187299, "val/perplexity_len_1024": 149.61030145864262, "val/loss_avg_len_512": 5.038996019480436, "val/perplexity_len_512": 154.31500796500313}
37
- {"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 8477.364581940987, "val/train_update_time": 4139.6967294025235, "val/loss": 4.9849577407377925, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.92127310601063, "val/val_tokens_per_second": 338732.78826704645, "val/loss_avg_len_2048": 4.9849577407377925, "val/perplexity_len_2048": 146.1973967014911, "val/loss_avg_len_1024": 5.000919279863674, "val/perplexity_len_1024": 148.54965506062427, "val/loss_avg_len_512": 5.031968218916911, "val/perplexity_len_512": 153.23431476116755}
38
- {"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 8710.28124093404, "val/train_update_time": 4251.451832082472, "val/loss": 4.978439289365397, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.10382062901044, "val/val_tokens_per_second": 341038.27659672574, "val/loss_avg_len_2048": 4.978439289365397, "val/perplexity_len_2048": 145.24751532149992, "val/loss_avg_len_1024": 4.994448879486923, "val/perplexity_len_1024": 147.5915822165599, "val/loss_avg_len_512": 5.025531688477309, "val/perplexity_len_512": 152.25118479695178}
39
- {"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 8942.332405220019, "val/train_update_time": 4363.189962119563, "val/loss": 4.972828367076373, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.59670256101526, "val/val_tokens_per_second": 339644.44408649154, "val/loss_avg_len_2048": 4.972828367076373, "val/perplexity_len_2048": 144.43482490386887, "val/loss_avg_len_1024": 4.988867791219574, "val/perplexity_len_1024": 146.7701549299752, "val/loss_avg_len_512": 5.020086908638769, "val/perplexity_len_512": 151.4244633161895}
40
- {"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 9174.941196507018, "val/train_update_time": 4474.963137161569, "val/loss": 4.968042541342369, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.18285846401704, "val/val_tokens_per_second": 340813.99397122423, "val/loss_avg_len_2048": 4.968042541342369, "val/perplexity_len_2048": 143.74523644325927, "val/loss_avg_len_1024": 4.984183913346676, "val/perplexity_len_1024": 146.08430891227124, "val/loss_avg_len_512": 5.015469340964639, "val/perplexity_len_512": 150.7268624584094}
41
- {"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 9407.59549859399, "val/train_update_time": 4586.732141316519, "val/loss": 4.963817747580077, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.84763080696575, "val/val_tokens_per_second": 341767.29005158885, "val/loss_avg_len_2048": 4.963817747580077, "val/perplexity_len_2048": 143.1392235062003, "val/loss_avg_len_1024": 4.98003164209972, "val/perplexity_len_1024": 145.4789848413279, "val/loss_avg_len_512": 5.01136137723279, "val/perplexity_len_512": 150.10895201980566}
42
- {"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 9639.468442777987, "val/train_update_time": 4698.514196849486, "val/loss": 4.960097752921781, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.45295924000675, "val/val_tokens_per_second": 342896.4862871461, "val/loss_avg_len_2048": 4.960097752921781, "val/perplexity_len_2048": 142.6077355384751, "val/loss_avg_len_1024": 4.976341741883511, "val/perplexity_len_1024": 144.94317106134073, "val/loss_avg_len_512": 5.007765855770442, "val/perplexity_len_512": 149.57020118662368}
43
- {"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 9870.947561467998, "val/train_update_time": 4810.2732713416335, "val/loss": 4.957366871821497, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 120.02587283303728, "val/val_tokens_per_second": 341259.75536106, "val/loss_avg_len_2048": 4.957366871821497, "val/perplexity_len_2048": 142.21882204868862, "val/loss_avg_len_1024": 4.973634772009083, "val/perplexity_len_1024": 144.5513448339354, "val/loss_avg_len_512": 5.005063210593537, "val/perplexity_len_512": 149.16651176413532}
44
- {"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 10103.00234679901, "val/train_update_time": 4922.056656240777, "val/loss": 4.955103045992324, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.5372970669996, "val/val_tokens_per_second": 342654.560584905, "val/loss_avg_len_2048": 4.955103045992324, "val/perplexity_len_2048": 141.89722756024008, "val/loss_avg_len_1024": 4.971366092304205, "val/perplexity_len_1024": 144.22377584665176, "val/loss_avg_len_512": 5.0028089391355755, "val/perplexity_len_512": 148.8306286822433}
45
- {"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 10334.590985039016, "val/train_update_time": 5033.849530185806, "val/loss": 4.953492992541195, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.82422458700603, "val/val_tokens_per_second": 341834.05017787847, "val/loss_avg_len_2048": 4.953492992541195, "val/perplexity_len_2048": 141.66894925874792, "val/loss_avg_len_1024": 4.969803081032343, "val/perplexity_len_1024": 143.99852853723146, "val/loss_avg_len_512": 5.00130308859892, "val/perplexity_len_512": 148.60668065861563}
46
- {"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 10566.927748315036, "val/train_update_time": 5145.645859024662, "val/loss": 4.952207647149178, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.30899883102393, "val/val_tokens_per_second": 343310.23142697907, "val/loss_avg_len_2048": 4.952207647149178, "val/perplexity_len_2048": 141.4869727040432, "val/loss_avg_len_1024": 4.968534963625809, "val/perplexity_len_1024": 143.81603723133458, "val/loss_avg_len_512": 5.000053355490358, "val/perplexity_len_512": 148.42107797071182}
47
- {"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 10798.264622143994, "val/train_update_time": 5257.438326367526, "val/loss": 4.951416557332166, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.87131762300851, "val/val_tokens_per_second": 341699.7561403129, "val/loss_avg_len_2048": 4.951416557332166, "val/perplexity_len_2048": 141.3750880619327, "val/loss_avg_len_1024": 4.967739155666985, "val/perplexity_len_1024": 143.7016328123107, "val/loss_avg_len_512": 4.999264901770174, "val/perplexity_len_512": 148.30410094117912}
48
- {"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 11030.159157333022, "val/train_update_time": 5369.213506219559, "val/loss": 4.950958369477766, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.70726062700851, "val/val_tokens_per_second": 342168.0505046872, "val/loss_avg_len_2048": 4.950958369477766, "val/perplexity_len_2048": 141.3103265512697, "val/loss_avg_len_1024": 4.967282502107741, "val/perplexity_len_1024": 143.6360259311658, "val/loss_avg_len_512": 4.998807998296409, "val/perplexity_len_512": 148.23635575996838}
49
- {"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 11261.909377036034, "val/train_update_time": 5481.002296196413, "val/loss": 4.950755261772388, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.94873680203455, "val/val_tokens_per_second": 341479.210969942, "val/loss_avg_len_2048": 4.950755261772388, "val/perplexity_len_2048": 141.28162824961944, "val/loss_avg_len_1024": 4.967067979426496, "val/perplexity_len_1024": 143.60521605058682, "val/loss_avg_len_512": 4.998585561082139, "val/perplexity_len_512": 148.20338614491018}
 
1
+ {"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 371.2564815880032, "val/train_update_time": 370.74722005974036, "val/loss": 8.074574615192414, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 268.5667409999296, "val/val_tokens_per_second": 152513.30022286988, "val/loss_avg_len_2048": 8.074574615192414, "val/perplexity_len_2048": 3211.7608437878293, "val/loss_avg_len_1024": 8.077264636135101, "val/perplexity_len_1024": 3220.41217863958, "val/loss_avg_len_512": 8.07784064731598, "val/perplexity_len_512": 3222.2677064125833}
2
+ {"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 989.879293700913, "val/train_update_time": 720.4508380297339, "val/loss": 7.447038386821747, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 285.6286050810013, "val/val_tokens_per_second": 143403.00401069483, "val/loss_avg_len_2048": 7.447038386821747, "val/perplexity_len_2048": 1714.7771111702812, "val/loss_avg_len_1024": 7.450830672264099, "val/perplexity_len_1024": 1721.2923815200618, "val/loss_avg_len_512": 7.45331780166626, "val/perplexity_len_512": 1725.5787866243627}
3
+ {"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 1606.152420823928, "val/train_update_time": 1050.7343851425685, "val/loss": 7.138498369634151, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 304.81948690803256, "val/val_tokens_per_second": 134374.61107057793, "val/loss_avg_len_2048": 7.138498369634151, "val/perplexity_len_2048": 1259.5356114067852, "val/loss_avg_len_1024": 7.143596195149422, "val/perplexity_len_1024": 1265.9728983258412, "val/loss_avg_len_512": 7.148876025867462, "val/perplexity_len_512": 1272.6746975366846}
4
+ {"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 2218.4671919340035, "val/train_update_time": 1357.900893958984, "val/loss": 6.91851605821252, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 122.97149007802363, "val/val_tokens_per_second": 333085.3352595099, "val/loss_avg_len_2048": 6.91851605821252, "val/perplexity_len_2048": 1010.8188846480316, "val/loss_avg_len_1024": 6.924789595830441, "val/perplexity_len_1024": 1017.1802281458952, "val/loss_avg_len_512": 6.9318418387413026, "val/perplexity_len_512": 1024.378984051259}
5
+ {"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 2481.5876440349966, "val/train_update_time": 1497.7818741976516, "val/loss": 6.731239570814371, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.63907289097551, "val/val_tokens_per_second": 345248.82066164305, "val/loss_avg_len_2048": 6.731239570814371, "val/perplexity_len_2048": 838.1856126915511, "val/loss_avg_len_1024": 6.738856630378962, "val/perplexity_len_1024": 844.5944998806121, "val/loss_avg_len_512": 6.748001024723053, "val/perplexity_len_512": 852.3532254060005}
6
+ {"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 2740.80689570494, "val/train_update_time": 1637.629011878511, "val/loss": 6.573862080945075, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.79705565003678, "val/val_tokens_per_second": 344789.6900800615, "val/loss_avg_len_2048": 6.573862080945075, "val/perplexity_len_2048": 716.1302622607554, "val/loss_avg_len_1024": 6.582782233029604, "val/perplexity_len_1024": 722.5468289411266, "val/loss_avg_len_512": 6.593902683746815, "val/perplexity_len_512": 730.626718081672}
7
+ {"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 2999.716264599003, "val/train_update_time": 1777.492755148909, "val/loss": 6.449963170717657, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.52784067601897, "val/val_tokens_per_second": 345572.8187266909, "val/loss_avg_len_2048": 6.449963170717657, "val/perplexity_len_2048": 632.6789912699647, "val/loss_avg_len_1024": 6.4598856550842525, "val/perplexity_len_1024": 638.9879873645075, "val/loss_avg_len_512": 6.472617136281729, "val/perplexity_len_512": 647.1752583647682}
8
+ {"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 3258.342541063903, "val/train_update_time": 1917.3395090608392, "val/loss": 6.350446031192691, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.16394796001259, "val/val_tokens_per_second": 343728.1216441805, "val/loss_avg_len_2048": 6.350446031192691, "val/perplexity_len_2048": 572.7481155748915, "val/loss_avg_len_1024": 6.361272199109942, "val/perplexity_len_1024": 578.982469035299, "val/loss_avg_len_512": 6.375198934513331, "val/perplexity_len_512": 587.1022142184726}
9
+ {"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 3517.6349648769246, "val/train_update_time": 2057.1937958986964, "val/loss": 6.245061218327843, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.16180873999838, "val/val_tokens_per_second": 343734.2923299484, "val/loss_avg_len_2048": 6.245061218327843, "val/perplexity_len_2048": 515.460779607975, "val/loss_avg_len_1024": 6.256822197538241, "val/perplexity_len_1024": 521.5588927204132, "val/loss_avg_len_512": 6.2725997325658795, "val/perplexity_len_512": 529.8530651619924}
10
+ {"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 3776.9122951189056, "val/train_update_time": 2197.046115837642, "val/loss": 6.165311972237379, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.31773394800257, "val/val_tokens_per_second": 343285.09807142284, "val/loss_avg_len_2048": 6.165311972237379, "val/perplexity_len_2048": 475.9496028389593, "val/loss_avg_len_1024": 6.1779335115781056, "val/perplexity_len_1024": 481.99488963534884, "val/loss_avg_len_512": 6.19536230119653, "val/perplexity_len_512": 490.4691103574319}
11
+ {"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 4036.799753597006, "val/train_update_time": 2336.9093162972713, "val/loss": 6.09605717837629, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.27474389097188, "val/val_tokens_per_second": 343408.8279195235, "val/loss_avg_len_2048": 6.09605717837629, "val/perplexity_len_2048": 444.10329349255574, "val/loss_avg_len_1024": 6.1096146122057, "val/perplexity_len_1024": 450.1651935621965, "val/loss_avg_len_512": 6.128595473396592, "val/perplexity_len_512": 458.7913233213098}
12
+ {"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 4296.169194732909, "val/train_update_time": 2476.7631880298723, "val/loss": 6.035459959003074, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.14929955196567, "val/val_tokens_per_second": 343770.3801366935, "val/loss_avg_len_2048": 6.035459959003074, "val/perplexity_len_2048": 417.99102416430844, "val/loss_avg_len_1024": 6.0497336251823235, "val/perplexity_len_1024": 424.00007205896117, "val/loss_avg_len_512": 6.069915484577604, "val/perplexity_len_512": 432.6441149291548}
13
+ {"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 4555.462784307892, "val/train_update_time": 2616.633614034741, "val/loss": 5.978946462819375, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 121.9430886899354, "val/val_tokens_per_second": 335894.39500051504, "val/loss_avg_len_2048": 5.978946462819375, "val/perplexity_len_2048": 395.02397640536174, "val/loss_avg_len_1024": 5.994191604967321, "val/perplexity_len_1024": 401.0923118656187, "val/loss_avg_len_512": 6.015902790261898, "val/perplexity_len_512": 409.8957217942098}
14
+ {"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 4817.594658994931, "val/train_update_time": 2756.5458030648297, "val/loss": 5.9318083871576475, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.64603293093387, "val/val_tokens_per_second": 342343.15168346884, "val/loss_avg_len_2048": 5.9318083871576475, "val/perplexity_len_2048": 376.8353622500592, "val/loss_avg_len_1024": 5.947726577504678, "val/perplexity_len_1024": 382.88189654128325, "val/loss_avg_len_512": 5.970375245298259, "val/perplexity_len_512": 391.6526089792945}
15
+ {"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 5077.3983478549635, "val/train_update_time": 2896.4371623727493, "val/loss": 5.8920511382135325, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.62661699997261, "val/val_tokens_per_second": 345285.0720678434, "val/loss_avg_len_2048": 5.8920511382135325, "val/perplexity_len_2048": 362.1473373280599, "val/loss_avg_len_1024": 5.908480526011203, "val/perplexity_len_1024": 368.1463414158124, "val/loss_avg_len_512": 5.932198676339816, "val/perplexity_len_512": 376.98246571998305}
16
+ {"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 5336.696560251992, "val/train_update_time": 3036.3103124498157, "val/loss": 5.855954391597583, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.72400588693563, "val/val_tokens_per_second": 345001.83593036287, "val/loss_avg_len_2048": 5.855954391597583, "val/perplexity_len_2048": 349.30811775115666, "val/loss_avg_len_1024": 5.873065555681661, "val/perplexity_len_1024": 355.3366164948888, "val/loss_avg_len_512": 5.897885296325106, "val/perplexity_len_512": 364.26633743411116}
17
+ {"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 5595.524489760981, "val/train_update_time": 3176.177008557832, "val/loss": 5.820256409952801, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.2323824360501, "val/val_tokens_per_second": 343530.83586137986, "val/loss_avg_len_2048": 5.820256409952801, "val/perplexity_len_2048": 337.0584676966778, "val/loss_avg_len_1024": 5.837889617230394, "val/perplexity_len_1024": 343.0545996773094, "val/loss_avg_len_512": 5.863783256886014, "val/perplexity_len_512": 352.05353668420764}
18
+ {"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 5854.861399552901, "val/train_update_time": 3316.0491548541468, "val/loss": 5.786057632419194, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.49763245903887, "val/val_tokens_per_second": 345660.91448416625, "val/loss_avg_len_2048": 5.786057632419194, "val/perplexity_len_2048": 325.72635675500464, "val/loss_avg_len_1024": 5.804369036847027, "val/perplexity_len_1024": 331.74580791230153, "val/loss_avg_len_512": 5.831280016766, "val/perplexity_len_512": 340.794622835555}
19
+ {"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 6113.463035934954, "val/train_update_time": 3455.9149778021965, "val/loss": 5.759885396619764, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.62306807097048, "val/val_tokens_per_second": 345295.40220199176, "val/loss_avg_len_2048": 5.759885396619764, "val/perplexity_len_2048": 317.3119618105809, "val/loss_avg_len_1024": 5.778556486419566, "val/perplexity_len_1024": 323.29217678044284, "val/loss_avg_len_512": 5.806458696108102, "val/perplexity_len_512": 332.43976843011467}
20
+ {"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 6372.185268521891, "val/train_update_time": 3595.7855843111174, "val/loss": 5.7300529724414755, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.57840889098588, "val/val_tokens_per_second": 345425.4478794386, "val/loss_avg_len_2048": 5.7300529724414755, "val/perplexity_len_2048": 307.98558269355703, "val/loss_avg_len_1024": 5.7494624780176675, "val/perplexity_len_1024": 314.0218212805507, "val/loss_avg_len_512": 5.778485274661239, "val/perplexity_len_512": 323.2691553957875}
21
+ {"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 6631.451806944911, "val/train_update_time": 3735.644923887099, "val/loss": 5.7072238631833025, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.90358249703422, "val/val_tokens_per_second": 344480.78972743865, "val/loss_avg_len_2048": 5.7072238631833025, "val/perplexity_len_2048": 301.03419507097897, "val/loss_avg_len_1024": 5.727206208104669, "val/perplexity_len_1024": 307.1100671050411, "val/loss_avg_len_512": 5.757021965130791, "val/perplexity_len_512": 316.40466036500106}
22
+ {"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 6890.435664905934, "val/train_update_time": 3875.508329823031, "val/loss": 5.683767916635869, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.32042055693455, "val/val_tokens_per_second": 346178.6207926, "val/loss_avg_len_2048": 5.683767916635869, "val/perplexity_len_2048": 294.0553210975227, "val/loss_avg_len_1024": 5.704273862084427, "val/perplexity_len_1024": 300.1474524536417, "val/loss_avg_len_512": 5.735127307290689, "val/perplexity_len_512": 309.5523765252151}
23
+ {"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 7148.850932181929, "val/train_update_time": 4015.3776673960965, "val/loss": 5.66337805530862, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.15536998491734, "val/val_tokens_per_second": 346662.1957616365, "val/loss_avg_len_2048": 5.66337805530862, "val/perplexity_len_2048": 288.12028676236497, "val/loss_avg_len_1024": 5.684271093870421, "val/perplexity_len_1024": 294.2033202725829, "val/loss_avg_len_512": 5.7158169909281895, "val/perplexity_len_512": 303.6321667294071}
24
+ {"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 7407.169766063918, "val/train_update_time": 4155.28995017719, "val/loss": 5.643267921690471, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.50208243296947, "val/val_tokens_per_second": 345647.93427296064, "val/loss_avg_len_2048": 5.643267921690471, "val/perplexity_len_2048": 282.3840211506362, "val/loss_avg_len_1024": 5.664606115816004, "val/perplexity_len_1024": 288.47433325867826, "val/loss_avg_len_512": 5.697027105129534, "val/perplexity_len_512": 297.9802190065426}
25
+ {"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 7665.789839876001, "val/train_update_time": 4295.186003304552, "val/loss": 5.622699441415019, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.53706486790907, "val/val_tokens_per_second": 345545.927306733, "val/loss_avg_len_2048": 5.622699441415019, "val/perplexity_len_2048": 276.63513656770886, "val/loss_avg_len_1024": 5.6445637688343355, "val/perplexity_len_1024": 282.7501848727848, "val/loss_avg_len_512": 5.677907623612683, "val/perplexity_len_512": 292.3371102776823}
26
+ {"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 7924.9846870569745, "val/train_update_time": 4435.057907043141, "val/loss": 5.604714242819535, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.8660318760667, "val/val_tokens_per_second": 344589.61364762415, "val/loss_avg_len_2048": 5.604714242819535, "val/perplexity_len_2048": 271.7042728930096, "val/loss_avg_len_1024": 5.627045304850158, "val/perplexity_len_1024": 277.83997121976165, "val/loss_avg_len_512": 5.661085981074465, "val/perplexity_len_512": 287.4606499338183}
27
+ {"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 8183.984758203966, "val/train_update_time": 4574.958235046361, "val/loss": 5.590083757191172, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.37259386607911, "val/val_tokens_per_second": 346026.04084472556, "val/loss_avg_len_2048": 5.590083757191172, "val/perplexity_len_2048": 267.7580454362757, "val/loss_avg_len_1024": 5.612942422863492, "val/perplexity_len_1024": 273.9491274375101, "val/loss_avg_len_512": 5.647777539000603, "val/perplexity_len_512": 283.66034071391516}
28
+ {"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 8442.485554668936, "val/train_update_time": 4714.8530240497785, "val/loss": 5.574322423379664, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.52899952605367, "val/val_tokens_per_second": 345569.4400845478, "val/loss_avg_len_2048": 5.574322423379664, "val/perplexity_len_2048": 263.5709056357824, "val/loss_avg_len_1024": 5.597507244674646, "val/perplexity_len_1024": 269.75314005297037, "val/loss_avg_len_512": 5.633031649286917, "val/perplexity_len_512": 279.50820531578756}
29
+ {"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 8701.162048408994, "val/train_update_time": 4854.753052946762, "val/loss": 5.560131643292463, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.59739387896843, "val/val_tokens_per_second": 345370.1524149906, "val/loss_avg_len_2048": 5.560131643292463, "val/perplexity_len_2048": 259.85704250803946, "val/loss_avg_len_1024": 5.583764273371181, "val/perplexity_len_1024": 266.0712881317231, "val/loss_avg_len_512": 5.620282325731626, "val/perplexity_len_512": 275.96728490305725}
30
+ {"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 8959.925166409928, "val/train_update_time": 4994.663271273952, "val/loss": 5.546720407564999, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.35096273303498, "val/val_tokens_per_second": 343189.5232518534, "val/loss_avg_len_2048": 5.546720407564999, "val/perplexity_len_2048": 256.39530344075405, "val/loss_avg_len_1024": 5.570826970474656, "val/perplexity_len_1024": 262.6512142533728, "val/loss_avg_len_512": 5.607915740726924, "val/perplexity_len_512": 272.57552746979076}
31
+ {"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 9220.005188893992, "val/train_update_time": 5134.568920494756, "val/loss": 5.53471503632843, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.7409812399419, "val/val_tokens_per_second": 344952.5140543637, "val/loss_avg_len_2048": 5.53471503632843, "val/perplexity_len_2048": 253.33558591144265, "val/loss_avg_len_1024": 5.559162426136544, "val/perplexity_len_1024": 259.60530661743275, "val/loss_avg_len_512": 5.596828284146019, "val/perplexity_len_512": 269.5700504807311}
32
+ {"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 9478.914532593917, "val/train_update_time": 5274.477632154711, "val/loss": 5.523900157300186, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.79370030597784, "val/val_tokens_per_second": 344799.4287112786, "val/loss_avg_len_2048": 5.523900157300186, "val/perplexity_len_2048": 250.61055420070002, "val/loss_avg_len_1024": 5.5487858315113705, "val/perplexity_len_1024": 256.92541570504636, "val/loss_avg_len_512": 5.5871342391912835, "val/perplexity_len_512": 266.96945181888873}
33
+ {"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 9737.887706990936, "val/train_update_time": 5414.394234810607, "val/loss": 5.513743215830738, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.89046923699789, "val/val_tokens_per_second": 344518.784919166, "val/loss_avg_len_2048": 5.513743215830738, "val/perplexity_len_2048": 248.07800074089812, "val/loss_avg_len_1024": 5.538833538455277, "val/perplexity_len_1024": 254.38110056013733, "val/loss_avg_len_512": 5.577749973074766, "val/perplexity_len_512": 264.47585801169674}
34
+ {"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 9996.965703879949, "val/train_update_time": 5554.316489073681, "val/loss": 5.503565442725533, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.39389265398495, "val/val_tokens_per_second": 345963.7915589842, "val/loss_avg_len_2048": 5.503565442725533, "val/perplexity_len_2048": 245.56592449284855, "val/loss_avg_len_1024": 5.5291210433123865, "val/perplexity_len_1024": 251.92238481690993, "val/loss_avg_len_512": 5.568761392938602, "val/perplexity_len_512": 262.1092477350233}
35
+ {"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 10255.549305356923, "val/train_update_time": 5694.239998900797, "val/loss": 5.495382670491731, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.52405338105746, "val/val_tokens_per_second": 345583.86109452986, "val/loss_avg_len_2048": 5.495382670491731, "val/perplexity_len_2048": 243.56471335816636, "val/loss_avg_len_1024": 5.521260392003751, "val/perplexity_len_1024": 249.9498735613677, "val/loss_avg_len_512": 5.5613877866359545, "val/perplexity_len_512": 260.1836653018214}
36
+ {"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 10514.781046306947, "val/train_update_time": 5834.130611701286, "val/loss": 5.48703303927928, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.9761282489635, "val/val_tokens_per_second": 344270.7423987537, "val/loss_avg_len_2048": 5.48703303927928, "val/perplexity_len_2048": 241.53950446478578, "val/loss_avg_len_1024": 5.513113558725175, "val/perplexity_len_1024": 247.9218458321809, "val/loss_avg_len_512": 5.553624488961476, "val/perplexity_len_512": 258.17160228548784}
37
+ {"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 10773.90538297291, "val/train_update_time": 5974.028263681685, "val/loss": 5.480334459244821, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.07169013505336, "val/val_tokens_per_second": 343994.44530889246, "val/loss_avg_len_2048": 5.480334459244821, "val/perplexity_len_2048": 239.9269397393566, "val/loss_avg_len_1024": 5.506674018040026, "val/perplexity_len_1024": 246.33047237547763, "val/loss_avg_len_512": 5.547568697430181, "val/perplexity_len_512": 256.61289325471205}
38
+ {"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 11033.109079251997, "val/train_update_time": 6113.921902889037, "val/loss": 5.474184480784086, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.07278911396861, "val/val_tokens_per_second": 343991.2704219584, "val/loss_avg_len_2048": 5.474184480784086, "val/perplexity_len_2048": 238.4559222272448, "val/loss_avg_len_1024": 5.5007993760133225, "val/perplexity_len_1024": 244.88761132568, "val/loss_avg_len_512": 5.542205744937714, "val/perplexity_len_512": 255.24037416617912}
39
+ {"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 11292.312823130982, "val/train_update_time": 6253.814012841205, "val/loss": 5.468393029721306, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.41189982998185, "val/val_tokens_per_second": 345911.1800318311, "val/loss_avg_len_2048": 5.468393029721306, "val/perplexity_len_2048": 237.07890772793579, "val/loss_avg_len_1024": 5.495152589550743, "val/perplexity_len_1024": 243.50868020605293, "val/loss_avg_len_512": 5.5369287875737765, "val/perplexity_len_512": 253.8970290987337}
40
+ {"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 11550.859713669983, "val/train_update_time": 6393.710524166352, "val/loss": 5.463633170905521, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.55901667405851, "val/val_tokens_per_second": 345481.94771728665, "val/loss_avg_len_2048": 5.463633170905521, "val/perplexity_len_2048": 235.95312700310126, "val/loss_avg_len_1024": 5.49054511497196, "val/perplexity_len_1024": 242.38930088842343, "val/loss_avg_len_512": 5.532540454801778, "val/perplexity_len_512": 252.78528557959896}
41
+ {"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 11810.107688057935, "val/train_update_time": 6533.614025922609, "val/loss": 5.45942693974147, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.30903941800352, "val/val_tokens_per_second": 346211.9226180359, "val/loss_avg_len_2048": 5.45942693974147, "val/perplexity_len_2048": 234.9627379698483, "val/loss_avg_len_1024": 5.486521510563948, "val/perplexity_len_1024": 241.41598166778738, "val/loss_avg_len_512": 5.528677167296701, "val/perplexity_len_512": 251.8105873264233}
42
+ {"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 12068.55913664191, "val/train_update_time": 6673.514490786707, "val/loss": 5.455808992347491, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.4850783480797, "val/val_tokens_per_second": 345697.539057785, "val/loss_avg_len_2048": 5.455808992347491, "val/perplexity_len_2048": 234.11419106894314, "val/loss_avg_len_1024": 5.483065307796094, "val/perplexity_len_1024": 240.58303932171194, "val/loss_avg_len_512": 5.525581023516459, "val/perplexity_len_512": 251.03215123954533}
43
+ {"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 12327.182909888914, "val/train_update_time": 6813.415315568796, "val/loss": 5.453029937104025, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.3482397699263, "val/val_tokens_per_second": 346097.2472394002, "val/loss_avg_len_2048": 5.453029937104025, "val/perplexity_len_2048": 233.46447801107593, "val/loss_avg_len_1024": 5.480366042145022, "val/perplexity_len_1024": 239.93451744761228, "val/loss_avg_len_512": 5.522991253441363, "val/perplexity_len_512": 250.38287678504003}
44
+ {"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 12585.657294680947, "val/train_update_time": 6953.310590816545, "val/loss": 5.450687895292766, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.0683747169096, "val/val_tokens_per_second": 344004.02371649264, "val/loss_avg_len_2048": 5.450687895292766, "val/perplexity_len_2048": 232.918334237547, "val/loss_avg_len_1024": 5.478113070084702, "val/perplexity_len_1024": 239.39456016622754, "val/loss_avg_len_512": 5.520896892670367, "val/perplexity_len_512": 249.85903346010718}
45
+ {"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 12844.891541385907, "val/train_update_time": 7093.223455801839, "val/loss": 5.4488968286233375, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.42289469996467, "val/val_tokens_per_second": 342982.8099788316, "val/loss_avg_len_2048": 5.4488968286233375, "val/perplexity_len_2048": 232.5015353411223, "val/loss_avg_len_1024": 5.4763672707772235, "val/perplexity_len_1024": 238.97698991186206, "val/loss_avg_len_512": 5.519267403276684, "val/perplexity_len_512": 249.45222235237557}
46
+ {"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 13105.045270575909, "val/train_update_time": 7233.1264379567, "val/loss": 5.447661420757801, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.2398596740095, "val/val_tokens_per_second": 343509.293888644, "val/loss_avg_len_2048": 5.447661420757801, "val/perplexity_len_2048": 232.2144784682815, "val/loss_avg_len_1024": 5.475185846520326, "val/perplexity_len_1024": 238.69482341113377, "val/loss_avg_len_512": 5.51811999600099, "val/perplexity_len_512": 249.16616320206958}
47
+ {"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 13364.448509541922, "val/train_update_time": 7373.0204275009455, "val/loss": 5.446748636367329, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.28055162995588, "val/val_tokens_per_second": 343392.1074331567, "val/loss_avg_len_2048": 5.446748636367329, "val/perplexity_len_2048": 232.00261342535612, "val/loss_avg_len_1024": 5.474293929352122, "val/perplexity_len_1024": 238.48202231475486, "val/loss_avg_len_512": 5.517301595225371, "val/perplexity_len_512": 248.96232884132354}
48
+ {"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 13623.899586633896, "val/train_update_time": 7512.917904903879, "val/loss": 5.446271972442274, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.65777256898582, "val/val_tokens_per_second": 345194.4117372208, "val/loss_avg_len_2048": 5.446271972442274, "val/perplexity_len_2048": 231.89205250131306, "val/loss_avg_len_1024": 5.473838324508781, "val/perplexity_len_1024": 238.3733934981252, "val/loss_avg_len_512": 5.516900151270465, "val/perplexity_len_512": 248.86240447776873}
49
+ {"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 13882.689546601963, "val/train_update_time": 7652.812096997048, "val/loss": 5.446056043829591, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.43243380391505, "val/val_tokens_per_second": 342955.41583995847, "val/loss_avg_len_2048": 5.446056043829591, "val/perplexity_len_2048": 231.84198577773796, "val/loss_avg_len_1024": 5.473623460931872, "val/perplexity_len_1024": 238.32218124017967, "val/loss_avg_len_512": 5.516683546149684, "val/perplexity_len_512": 248.80850544420304}
metrics/npz/train_eval/step-000000104857600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:46d5eeaf619fff91f6769b36b947bb6151be71389b99d3965e5fe8d305cfc80b
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebe2af4f21c7e0d61033e499ab1f560171c17e9433e1840361ec83d995a4e693
3
  size 20540
metrics/npz/train_eval/step-000000209715200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1bcdafb18d49f2053eb816fee2f41057351c153f6f62ebab5f0f3a1254268e41
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b27be315fb8b50e4a1294c44349c97e37ecee415e1d751c416e137a7f019b91d
3
  size 20540
metrics/npz/train_eval/step-000000314572800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:502b14d4469fdbc008fd51c6f231291e0db14eea1bf21b86d7668ac994cb2817
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43650bcb683fadeca0bc902154b85c1478ddf4ba8c964c9a7c8e60133a2d4543
3
  size 20540
metrics/npz/train_eval/step-000000419430400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f828ddbf4a9e273f5a0db9f7474f8a1186b20a34caf64667b1a93a3b351056fa
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24675daf5b5a30e28d02aeab0a297fc6340ddc472810d7c76fa77831f9b13628
3
  size 20540
metrics/npz/train_eval/step-000000524288000.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:792cc1830dd5ddadf8d7a660f31057469491412804fdec8574e15f508939bcca
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e06512f1cc6a99bd126fb9509e9ec8b4e71c245058ce2c5afa7079be6c88168c
3
  size 20540
metrics/npz/train_eval/step-000000629145600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:654395b896450d4a2d4c113bdd12d8e5a699ba194a000090d198f0f001e4b09d
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e569fd791b0d40a459be09259ecf03f658ef5a7548626494d29f910625cd9e0c
3
  size 20540
metrics/npz/train_eval/step-000000734003200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:32907bf38b7fe93b2344baf03c17af64b78055744f7eebaf8bf3b0a70ca43d77
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db454625a77c3c13a01875a67b4346c05c4c22769385e9abff513a392ed03fda
3
  size 20540
metrics/npz/train_eval/step-000000838860800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ff22b7cfe3b44873d2335db04e807858d7611e560c1ff8e94ef9900106c4e3d
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfe8ed28d202e36594e637d3e893ec7bc8a9804f862d4cc10d386991c35f5427
3
  size 20540
metrics/npz/train_eval/step-000000943718400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7d021f561209e579353a2c6d14f1a606e34e108583cdf3e1b68be7ce42013ebd
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29de7c0b6c6d0a60fb84f581d2a82c5d6a2e3bce3ba2d427c983ff440bb33873
3
  size 20540
metrics/npz/train_eval/step-000001048576000.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:55190da526839675bbce5fa18ed9efbbebe5a1609c5a4c3e7f1b444981a9d93f
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca5679212a6611bb7be7ed25d531e2f54a00a24248d69d75ef32be1a83597fc7
3
  size 20540
metrics/npz/train_eval/step-000001153433600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0d567b1c640f784b7b006b46ea9d4dbddc410cc0285762ebff006d5eb5c780c
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce50ba7c9f1f4a503794de94d3311305a62ee74cd23ccedfd16d877a5c6fcffd
3
  size 20540
metrics/npz/train_eval/step-000001258291200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:960ac0d80bbfa4c9a71b46e1ae1549317fe62a6dd66f542043386b7df251b9c0
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee2dce443eb1c38a07e162379ba0638092c32d778cf325c7bfd8cc45fae6012a
3
  size 20540
metrics/npz/train_eval/step-000001363148800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9486583dd0d5a62ab506e30cdf5110f952832d8dfeb0db666298e1d2079eb163
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bb7178988be17dd5cf70feb006d231677c1d56581bfe5daaabb72d7097c03c4
3
  size 20540
metrics/npz/train_eval/step-000001468006400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a422f4d160c9929f6e68890b9d4b80561160c9df6d2e2238434fcad6c32dcdd
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5e1750313e400d5c97e86d9af029477b578f97cec1a54d15525003e9ff97d42
3
  size 20540
metrics/npz/train_eval/step-000001572864000.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7720a0b84777d788204d20cefd26ba14427b8df1243e10d67e2dbfe56e797470
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48dc7d3a4adb3884b80a0e04cf96d710d871d6690b4ff0475e98776bea507cc6
3
  size 20540
metrics/npz/train_eval/step-000001677721600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5fff9dfa360488578ac18416a711f4db207adf4d48dfcf535f477aca3d8611d5
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c44931dbae1f81becc0e142591963ae32aec2c1342603c2ac058471f14a7330d
3
  size 20540
metrics/npz/train_eval/step-000001782579200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:57380069d4cd91076bdeca446fed3d4f796745db9fafaed9258def379abb69af
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a87dabfb6d488cadb9789cc73d79fd5d94feedf35803f6dfcf616f7b12cb6c6c
3
  size 20540
metrics/npz/train_eval/step-000001887436800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c34352c17bd384bc40993e43e8d44a9ddfd209b4bd167880f05fb0bb709ebf88
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52e60241e961a954d8c28410846a794df492a465f85a8df23a2b8cf2b56efff3
3
  size 20540
metrics/npz/train_eval/step-000001992294400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2949c354c795ba5a99983e2acad7fb79efdc3dcf50ff5783af2faa60a1cdcb04
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e2f00d1745798d0cf989730009f629f34788c0e91ab471c5a5a711ba8c71337
3
  size 20540
metrics/npz/val/step-000000041943040.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c33eb8d1dc48547f714dba1ec5b673736a84a48004cb857195490f33b40c9abf
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:417b767d4fe37036ea1b72a1b7b75bacc99cdee5a8335eb67a2a9dc0ba618302
3
  size 21142
metrics/npz/val/step-000000083886080.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b0ce5f774bb97ff2d2e4856c9ebdb054d9bdf8b1ad052f46999d2b501a16dc33
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bcc085766a45046f7ef30e48b25da2421887464722c53f3ea61024b334b6512
3
  size 21142
metrics/npz/val/step-000000125829120.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:308d5fda9f42be72c08eb8fccb6954800855f5374a72b528cbfc348248f80507
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eddf97a53501003d621a22d3fab9049de2ef47e828e4936b93287e65f15fa940
3
  size 21142
metrics/npz/val/step-000000167772160.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:93fe6335aa4aef7c61d06213c7a2a60ecef7dc728def4a395836f6932fefe16e
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19b40daa279004faa8e35d4ef549e09b15b591cd201f1a64d31b0ab4ef236683
3
  size 21142
metrics/npz/val/step-000000209715200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:857e15df6aea28826696d304204e473913a5c7061f325544325008f0375f654f
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e3b720c531041ae771dd09fbb8f987216e302645e85bf5d7fa11d8109595c6a
3
  size 21142
metrics/npz/val/step-000000251658240.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f68781f2420a289ce64f25cf1c4582230b5f340ac015164a674759d456a2d07b
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46738cb86d77f2dfeb34c5240c99d0994b2c21c3e628e8deaa6808052bc6c392
3
  size 21142
metrics/npz/val/step-000000293601280.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:38c4752df0469938251e6fb8cb69bcac6fad05876fae1e28d61e2a674e563220
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:017553bdfcefb2ab3bc3d6874c2f7b842f3dab8362c02bb94f4e4e2d0aacee1f
3
  size 21142
metrics/npz/val/step-000000335544320.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:56033fd9457d915002c9e53bdf90c7dea4f2227d0800a42883eee1c5d0846c5a
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7674fbb1508dda2c7a9fb9eb273312bdc7ff3f13e49dae287c9700442308fdb
3
  size 21142
metrics/npz/val/step-000000377487360.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d98a0d33f36a94ede2c69a2a5443fa54323dc2a5a5ff513c7dea712c868e4570
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da5eaa606f68afeac16287d3060ddb68697071ca07d06999c4651fabb7f73302
3
  size 21142
metrics/npz/val/step-000000419430400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:14b0f2f70cc3c2f96dcb3cdaa74e1e8bdfa9978c03f30cf9695214e2fdc560c0
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96a7fd68774ed286c16d89b1c1daf90290d97796f98c646687923a9a9f7af869
3
  size 21142
metrics/npz/val/step-000000461373440.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:699afee4a320a16acf289e945afd023859070c7978788c12f5c4d9f39752b31c
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:016c8f904334cf8409d2b094468f842db1a95d80d73508dfd685cf4acca35e68
3
  size 21142
metrics/npz/val/step-000000503316480.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf802899cd7020d3e12ed513a722405d30325561ba4fe7df8447a5637df6a37a
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1f7312f50a260dc0c7896b1b4b470e42caebfc963de6e73d582cb8265711181
3
  size 21142
metrics/npz/val/step-000000545259520.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:75d53b49acf830a4190f08ade7c15b8a2b38cbc19031bcc9f166b4cb0ce05923
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40b8ab66312b12f7ec52e87910df1bbbcd2e1e96fb0484c66a80964a67145eff
3
  size 21142
metrics/npz/val/step-000000587202560.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c1392685419d457e0b9be7f38b289ede3ccdd951302da53f259ffd776ff725e9
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2257f97e00c17e884cf0dbdccd5fb80af23c64d3f14ef51ab59848646e2c7e9c
3
  size 21142
metrics/npz/val/step-000000629145600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9fa5fb1b11a581e9d9f0cff4d44ac30da559bf60362d974a116446319bc862d1
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a32f40fcc0a41995711d030ac71f9f82170d19a8b66436eb8e5361659bcd0f71
3
  size 21142