add remote code + model files
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- checkpoints/step-000000209715200.pt +2 -2
- checkpoints/step-000000419430400.pt +2 -2
- checkpoints/step-000000629145600.pt +2 -2
- checkpoints/step-000000838860800.pt +2 -2
- checkpoints/step-000001048576000.pt +2 -2
- checkpoints/step-000001258291200.pt +2 -2
- checkpoints/step-000001468006400.pt +2 -2
- checkpoints/step-000001677721600.pt +2 -2
- checkpoints/step-000001887436800.pt +2 -2
- logs/2025-10-28_04-46-01.log +258 -0
- metrics/jsonlines/checkpoint.jsonl +9 -9
- metrics/jsonlines/norm.jsonl +0 -0
- metrics/jsonlines/throughput.jsonl +0 -0
- metrics/jsonlines/train.jsonl +98 -98
- metrics/jsonlines/train_eval.jsonl +19 -19
- metrics/jsonlines/val.jsonl +49 -49
- metrics/npz/train_eval/step-000000104857600.npz +1 -1
- metrics/npz/train_eval/step-000000209715200.npz +1 -1
- metrics/npz/train_eval/step-000000314572800.npz +1 -1
- metrics/npz/train_eval/step-000000419430400.npz +1 -1
- metrics/npz/train_eval/step-000000524288000.npz +1 -1
- metrics/npz/train_eval/step-000000629145600.npz +1 -1
- metrics/npz/train_eval/step-000000734003200.npz +1 -1
- metrics/npz/train_eval/step-000000838860800.npz +1 -1
- metrics/npz/train_eval/step-000000943718400.npz +1 -1
- metrics/npz/train_eval/step-000001048576000.npz +1 -1
- metrics/npz/train_eval/step-000001153433600.npz +1 -1
- metrics/npz/train_eval/step-000001258291200.npz +1 -1
- metrics/npz/train_eval/step-000001363148800.npz +1 -1
- metrics/npz/train_eval/step-000001468006400.npz +1 -1
- metrics/npz/train_eval/step-000001572864000.npz +1 -1
- metrics/npz/train_eval/step-000001677721600.npz +1 -1
- metrics/npz/train_eval/step-000001782579200.npz +1 -1
- metrics/npz/train_eval/step-000001887436800.npz +1 -1
- metrics/npz/train_eval/step-000001992294400.npz +1 -1
- metrics/npz/val/step-000000041943040.npz +1 -1
- metrics/npz/val/step-000000083886080.npz +1 -1
- metrics/npz/val/step-000000125829120.npz +1 -1
- metrics/npz/val/step-000000167772160.npz +1 -1
- metrics/npz/val/step-000000209715200.npz +1 -1
- metrics/npz/val/step-000000251658240.npz +1 -1
- metrics/npz/val/step-000000293601280.npz +1 -1
- metrics/npz/val/step-000000335544320.npz +1 -1
- metrics/npz/val/step-000000377487360.npz +1 -1
- metrics/npz/val/step-000000419430400.npz +1 -1
- metrics/npz/val/step-000000461373440.npz +1 -1
- metrics/npz/val/step-000000503316480.npz +1 -1
- metrics/npz/val/step-000000545259520.npz +1 -1
- metrics/npz/val/step-000000587202560.npz +1 -1
- metrics/npz/val/step-000000629145600.npz +1 -1
checkpoints/step-000000209715200.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ac7cd25dae4d940b58ffaafa680de2ae0c45fff95f04f5b6bc6b4476a7a2034a
|
| 3 |
+
size 339650634
|
checkpoints/step-000000419430400.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ec37c0723da099092a621ff3ba55aee2e0188811a9c519bdfccad886a91e2a28
|
| 3 |
+
size 339650634
|
checkpoints/step-000000629145600.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ebefceb30583dd247c6dde4b265be6dc08351a8e715ded1816b0bfb461c90b81
|
| 3 |
+
size 339650634
|
checkpoints/step-000000838860800.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dd061a114516812d0e8647aec34a4d751592228aede425a7ef1bd4448e3b3988
|
| 3 |
+
size 339650634
|
checkpoints/step-000001048576000.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ed20ede365ff3caab637f683462cb33d745b91f8e5805463520dd0d4fe343a57
|
| 3 |
+
size 339650634
|
checkpoints/step-000001258291200.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:59e0dafe1c8d7ec1fc8e05118a3dc7eb9c3bf7e740f72ef2ea38923e36287366
|
| 3 |
+
size 339650634
|
checkpoints/step-000001468006400.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c33a92e6f8f02bddbab0c570e12617e75c846b54170e6f743ffeeb2fe8e559aa
|
| 3 |
+
size 339650634
|
checkpoints/step-000001677721600.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:479347aabec04b87dc513c6cdb01c20472deb8549e1ce8fe69ade13b18d9cc67
|
| 3 |
+
size 339650634
|
checkpoints/step-000001887436800.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e33ee7a09b18f77b4d3220a55cc5b0c9db4d86fb4c43b1ccdecd688ae2c8184f
|
| 3 |
+
size 339650634
|
logs/2025-10-28_04-46-01.log
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2025-10-28 04:46:01][train:372][INFO] All outputs will be saved to `/workspace/forgetting-transformer/alibi_3_4_256`
|
| 2 |
+
[2025-10-28 04:46:01][train:375][INFO] Configuration:
|
| 3 |
+
[2025-10-28 04:46:01][train:380][INFO] Configuration saved to /workspace/forgetting-transformer/alibi_3_4_256/config.yaml.
|
| 4 |
+
[2025-10-28 04:46:01][train:387][INFO] creating datamodule
|
| 5 |
+
[2025-10-28 04:46:01][train:419][INFO] creating model
|
| 6 |
+
[2025-10-28 04:46:02][train:440][INFO] creating optimizer
|
| 7 |
+
[2025-10-28 04:46:02][checkpoint:39][INFO] Not resuming. Deleting existing checkpoints...
|
| 8 |
+
[2025-10-28 04:46:02][logger:256][INFO] Setting up wandb logger...
|
| 9 |
+
[2025-10-28 04:46:02][logger:272][INFO] Not resuming. Creating a new wandb run.
|
| 10 |
+
[2025-10-28 04:46:03][logger:288][INFO] wandb initialized. Run id: 0jt2hdhs
|
| 11 |
+
[2025-10-28 04:46:03][logger:186][INFO] Setting up jsonlines logger...
|
| 12 |
+
[2025-10-28 04:46:03][logger:113][INFO] Setting up npz logger...
|
| 13 |
+
[2025-10-28 04:46:03][logger:171][INFO] [step: 0] [train_data_info/vocab_size: 50277] [train_data_info/global_tokens_per_batch: 2097152] [train_data_info/local_tokens_per_batch: 2097152] [train_data_info/batch_len: 2048] [train_data_info/seq_len: 2048] [train_data_info/total_tokens: 2055208960] [train_data_info/global_batch_size: 1024] [train_data_info/local_batch_size: 1024]
|
| 14 |
+
[2025-10-28 04:46:03][logger:171][INFO] [step: 0] [val_data_info/vocab_size: 50277] [val_data_info/global_tokens_per_batch: 2048] [val_data_info/local_tokens_per_batch: 2048] [val_data_info/batch_len: 2048] [val_data_info/seq_len: 2048] [val_data_info/total_tokens: 2147483648] [val_data_info/global_batch_size: 1] [val_data_info/local_batch_size: 1]
|
| 15 |
+
[2025-10-28 04:46:03][logger:171][INFO] [step: 0] [model_info/total_params: 28299520] [model_info/trainable_params: 28299520] [model_info/embedding_params: 12870912] [model_info/flops_per_token: 0] [model_info/non_embedding_params: 15428608]
|
| 16 |
+
[2025-10-28 04:49:38][utils:57][INFO] [P: 1.00%] [S: 20971520/2097152000] [T: 0:03:32] [ETA: 5:50:43] [loss: 9.776] [tokens/s: 100709.527] [batches/s: 0.048] [MFU: 0.000] [TFLOPS: 0.000]
|
| 17 |
+
[2025-10-28 04:52:17][utils:57][INFO] [P: 2.00%] [S: 41943040/2097152000] [T: 0:06:11] [ETA: 5:03:11] [loss: 8.175] [tokens/s: 115124.074] [batches/s: 0.055] [MFU: 0.000] [TFLOPS: 0.000]
|
| 18 |
+
[2025-10-28 04:52:17][train:194][INFO] Running validation...
|
| 19 |
+
[2025-10-28 04:56:45][logger:171][INFO] [step: 41943040] [val/train_token_count: 41943040] [val/train_batch_count: 20] [val/train_flop_count: 0] [val/train_total_time: 371.256] [val/train_update_time: 370.747] [val/loss: 8.075] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 268.567] [val/val_tokens_per_second: 152513.300] [val/loss_avg_len_2048: 8.075] [val/perplexity_len_2048: 3211.761] [val/loss_avg_len_1024: 8.077] [val/perplexity_len_1024: 3220.412] [val/loss_avg_len_512: 8.078] [val/perplexity_len_512: 3222.268]
|
| 20 |
+
[2025-10-28 05:00:04][utils:57][INFO] [P: 3.00%] [S: 62914560/2097152000] [T: 0:13:58] [ETA: 7:31:48] [loss: 7.702] [tokens/s: 74781.993] [batches/s: 0.036] [MFU: 0.000] [TFLOPS: 0.000]
|
| 21 |
+
[2025-10-28 05:02:36][utils:57][INFO] [P: 4.00%] [S: 83886080/2097152000] [T: 0:16:29] [ETA: 6:35:57] [loss: 7.456] [tokens/s: 84778.614] [batches/s: 0.040] [MFU: 0.000] [TFLOPS: 0.000]
|
| 22 |
+
[2025-10-28 05:02:36][train:194][INFO] Running validation...
|
| 23 |
+
[2025-10-28 05:07:21][logger:171][INFO] [step: 83886080] [val/train_token_count: 83886080] [val/train_batch_count: 40] [val/train_flop_count: 0] [val/train_total_time: 989.879] [val/train_update_time: 720.451] [val/loss: 7.447] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 285.629] [val/val_tokens_per_second: 143403.004] [val/loss_avg_len_2048: 7.447] [val/perplexity_len_2048: 1714.777] [val/loss_avg_len_1024: 7.451] [val/perplexity_len_1024: 1721.292] [val/loss_avg_len_512: 7.453] [val/perplexity_len_512: 1725.579]
|
| 24 |
+
[2025-10-28 05:10:20][utils:57][INFO] [P: 5.00%] [S: 104857600/2097152000] [T: 0:24:14] [ETA: 7:40:35] [loss: 7.305] [tokens/s: 71892.276] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000]
|
| 25 |
+
[2025-10-28 05:10:20][logger:171][INFO] [step: 104857600] [train_eval/train_token_count: 104857600] [train_eval/train_batch_count: 50] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1454.511] [train_eval/train_update_time: 899.264] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 8.403] [train_eval/perplexity_len_2048: 4459.633] [train_eval/loss_avg_len_1024: 8.407] [train_eval/perplexity_len_1024: 4479.883] [train_eval/loss_avg_len_512: 8.407] [train_eval/perplexity_len_512: 4480.109]
|
| 26 |
+
[2025-10-28 05:12:52][utils:57][INFO] [P: 6.00%] [S: 125829120/2097152000] [T: 0:26:46] [ETA: 6:59:23] [loss: 7.140] [tokens/s: 78261.419] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000]
|
| 27 |
+
[2025-10-28 05:12:52][train:194][INFO] Running validation...
|
| 28 |
+
[2025-10-28 05:17:57][logger:171][INFO] [step: 125829120] [val/train_token_count: 125829120] [val/train_batch_count: 60] [val/train_flop_count: 0] [val/train_total_time: 1606.152] [val/train_update_time: 1050.734] [val/loss: 7.138] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 304.819] [val/val_tokens_per_second: 134374.611] [val/loss_avg_len_2048: 7.138] [val/perplexity_len_2048: 1259.536] [val/loss_avg_len_1024: 7.144] [val/perplexity_len_1024: 1265.973] [val/loss_avg_len_512: 7.149] [val/perplexity_len_512: 1272.675]
|
| 29 |
+
[2025-10-28 05:20:32][utils:57][INFO] [P: 7.00%] [S: 146800640/2097152000] [T: 0:34:26] [ETA: 7:37:39] [loss: 7.048] [tokens/s: 70873.110] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000]
|
| 30 |
+
[2025-10-28 05:23:04][utils:57][INFO] [P: 8.00%] [S: 167772160/2097152000] [T: 0:36:58] [ETA: 7:05:12] [loss: 6.937] [tokens/s: 75536.064] [batches/s: 0.036] [MFU: 0.000] [TFLOPS: 0.000]
|
| 31 |
+
[2025-10-28 05:23:04][train:194][INFO] Running validation...
|
| 32 |
+
[2025-10-28 05:25:07][logger:171][INFO] [step: 167772160] [val/train_token_count: 167772160] [val/train_batch_count: 80] [val/train_flop_count: 0] [val/train_total_time: 2218.467] [val/train_update_time: 1357.901] [val/loss: 6.919] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 122.971] [val/val_tokens_per_second: 333085.335] [val/loss_avg_len_2048: 6.919] [val/perplexity_len_2048: 1010.819] [val/loss_avg_len_1024: 6.925] [val/perplexity_len_1024: 1017.180] [val/loss_avg_len_512: 6.932] [val/perplexity_len_512: 1024.379]
|
| 33 |
+
[2025-10-28 05:26:17][utils:57][INFO] [P: 9.00%] [S: 188743680/2097152000] [T: 0:40:11] [ETA: 6:46:23] [loss: 6.817] [tokens/s: 78213.305] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000]
|
| 34 |
+
[2025-10-28 05:27:27][utils:57][INFO] [P: 10.00%] [S: 209715200/2097152000] [T: 0:41:21] [ETA: 6:12:14] [loss: 6.740] [tokens/s: 84519.766] [batches/s: 0.040] [MFU: 0.000] [TFLOPS: 0.000]
|
| 35 |
+
[2025-10-28 05:27:27][logger:171][INFO] [step: 209715200] [train_eval/train_token_count: 209715200] [train_eval/train_batch_count: 100] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2481.588] [train_eval/train_update_time: 1497.782] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.995] [train_eval/perplexity_len_2048: 1091.206] [train_eval/loss_avg_len_1024: 7.002] [train_eval/perplexity_len_1024: 1098.950] [train_eval/loss_avg_len_512: 7.007] [train_eval/perplexity_len_512: 1104.537]
|
| 36 |
+
[2025-10-28 05:27:27][train:194][INFO] Running validation...
|
| 37 |
+
[2025-10-28 05:29:26][logger:171][INFO] [step: 209715200] [val/train_token_count: 209715200] [val/train_batch_count: 100] [val/train_flop_count: 0] [val/train_total_time: 2481.588] [val/train_update_time: 1497.782] [val/loss: 6.731] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.639] [val/val_tokens_per_second: 345248.821] [val/loss_avg_len_2048: 6.731] [val/perplexity_len_2048: 838.186] [val/loss_avg_len_1024: 6.739] [val/perplexity_len_1024: 844.594] [val/loss_avg_len_512: 6.748] [val/perplexity_len_512: 852.353]
|
| 38 |
+
[2025-10-28 05:29:26][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000209715200.pt...
|
| 39 |
+
[2025-10-28 05:29:26][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000209715200.pt.
|
| 40 |
+
[2025-10-28 05:29:26][logger:171][INFO] [step: 209715200] [checkpoint/checkpoint_time: 0.458]
|
| 41 |
+
[2025-10-28 05:30:36][utils:57][INFO] [P: 11.00%] [S: 230686720/2097152000] [T: 0:44:30] [ETA: 6:00:08] [loss: 6.661] [tokens/s: 85187.454] [batches/s: 0.041] [MFU: 0.000] [TFLOPS: 0.000]
|
| 42 |
+
[2025-10-28 05:31:46][utils:57][INFO] [P: 12.00%] [S: 251658240/2097152000] [T: 0:45:40] [ETA: 5:34:59] [loss: 6.557] [tokens/s: 99809.612] [batches/s: 0.048] [MFU: 0.000] [TFLOPS: 0.000]
|
| 43 |
+
[2025-10-28 05:31:46][train:194][INFO] Running validation...
|
| 44 |
+
[2025-10-28 05:33:45][logger:171][INFO] [step: 251658240] [val/train_token_count: 251658240] [val/train_batch_count: 120] [val/train_flop_count: 0] [val/train_total_time: 2740.807] [val/train_update_time: 1637.629] [val/loss: 6.574] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.797] [val/val_tokens_per_second: 344789.690] [val/loss_avg_len_2048: 6.574] [val/perplexity_len_2048: 716.130] [val/loss_avg_len_1024: 6.583] [val/perplexity_len_1024: 722.547] [val/loss_avg_len_512: 6.594] [val/perplexity_len_512: 730.627]
|
| 45 |
+
[2025-10-28 05:34:55][utils:57][INFO] [P: 13.00%] [S: 272629760/2097152000] [T: 0:48:49] [ETA: 5:26:46] [loss: 6.521] [tokens/s: 100003.213] [batches/s: 0.048] [MFU: 0.000] [TFLOPS: 0.000]
|
| 46 |
+
[2025-10-28 05:36:05][utils:57][INFO] [P: 14.00%] [S: 293601280/2097152000] [T: 0:49:59] [ETA: 5:07:06] [loss: 6.479] [tokens/s: 121898.441] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
|
| 47 |
+
[2025-10-28 05:36:05][train:194][INFO] Running validation...
|
| 48 |
+
[2025-10-28 05:38:04][logger:171][INFO] [step: 293601280] [val/train_token_count: 293601280] [val/train_batch_count: 140] [val/train_flop_count: 0] [val/train_total_time: 2999.716] [val/train_update_time: 1777.493] [val/loss: 6.450] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.528] [val/val_tokens_per_second: 345572.819] [val/loss_avg_len_2048: 6.450] [val/perplexity_len_2048: 632.679] [val/loss_avg_len_1024: 6.460] [val/perplexity_len_1024: 638.988] [val/loss_avg_len_512: 6.473] [val/perplexity_len_512: 647.175]
|
| 49 |
+
[2025-10-28 05:39:14][utils:57][INFO] [P: 15.00%] [S: 314572800/2097152000] [T: 0:53:08] [ETA: 5:01:06] [loss: 6.371] [tokens/s: 120805.809] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
|
| 50 |
+
[2025-10-28 05:39:14][logger:171][INFO] [step: 314572800] [train_eval/train_token_count: 314572800] [train_eval/train_batch_count: 150] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3188.291] [train_eval/train_update_time: 1847.413] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.550] [train_eval/perplexity_len_2048: 699.420] [train_eval/loss_avg_len_1024: 6.559] [train_eval/perplexity_len_1024: 705.273] [train_eval/loss_avg_len_512: 6.569] [train_eval/perplexity_len_512: 712.794]
|
| 51 |
+
[2025-10-28 05:40:24][utils:57][INFO] [P: 16.00%] [S: 335544320/2097152000] [T: 0:54:18] [ETA: 4:45:06] [loss: 6.371] [tokens/s: 156363.365] [batches/s: 0.075] [MFU: 0.000] [TFLOPS: 0.000]
|
| 52 |
+
[2025-10-28 05:40:24][train:194][INFO] Running validation...
|
| 53 |
+
[2025-10-28 05:42:23][logger:171][INFO] [step: 335544320] [val/train_token_count: 335544320] [val/train_batch_count: 160] [val/train_flop_count: 0] [val/train_total_time: 3258.343] [val/train_update_time: 1917.340] [val/loss: 6.350] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.164] [val/val_tokens_per_second: 343728.122] [val/loss_avg_len_2048: 6.350] [val/perplexity_len_2048: 572.748] [val/loss_avg_len_1024: 6.361] [val/perplexity_len_1024: 578.982] [val/loss_avg_len_512: 6.375] [val/perplexity_len_512: 587.102]
|
| 54 |
+
[2025-10-28 05:43:33][utils:57][INFO] [P: 17.00%] [S: 356515840/2097152000] [T: 0:57:27] [ETA: 4:40:32] [loss: 6.268] [tokens/s: 152047.726] [batches/s: 0.073] [MFU: 0.000] [TFLOPS: 0.000]
|
| 55 |
+
[2025-10-28 05:44:43][utils:57][INFO] [P: 18.00%] [S: 377487360/2097152000] [T: 0:58:37] [ETA: 4:27:04] [loss: 6.241] [tokens/s: 177579.011] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 56 |
+
[2025-10-28 05:44:43][train:194][INFO] Running validation...
|
| 57 |
+
[2025-10-28 05:46:42][logger:171][INFO] [step: 377487360] [val/train_token_count: 377487360] [val/train_batch_count: 180] [val/train_flop_count: 0] [val/train_total_time: 3517.635] [val/train_update_time: 2057.194] [val/loss: 6.245] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.162] [val/val_tokens_per_second: 343734.292] [val/loss_avg_len_2048: 6.245] [val/perplexity_len_2048: 515.461] [val/loss_avg_len_1024: 6.257] [val/perplexity_len_1024: 521.559] [val/loss_avg_len_512: 6.273] [val/perplexity_len_512: 529.853]
|
| 58 |
+
[2025-10-28 05:47:52][utils:57][INFO] [P: 19.00%] [S: 398458880/2097152000] [T: 1:01:46] [ETA: 4:23:22] [loss: 6.249] [tokens/s: 161153.155] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 59 |
+
[2025-10-28 05:49:03][utils:57][INFO] [P: 20.00%] [S: 419430400/2097152000] [T: 1:02:56] [ETA: 4:11:47] [loss: 6.173] [tokens/s: 177572.226] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 60 |
+
[2025-10-28 05:49:03][logger:171][INFO] [step: 419430400] [train_eval/train_token_count: 419430400] [train_eval/train_batch_count: 200] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3776.912] [train_eval/train_update_time: 2197.046] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.273] [train_eval/perplexity_len_2048: 530.052] [train_eval/loss_avg_len_1024: 6.284] [train_eval/perplexity_len_1024: 536.160] [train_eval/loss_avg_len_512: 6.299] [train_eval/perplexity_len_512: 543.777]
|
| 61 |
+
[2025-10-28 05:49:03][train:194][INFO] Running validation...
|
| 62 |
+
[2025-10-28 05:51:02][logger:171][INFO] [step: 419430400] [val/train_token_count: 419430400] [val/train_batch_count: 200] [val/train_flop_count: 0] [val/train_total_time: 3776.912] [val/train_update_time: 2197.046] [val/loss: 6.165] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.318] [val/val_tokens_per_second: 343285.098] [val/loss_avg_len_2048: 6.165] [val/perplexity_len_2048: 475.950] [val/loss_avg_len_1024: 6.178] [val/perplexity_len_1024: 481.995] [val/loss_avg_len_512: 6.195] [val/perplexity_len_512: 490.469]
|
| 63 |
+
[2025-10-28 05:51:02][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000419430400.pt...
|
| 64 |
+
[2025-10-28 05:51:02][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000419430400.pt.
|
| 65 |
+
[2025-10-28 05:51:02][logger:171][INFO] [step: 419430400] [checkpoint/checkpoint_time: 0.445]
|
| 66 |
+
[2025-10-28 05:52:12][utils:57][INFO] [P: 21.00%] [S: 440401920/2097152000] [T: 1:06:06] [ETA: 4:08:42] [loss: 6.121] [tokens/s: 161071.815] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 67 |
+
[2025-10-28 05:53:22][utils:57][INFO] [P: 22.00%] [S: 461373440/2097152000] [T: 1:07:16] [ETA: 3:58:32] [loss: 6.109] [tokens/s: 177424.503] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 68 |
+
[2025-10-28 05:53:22][train:194][INFO] Running validation...
|
| 69 |
+
[2025-10-28 05:55:22][logger:171][INFO] [step: 461373440] [val/train_token_count: 461373440] [val/train_batch_count: 220] [val/train_flop_count: 0] [val/train_total_time: 4036.800] [val/train_update_time: 2336.909] [val/loss: 6.096] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.275] [val/val_tokens_per_second: 343408.828] [val/loss_avg_len_2048: 6.096] [val/perplexity_len_2048: 444.103] [val/loss_avg_len_1024: 6.110] [val/perplexity_len_1024: 450.165] [val/loss_avg_len_512: 6.129] [val/perplexity_len_512: 458.791]
|
| 70 |
+
[2025-10-28 05:56:32][utils:57][INFO] [P: 23.00%] [S: 482344960/2097152000] [T: 1:10:26] [ETA: 3:55:48] [loss: 6.068] [tokens/s: 161014.445] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 71 |
+
[2025-10-28 05:57:42][utils:57][INFO] [P: 24.00%] [S: 503316480/2097152000] [T: 1:11:36] [ETA: 3:46:44] [loss: 6.020] [tokens/s: 177313.805] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 72 |
+
[2025-10-28 05:57:42][train:194][INFO] Running validation...
|
| 73 |
+
[2025-10-28 05:59:41][logger:171][INFO] [step: 503316480] [val/train_token_count: 503316480] [val/train_batch_count: 240] [val/train_flop_count: 0] [val/train_total_time: 4296.169] [val/train_update_time: 2476.763] [val/loss: 6.035] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.149] [val/val_tokens_per_second: 343770.380] [val/loss_avg_len_2048: 6.035] [val/perplexity_len_2048: 417.991] [val/loss_avg_len_1024: 6.050] [val/perplexity_len_1024: 424.000] [val/loss_avg_len_512: 6.070] [val/perplexity_len_512: 432.644]
|
| 74 |
+
[2025-10-28 06:00:51][utils:57][INFO] [P: 25.00%] [S: 524288000/2097152000] [T: 1:14:45] [ETA: 3:44:16] [loss: 6.010] [tokens/s: 160932.666] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 75 |
+
[2025-10-28 06:00:51][logger:171][INFO] [step: 524288000] [train_eval/train_token_count: 524288000] [train_eval/train_batch_count: 250] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4485.400] [train_eval/train_update_time: 2546.702] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.082] [train_eval/perplexity_len_2048: 438.072] [train_eval/loss_avg_len_1024: 6.093] [train_eval/perplexity_len_1024: 442.860] [train_eval/loss_avg_len_512: 6.110] [train_eval/perplexity_len_512: 450.263]
|
| 76 |
+
[2025-10-28 06:02:01][utils:57][INFO] [P: 26.00%] [S: 545259520/2097152000] [T: 1:15:55] [ETA: 3:36:05] [loss: 5.976] [tokens/s: 177309.095] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 77 |
+
[2025-10-28 06:02:01][train:194][INFO] Running validation...
|
| 78 |
+
[2025-10-28 06:04:03][logger:171][INFO] [step: 545259520] [val/train_token_count: 545259520] [val/train_batch_count: 260] [val/train_flop_count: 0] [val/train_total_time: 4555.463] [val/train_update_time: 2616.634] [val/loss: 5.979] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 121.943] [val/val_tokens_per_second: 335894.395] [val/loss_avg_len_2048: 5.979] [val/perplexity_len_2048: 395.024] [val/loss_avg_len_1024: 5.994] [val/perplexity_len_1024: 401.092] [val/loss_avg_len_512: 6.016] [val/perplexity_len_512: 409.896]
|
| 79 |
+
[2025-10-28 06:05:13][utils:57][INFO] [P: 27.00%] [S: 566231040/2097152000] [T: 1:19:07] [ETA: 3:33:55] [loss: 5.971] [tokens/s: 160581.929] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 80 |
+
[2025-10-28 06:06:23][utils:57][INFO] [P: 28.00%] [S: 587202560/2097152000] [T: 1:20:17] [ETA: 3:26:28] [loss: 5.917] [tokens/s: 176879.876] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
|
| 81 |
+
[2025-10-28 06:06:23][train:194][INFO] Running validation...
|
| 82 |
+
[2025-10-28 06:08:23][logger:171][INFO] [step: 587202560] [val/train_token_count: 587202560] [val/train_batch_count: 280] [val/train_flop_count: 0] [val/train_total_time: 4817.595] [val/train_update_time: 2756.546] [val/loss: 5.932] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.646] [val/val_tokens_per_second: 342343.152] [val/loss_avg_len_2048: 5.932] [val/perplexity_len_2048: 376.835] [val/loss_avg_len_1024: 5.948] [val/perplexity_len_1024: 382.882] [val/loss_avg_len_512: 5.970] [val/perplexity_len_512: 391.653]
|
| 83 |
+
[2025-10-28 06:09:33][utils:57][INFO] [P: 29.00%] [S: 608174080/2097152000] [T: 1:23:27] [ETA: 3:24:19] [loss: 5.883] [tokens/s: 160515.361] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 84 |
+
[2025-10-28 06:10:43][utils:57][INFO] [P: 30.00%] [S: 629145600/2097152000] [T: 1:24:37] [ETA: 3:17:27] [loss: 5.869] [tokens/s: 176891.777] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
|
| 85 |
+
[2025-10-28 06:10:43][logger:171][INFO] [step: 629145600] [train_eval/train_token_count: 629145600] [train_eval/train_batch_count: 300] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5077.398] [train_eval/train_update_time: 2896.437] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.951] [train_eval/perplexity_len_2048: 383.983] [train_eval/loss_avg_len_1024: 5.962] [train_eval/perplexity_len_1024: 388.223] [train_eval/loss_avg_len_512: 5.980] [train_eval/perplexity_len_512: 395.637]
|
| 86 |
+
[2025-10-28 06:10:43][train:194][INFO] Running validation...
|
| 87 |
+
[2025-10-28 06:12:42][logger:171][INFO] [step: 629145600] [val/train_token_count: 629145600] [val/train_batch_count: 300] [val/train_flop_count: 0] [val/train_total_time: 5077.398] [val/train_update_time: 2896.437] [val/loss: 5.892] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.627] [val/val_tokens_per_second: 345285.072] [val/loss_avg_len_2048: 5.892] [val/perplexity_len_2048: 362.147] [val/loss_avg_len_1024: 5.908] [val/perplexity_len_1024: 368.146] [val/loss_avg_len_512: 5.932] [val/perplexity_len_512: 376.982]
|
| 88 |
+
[2025-10-28 06:12:42][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000629145600.pt...
|
| 89 |
+
[2025-10-28 06:12:42][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000629145600.pt.
|
| 90 |
+
[2025-10-28 06:12:42][logger:171][INFO] [step: 629145600] [checkpoint/checkpoint_time: 0.544]
|
| 91 |
+
[2025-10-28 06:13:52][utils:57][INFO] [P: 31.00%] [S: 650117120/2097152000] [T: 1:27:46] [ETA: 3:15:22] [loss: 5.913] [tokens/s: 160585.573] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 92 |
+
[2025-10-28 06:15:02][utils:57][INFO] [P: 32.00%] [S: 671088640/2097152000] [T: 1:28:56] [ETA: 3:09:00] [loss: 5.830] [tokens/s: 176905.674] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
|
| 93 |
+
[2025-10-28 06:15:02][train:194][INFO] Running validation...
|
| 94 |
+
[2025-10-28 06:17:01][logger:171][INFO] [step: 671088640] [val/train_token_count: 671088640] [val/train_batch_count: 320] [val/train_flop_count: 0] [val/train_total_time: 5336.697] [val/train_update_time: 3036.310] [val/loss: 5.856] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.724] [val/val_tokens_per_second: 345001.836] [val/loss_avg_len_2048: 5.856] [val/perplexity_len_2048: 349.308] [val/loss_avg_len_1024: 5.873] [val/perplexity_len_1024: 355.337] [val/loss_avg_len_512: 5.898] [val/perplexity_len_512: 364.266]
|
| 95 |
+
[2025-10-28 06:18:11][utils:57][INFO] [P: 33.00%] [S: 692060160/2097152000] [T: 1:32:05] [ETA: 3:06:58] [loss: 5.871] [tokens/s: 160653.230] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 96 |
+
[2025-10-28 06:19:21][utils:57][INFO] [P: 34.00%] [S: 713031680/2097152000] [T: 1:33:15] [ETA: 3:01:01] [loss: 5.832] [tokens/s: 176970.345] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000]
|
| 97 |
+
[2025-10-28 06:19:21][train:194][INFO] Running validation...
|
| 98 |
+
[2025-10-28 06:21:20][logger:171][INFO] [step: 713031680] [val/train_token_count: 713031680] [val/train_batch_count: 340] [val/train_flop_count: 0] [val/train_total_time: 5595.524] [val/train_update_time: 3176.177] [val/loss: 5.820] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.232] [val/val_tokens_per_second: 343530.836] [val/loss_avg_len_2048: 5.820] [val/perplexity_len_2048: 337.058] [val/loss_avg_len_1024: 5.838] [val/perplexity_len_1024: 343.055] [val/loss_avg_len_512: 5.864] [val/perplexity_len_512: 352.054]
|
| 99 |
+
[2025-10-28 06:22:30][utils:57][INFO] [P: 35.00%] [S: 734003200/2097152000] [T: 1:36:24] [ETA: 2:59:03] [loss: 5.805] [tokens/s: 160645.682] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 100 |
+
[2025-10-28 06:22:30][logger:171][INFO] [step: 734003200] [train_eval/train_token_count: 734003200] [train_eval/train_batch_count: 350] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5784.817] [train_eval/train_update_time: 3246.117] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.849] [train_eval/perplexity_len_2048: 346.889] [train_eval/loss_avg_len_1024: 5.864] [train_eval/perplexity_len_1024: 352.065] [train_eval/loss_avg_len_512: 5.886] [train_eval/perplexity_len_512: 360.116]
|
| 101 |
+
[2025-10-28 06:23:40][utils:57][INFO] [P: 36.00%] [S: 754974720/2097152000] [T: 1:37:34] [ETA: 2:53:28] [loss: 5.737] [tokens/s: 177387.786] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 102 |
+
[2025-10-28 06:23:40][train:194][INFO] Running validation...
|
| 103 |
+
[2025-10-28 06:25:39][logger:171][INFO] [step: 754974720] [val/train_token_count: 754974720] [val/train_batch_count: 360] [val/train_flop_count: 0] [val/train_total_time: 5854.861] [val/train_update_time: 3316.049] [val/loss: 5.786] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.498] [val/val_tokens_per_second: 345660.914] [val/loss_avg_len_2048: 5.786] [val/perplexity_len_2048: 325.726] [val/loss_avg_len_1024: 5.804] [val/perplexity_len_1024: 331.746] [val/loss_avg_len_512: 5.831] [val/perplexity_len_512: 340.795]
|
| 104 |
+
[2025-10-28 06:26:49][utils:57][INFO] [P: 37.00%] [S: 775946240/2097152000] [T: 1:40:43] [ETA: 2:51:30] [loss: 5.770] [tokens/s: 161081.300] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 105 |
+
[2025-10-28 06:27:59][utils:57][INFO] [P: 38.00%] [S: 796917760/2097152000] [T: 1:41:53] [ETA: 2:46:14] [loss: 5.743] [tokens/s: 177572.913] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 106 |
+
[2025-10-28 06:27:59][train:194][INFO] Running validation...
|
| 107 |
+
[2025-10-28 06:29:58][logger:171][INFO] [step: 796917760] [val/train_token_count: 796917760] [val/train_batch_count: 380] [val/train_flop_count: 0] [val/train_total_time: 6113.463] [val/train_update_time: 3455.915] [val/loss: 5.760] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.623] [val/val_tokens_per_second: 345295.402] [val/loss_avg_len_2048: 5.760] [val/perplexity_len_2048: 317.312] [val/loss_avg_len_1024: 5.779] [val/perplexity_len_1024: 323.292] [val/loss_avg_len_512: 5.806] [val/perplexity_len_512: 332.440]
|
| 108 |
+
[2025-10-28 06:31:08][utils:57][INFO] [P: 39.00%] [S: 817889280/2097152000] [T: 1:45:02] [ETA: 2:44:17] [loss: 5.759] [tokens/s: 161218.799] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 109 |
+
[2025-10-28 06:32:18][utils:57][INFO] [P: 40.00%] [S: 838860800/2097152000] [T: 1:46:12] [ETA: 2:39:18] [loss: 5.662] [tokens/s: 177665.244] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 110 |
+
[2025-10-28 06:32:18][logger:171][INFO] [step: 838860800] [train_eval/train_token_count: 838860800] [train_eval/train_batch_count: 400] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6372.185] [train_eval/train_update_time: 3595.786] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.767] [train_eval/perplexity_len_2048: 319.631] [train_eval/loss_avg_len_1024: 5.781] [train_eval/perplexity_len_1024: 324.122] [train_eval/loss_avg_len_512: 5.805] [train_eval/perplexity_len_512: 332.034]
|
| 111 |
+
[2025-10-28 06:32:18][train:194][INFO] Running validation...
|
| 112 |
+
[2025-10-28 06:34:16][logger:171][INFO] [step: 838860800] [val/train_token_count: 838860800] [val/train_batch_count: 400] [val/train_flop_count: 0] [val/train_total_time: 6372.185] [val/train_update_time: 3595.786] [val/loss: 5.730] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.578] [val/val_tokens_per_second: 345425.448] [val/loss_avg_len_2048: 5.730] [val/perplexity_len_2048: 307.986] [val/loss_avg_len_1024: 5.749] [val/perplexity_len_1024: 314.022] [val/loss_avg_len_512: 5.778] [val/perplexity_len_512: 323.269]
|
| 113 |
+
[2025-10-28 06:34:16][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000838860800.pt...
|
| 114 |
+
[2025-10-28 06:34:17][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000000838860800.pt.
|
| 115 |
+
[2025-10-28 06:34:17][logger:171][INFO] [step: 838860800] [checkpoint/checkpoint_time: 0.604]
|
| 116 |
+
[2025-10-28 06:35:27][utils:57][INFO] [P: 41.00%] [S: 859832320/2097152000] [T: 1:49:21] [ETA: 2:37:22] [loss: 5.682] [tokens/s: 161223.493] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 117 |
+
[2025-10-28 06:36:37][utils:57][INFO] [P: 42.00%] [S: 880803840/2097152000] [T: 1:50:31] [ETA: 2:32:37] [loss: 5.698] [tokens/s: 177600.917] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 118 |
+
[2025-10-28 06:36:37][train:194][INFO] Running validation...
|
| 119 |
+
[2025-10-28 06:38:36][logger:171][INFO] [step: 880803840] [val/train_token_count: 880803840] [val/train_batch_count: 420] [val/train_flop_count: 0] [val/train_total_time: 6631.452] [val/train_update_time: 3735.645] [val/loss: 5.707] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.904] [val/val_tokens_per_second: 344480.790] [val/loss_avg_len_2048: 5.707] [val/perplexity_len_2048: 301.034] [val/loss_avg_len_1024: 5.727] [val/perplexity_len_1024: 307.110] [val/loss_avg_len_512: 5.757] [val/perplexity_len_512: 316.405]
|
| 120 |
+
[2025-10-28 06:39:46][utils:57][INFO] [P: 43.00%] [S: 901775360/2097152000] [T: 1:53:40] [ETA: 2:30:41] [loss: 5.693] [tokens/s: 161204.037] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 121 |
+
[2025-10-28 06:40:56][utils:57][INFO] [P: 44.00%] [S: 922746880/2097152000] [T: 1:54:50] [ETA: 2:26:09] [loss: 5.727] [tokens/s: 177655.013] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 122 |
+
[2025-10-28 06:40:56][train:194][INFO] Running validation...
|
| 123 |
+
[2025-10-28 06:42:54][logger:171][INFO] [step: 922746880] [val/train_token_count: 922746880] [val/train_batch_count: 440] [val/train_flop_count: 0] [val/train_total_time: 6890.436] [val/train_update_time: 3875.508] [val/loss: 5.684] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.320] [val/val_tokens_per_second: 346178.621] [val/loss_avg_len_2048: 5.684] [val/perplexity_len_2048: 294.055] [val/loss_avg_len_1024: 5.704] [val/perplexity_len_1024: 300.147] [val/loss_avg_len_512: 5.735] [val/perplexity_len_512: 309.552]
|
| 124 |
+
[2025-10-28 06:44:04][utils:57][INFO] [P: 45.00%] [S: 943718400/2097152000] [T: 1:57:58] [ETA: 2:24:11] [loss: 5.670] [tokens/s: 161322.240] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 125 |
+
[2025-10-28 06:44:04][logger:171][INFO] [step: 943718400] [train_eval/train_token_count: 943718400] [train_eval/train_batch_count: 450] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7078.807] [train_eval/train_update_time: 3945.447] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.700] [train_eval/perplexity_len_2048: 298.773] [train_eval/loss_avg_len_1024: 5.718] [train_eval/perplexity_len_1024: 304.258] [train_eval/loss_avg_len_512: 5.748] [train_eval/perplexity_len_512: 313.427]
|
| 126 |
+
[2025-10-28 06:45:14][utils:57][INFO] [P: 46.00%] [S: 964689920/2097152000] [T: 1:59:08] [ETA: 2:19:52] [loss: 5.644] [tokens/s: 177682.841] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 127 |
+
[2025-10-28 06:45:14][train:194][INFO] Running validation...
|
| 128 |
+
[2025-10-28 06:47:13][logger:171][INFO] [step: 964689920] [val/train_token_count: 964689920] [val/train_batch_count: 460] [val/train_flop_count: 0] [val/train_total_time: 7148.851] [val/train_update_time: 4015.378] [val/loss: 5.663] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.155] [val/val_tokens_per_second: 346662.196] [val/loss_avg_len_2048: 5.663] [val/perplexity_len_2048: 288.120] [val/loss_avg_len_1024: 5.684] [val/perplexity_len_1024: 294.203] [val/loss_avg_len_512: 5.716] [val/perplexity_len_512: 303.632]
|
| 129 |
+
[2025-10-28 06:48:23][utils:57][INFO] [P: 47.00%] [S: 985661440/2097152000] [T: 2:02:17] [ETA: 2:17:53] [loss: 5.657] [tokens/s: 161364.601] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 130 |
+
[2025-10-28 06:49:33][utils:57][INFO] [P: 48.00%] [S: 1006632960/2097152000] [T: 2:03:27] [ETA: 2:13:44] [loss: 5.642] [tokens/s: 177744.973] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 131 |
+
[2025-10-28 06:49:33][train:194][INFO] Running validation...
|
| 132 |
+
[2025-10-28 06:51:31][logger:171][INFO] [step: 1006632960] [val/train_token_count: 1006632960] [val/train_batch_count: 480] [val/train_flop_count: 0] [val/train_total_time: 7407.170] [val/train_update_time: 4155.290] [val/loss: 5.643] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.502] [val/val_tokens_per_second: 345647.934] [val/loss_avg_len_2048: 5.643] [val/perplexity_len_2048: 282.384] [val/loss_avg_len_1024: 5.665] [val/perplexity_len_1024: 288.474] [val/loss_avg_len_512: 5.697] [val/perplexity_len_512: 297.980]
|
| 133 |
+
[2025-10-28 06:52:41][utils:57][INFO] [P: 49.00%] [S: 1027604480/2097152000] [T: 2:06:35] [ETA: 2:11:45] [loss: 5.637] [tokens/s: 161371.491] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 134 |
+
[2025-10-28 06:53:51][utils:57][INFO] [P: 50.00%] [S: 1048576000/2097152000] [T: 2:07:45] [ETA: 2:07:45] [loss: 5.624] [tokens/s: 177846.101] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 135 |
+
[2025-10-28 06:53:51][logger:171][INFO] [step: 1048576000] [train_eval/train_token_count: 1048576000] [train_eval/train_batch_count: 500] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7665.790] [train_eval/train_update_time: 4295.186] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.647] [train_eval/perplexity_len_2048: 283.438] [train_eval/loss_avg_len_1024: 5.664] [train_eval/perplexity_len_1024: 288.382] [train_eval/loss_avg_len_512: 5.696] [train_eval/perplexity_len_512: 297.565]
|
| 136 |
+
[2025-10-28 06:53:51][train:194][INFO] Running validation...
|
| 137 |
+
[2025-10-28 06:55:50][logger:171][INFO] [step: 1048576000] [val/train_token_count: 1048576000] [val/train_batch_count: 500] [val/train_flop_count: 0] [val/train_total_time: 7665.790] [val/train_update_time: 4295.186] [val/loss: 5.623] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.537] [val/val_tokens_per_second: 345545.927] [val/loss_avg_len_2048: 5.623] [val/perplexity_len_2048: 276.635] [val/loss_avg_len_1024: 5.645] [val/perplexity_len_1024: 282.750] [val/loss_avg_len_512: 5.678] [val/perplexity_len_512: 292.337]
|
| 138 |
+
[2025-10-28 06:55:50][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001048576000.pt...
|
| 139 |
+
[2025-10-28 06:55:50][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001048576000.pt.
|
| 140 |
+
[2025-10-28 06:55:50][logger:171][INFO] [step: 1048576000] [checkpoint/checkpoint_time: 0.543]
|
| 141 |
+
[2025-10-28 06:57:01][utils:57][INFO] [P: 51.00%] [S: 1069547520/2097152000] [T: 2:10:54] [ETA: 2:05:46] [loss: 5.619] [tokens/s: 161381.450] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 142 |
+
[2025-10-28 06:58:11][utils:57][INFO] [P: 52.00%] [S: 1090519040/2097152000] [T: 2:12:04] [ETA: 2:01:55] [loss: 5.616] [tokens/s: 177814.118] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 143 |
+
[2025-10-28 06:58:11][train:194][INFO] Running validation...
|
| 144 |
+
[2025-10-28 07:00:09][logger:171][INFO] [step: 1090519040] [val/train_token_count: 1090519040] [val/train_batch_count: 520] [val/train_flop_count: 0] [val/train_total_time: 7924.985] [val/train_update_time: 4435.058] [val/loss: 5.605] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.866] [val/val_tokens_per_second: 344589.614] [val/loss_avg_len_2048: 5.605] [val/perplexity_len_2048: 271.704] [val/loss_avg_len_1024: 5.627] [val/perplexity_len_1024: 277.840] [val/loss_avg_len_512: 5.661] [val/perplexity_len_512: 287.461]
|
| 145 |
+
[2025-10-28 07:01:20][utils:57][INFO] [P: 53.00%] [S: 1111490560/2097152000] [T: 2:15:13] [ETA: 1:59:55] [loss: 5.587] [tokens/s: 161381.358] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 146 |
+
[2025-10-28 07:02:30][utils:57][INFO] [P: 54.00%] [S: 1132462080/2097152000] [T: 2:16:23] [ETA: 1:56:11] [loss: 5.596] [tokens/s: 177722.933] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 147 |
+
[2025-10-28 07:02:30][train:194][INFO] Running validation...
|
| 148 |
+
[2025-10-28 07:04:28][logger:171][INFO] [step: 1132462080] [val/train_token_count: 1132462080] [val/train_batch_count: 540] [val/train_flop_count: 0] [val/train_total_time: 8183.985] [val/train_update_time: 4574.958] [val/loss: 5.590] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.373] [val/val_tokens_per_second: 346026.041] [val/loss_avg_len_2048: 5.590] [val/perplexity_len_2048: 267.758] [val/loss_avg_len_1024: 5.613] [val/perplexity_len_1024: 273.949] [val/loss_avg_len_512: 5.648] [val/perplexity_len_512: 283.660]
|
| 149 |
+
[2025-10-28 07:05:38][utils:57][INFO] [P: 55.00%] [S: 1153433600/2097152000] [T: 2:19:32] [ETA: 1:54:10] [loss: 5.539] [tokens/s: 161368.454] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 150 |
+
[2025-10-28 07:05:38][logger:171][INFO] [step: 1153433600] [train_eval/train_token_count: 1153433600] [train_eval/train_batch_count: 550] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 8372.430] [train_eval/train_update_time: 4644.910] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.597] [train_eval/perplexity_len_2048: 269.687] [train_eval/loss_avg_len_1024: 5.614] [train_eval/perplexity_len_1024: 274.126] [train_eval/loss_avg_len_512: 5.644] [train_eval/perplexity_len_512: 282.581]
|
| 151 |
+
[2025-10-28 07:06:48][utils:57][INFO] [P: 56.00%] [S: 1174405120/2097152000] [T: 2:20:42] [ETA: 1:50:33] [loss: 5.579] [tokens/s: 177684.218] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 152 |
+
[2025-10-28 07:06:48][train:194][INFO] Running validation...
|
| 153 |
+
[2025-10-28 07:08:47][logger:171][INFO] [step: 1174405120] [val/train_token_count: 1174405120] [val/train_batch_count: 560] [val/train_flop_count: 0] [val/train_total_time: 8442.486] [val/train_update_time: 4714.853] [val/loss: 5.574] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.529] [val/val_tokens_per_second: 345569.440] [val/loss_avg_len_2048: 5.574] [val/perplexity_len_2048: 263.571] [val/loss_avg_len_1024: 5.598] [val/perplexity_len_1024: 269.753] [val/loss_avg_len_512: 5.633] [val/perplexity_len_512: 279.508]
|
| 154 |
+
[2025-10-28 07:09:57][utils:57][INFO] [P: 57.00%] [S: 1195376640/2097152000] [T: 2:23:51] [ETA: 1:48:31] [loss: 5.533] [tokens/s: 161319.510] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 155 |
+
[2025-10-28 07:11:07][utils:57][INFO] [P: 58.00%] [S: 1216348160/2097152000] [T: 2:25:01] [ETA: 1:45:00] [loss: 5.593] [tokens/s: 177683.161] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 156 |
+
[2025-10-28 07:11:07][train:194][INFO] Running validation...
|
| 157 |
+
[2025-10-28 07:13:05][logger:171][INFO] [step: 1216348160] [val/train_token_count: 1216348160] [val/train_batch_count: 580] [val/train_flop_count: 0] [val/train_total_time: 8701.162] [val/train_update_time: 4854.753] [val/loss: 5.560] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.597] [val/val_tokens_per_second: 345370.152] [val/loss_avg_len_2048: 5.560] [val/perplexity_len_2048: 259.857] [val/loss_avg_len_1024: 5.584] [val/perplexity_len_1024: 266.071] [val/loss_avg_len_512: 5.620] [val/perplexity_len_512: 275.967]
|
| 158 |
+
[2025-10-28 07:14:15][utils:57][INFO] [P: 59.00%] [S: 1237319680/2097152000] [T: 2:28:09] [ETA: 1:42:57] [loss: 5.593] [tokens/s: 161307.065] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 159 |
+
[2025-10-28 07:15:26][utils:57][INFO] [P: 60.00%] [S: 1258291200/2097152000] [T: 2:29:19] [ETA: 1:39:33] [loss: 5.584] [tokens/s: 177749.840] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 160 |
+
[2025-10-28 07:15:26][logger:171][INFO] [step: 1258291200] [train_eval/train_token_count: 1258291200] [train_eval/train_batch_count: 600] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 8959.925] [train_eval/train_update_time: 4994.663] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.558] [train_eval/perplexity_len_2048: 259.341] [train_eval/loss_avg_len_1024: 5.575] [train_eval/perplexity_len_1024: 263.874] [train_eval/loss_avg_len_512: 5.609] [train_eval/perplexity_len_512: 272.868]
|
| 161 |
+
[2025-10-28 07:15:26][train:194][INFO] Running validation...
|
| 162 |
+
[2025-10-28 07:17:25][logger:171][INFO] [step: 1258291200] [val/train_token_count: 1258291200] [val/train_batch_count: 600] [val/train_flop_count: 0] [val/train_total_time: 8959.925] [val/train_update_time: 4994.663] [val/loss: 5.547] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.351] [val/val_tokens_per_second: 343189.523] [val/loss_avg_len_2048: 5.547] [val/perplexity_len_2048: 256.395] [val/loss_avg_len_1024: 5.571] [val/perplexity_len_1024: 262.651] [val/loss_avg_len_512: 5.608] [val/perplexity_len_512: 272.576]
|
| 163 |
+
[2025-10-28 07:17:25][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001258291200.pt...
|
| 164 |
+
[2025-10-28 07:17:25][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001258291200.pt.
|
| 165 |
+
[2025-10-28 07:17:25][logger:171][INFO] [step: 1258291200] [checkpoint/checkpoint_time: 0.559]
|
| 166 |
+
[2025-10-28 07:18:36][utils:57][INFO] [P: 61.00%] [S: 1279262720/2097152000] [T: 2:32:29] [ETA: 1:37:29] [loss: 5.559] [tokens/s: 161196.762] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 167 |
+
[2025-10-28 07:19:46][utils:57][INFO] [P: 62.00%] [S: 1300234240/2097152000] [T: 2:33:40] [ETA: 1:34:10] [loss: 5.528] [tokens/s: 177582.347] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 168 |
+
[2025-10-28 07:19:46][train:194][INFO] Running validation...
|
| 169 |
+
[2025-10-28 07:21:44][logger:171][INFO] [step: 1300234240] [val/train_token_count: 1300234240] [val/train_batch_count: 620] [val/train_flop_count: 0] [val/train_total_time: 9220.005] [val/train_update_time: 5134.569] [val/loss: 5.535] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.741] [val/val_tokens_per_second: 344952.514] [val/loss_avg_len_2048: 5.535] [val/perplexity_len_2048: 253.336] [val/loss_avg_len_1024: 5.559] [val/perplexity_len_1024: 259.605] [val/loss_avg_len_512: 5.597] [val/perplexity_len_512: 269.570]
|
| 170 |
+
[2025-10-28 07:22:54][utils:57][INFO] [P: 63.00%] [S: 1321205760/2097152000] [T: 2:36:48] [ETA: 1:32:05] [loss: 5.550] [tokens/s: 161206.714] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 171 |
+
[2025-10-28 07:24:05][utils:57][INFO] [P: 64.00%] [S: 1342177280/2097152000] [T: 2:37:58] [ETA: 1:28:51] [loss: 5.538] [tokens/s: 177521.301] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 172 |
+
[2025-10-28 07:24:05][train:194][INFO] Running validation...
|
| 173 |
+
[2025-10-28 07:26:03][logger:171][INFO] [step: 1342177280] [val/train_token_count: 1342177280] [val/train_batch_count: 640] [val/train_flop_count: 0] [val/train_total_time: 9478.915] [val/train_update_time: 5274.478] [val/loss: 5.524] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.794] [val/val_tokens_per_second: 344799.429] [val/loss_avg_len_2048: 5.524] [val/perplexity_len_2048: 250.611] [val/loss_avg_len_1024: 5.549] [val/perplexity_len_1024: 256.925] [val/loss_avg_len_512: 5.587] [val/perplexity_len_512: 266.969]
|
| 174 |
+
[2025-10-28 07:27:13][utils:57][INFO] [P: 65.00%] [S: 1363148800/2097152000] [T: 2:41:07] [ETA: 1:26:45] [loss: 5.505] [tokens/s: 161149.799] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 175 |
+
[2025-10-28 07:27:13][logger:171][INFO] [step: 1363148800] [train_eval/train_token_count: 1363148800] [train_eval/train_batch_count: 650] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 9667.798] [train_eval/train_update_time: 5344.432] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.535] [train_eval/perplexity_len_2048: 253.349] [train_eval/loss_avg_len_1024: 5.558] [train_eval/perplexity_len_1024: 259.206] [train_eval/loss_avg_len_512: 5.592] [train_eval/perplexity_len_512: 268.353]
|
| 176 |
+
[2025-10-28 07:28:23][utils:57][INFO] [P: 66.00%] [S: 1384120320/2097152000] [T: 2:42:17] [ETA: 1:23:36] [loss: 5.532] [tokens/s: 177473.598] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 177 |
+
[2025-10-28 07:28:23][train:194][INFO] Running validation...
|
| 178 |
+
[2025-10-28 07:30:22][logger:171][INFO] [step: 1384120320] [val/train_token_count: 1384120320] [val/train_batch_count: 660] [val/train_flop_count: 0] [val/train_total_time: 9737.888] [val/train_update_time: 5414.394] [val/loss: 5.514] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.890] [val/val_tokens_per_second: 344518.785] [val/loss_avg_len_2048: 5.514] [val/perplexity_len_2048: 248.078] [val/loss_avg_len_1024: 5.539] [val/perplexity_len_1024: 254.381] [val/loss_avg_len_512: 5.578] [val/perplexity_len_512: 264.476]
|
| 179 |
+
[2025-10-28 07:31:32][utils:57][INFO] [P: 67.00%] [S: 1405091840/2097152000] [T: 2:45:26] [ETA: 1:21:29] [loss: 5.486] [tokens/s: 161097.977] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 180 |
+
[2025-10-28 07:32:43][utils:57][INFO] [P: 68.00%] [S: 1426063360/2097152000] [T: 2:46:36] [ETA: 1:18:24] [loss: 5.502] [tokens/s: 177423.386] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 181 |
+
[2025-10-28 07:32:43][train:194][INFO] Running validation...
|
| 182 |
+
[2025-10-28 07:34:41][logger:171][INFO] [step: 1426063360] [val/train_token_count: 1426063360] [val/train_batch_count: 680] [val/train_flop_count: 0] [val/train_total_time: 9996.966] [val/train_update_time: 5554.316] [val/loss: 5.504] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.394] [val/val_tokens_per_second: 345963.792] [val/loss_avg_len_2048: 5.504] [val/perplexity_len_2048: 245.566] [val/loss_avg_len_1024: 5.529] [val/perplexity_len_1024: 251.922] [val/loss_avg_len_512: 5.569] [val/perplexity_len_512: 262.109]
|
| 183 |
+
[2025-10-28 07:35:51][utils:57][INFO] [P: 69.00%] [S: 1447034880/2097152000] [T: 2:49:45] [ETA: 1:16:16] [loss: 5.512] [tokens/s: 161120.440] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 184 |
+
[2025-10-28 07:37:01][utils:57][INFO] [P: 70.00%] [S: 1468006400/2097152000] [T: 2:50:55] [ETA: 1:13:15] [loss: 5.508] [tokens/s: 177649.813] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 185 |
+
[2025-10-28 07:37:01][logger:171][INFO] [step: 1468006400] [train_eval/train_token_count: 1468006400] [train_eval/train_batch_count: 700] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 10255.549] [train_eval/train_update_time: 5694.240] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.504] [train_eval/perplexity_len_2048: 245.735] [train_eval/loss_avg_len_1024: 5.527] [train_eval/perplexity_len_1024: 251.494] [train_eval/loss_avg_len_512: 5.567] [train_eval/perplexity_len_512: 261.543]
|
| 186 |
+
[2025-10-28 07:37:01][train:194][INFO] Running validation...
|
| 187 |
+
[2025-10-28 07:39:00][logger:171][INFO] [step: 1468006400] [val/train_token_count: 1468006400] [val/train_batch_count: 700] [val/train_flop_count: 0] [val/train_total_time: 10255.549] [val/train_update_time: 5694.240] [val/loss: 5.495] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.524] [val/val_tokens_per_second: 345583.861] [val/loss_avg_len_2048: 5.495] [val/perplexity_len_2048: 243.565] [val/loss_avg_len_1024: 5.521] [val/perplexity_len_1024: 249.950] [val/loss_avg_len_512: 5.561] [val/perplexity_len_512: 260.184]
|
| 188 |
+
[2025-10-28 07:39:00][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001468006400.pt...
|
| 189 |
+
[2025-10-28 07:39:00][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001468006400.pt.
|
| 190 |
+
[2025-10-28 07:39:00][logger:171][INFO] [step: 1468006400] [checkpoint/checkpoint_time: 0.553]
|
| 191 |
+
[2025-10-28 07:40:10][utils:57][INFO] [P: 71.00%] [S: 1488977920/2097152000] [T: 2:54:04] [ETA: 1:11:06] [loss: 5.497] [tokens/s: 161223.245] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 192 |
+
[2025-10-28 07:41:20][utils:57][INFO] [P: 72.00%] [S: 1509949440/2097152000] [T: 2:55:14] [ETA: 1:08:09] [loss: 5.489] [tokens/s: 177600.962] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 193 |
+
[2025-10-28 07:41:20][train:194][INFO] Running validation...
|
| 194 |
+
[2025-10-28 07:43:19][logger:171][INFO] [step: 1509949440] [val/train_token_count: 1509949440] [val/train_batch_count: 720] [val/train_flop_count: 0] [val/train_total_time: 10514.781] [val/train_update_time: 5834.131] [val/loss: 5.487] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.976] [val/val_tokens_per_second: 344270.742] [val/loss_avg_len_2048: 5.487] [val/perplexity_len_2048: 241.540] [val/loss_avg_len_1024: 5.513] [val/perplexity_len_1024: 247.922] [val/loss_avg_len_512: 5.554] [val/perplexity_len_512: 258.172]
|
| 195 |
+
[2025-10-28 07:44:29][utils:57][INFO] [P: 73.00%] [S: 1530920960/2097152000] [T: 2:58:23] [ETA: 1:05:58] [loss: 5.506] [tokens/s: 161195.843] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 196 |
+
[2025-10-28 07:45:40][utils:57][INFO] [P: 74.00%] [S: 1551892480/2097152000] [T: 2:59:33] [ETA: 1:03:05] [loss: 5.501] [tokens/s: 177576.339] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 197 |
+
[2025-10-28 07:45:40][train:194][INFO] Running validation...
|
| 198 |
+
[2025-10-28 07:47:39][logger:171][INFO] [step: 1551892480] [val/train_token_count: 1551892480] [val/train_batch_count: 740] [val/train_flop_count: 0] [val/train_total_time: 10773.905] [val/train_update_time: 5974.028] [val/loss: 5.480] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.072] [val/val_tokens_per_second: 343994.445] [val/loss_avg_len_2048: 5.480] [val/perplexity_len_2048: 239.927] [val/loss_avg_len_1024: 5.507] [val/perplexity_len_1024: 246.330] [val/loss_avg_len_512: 5.548] [val/perplexity_len_512: 256.613]
|
| 199 |
+
[2025-10-28 07:48:49][utils:57][INFO] [P: 75.00%] [S: 1572864000/2097152000] [T: 3:02:43] [ETA: 1:00:54] [loss: 5.496] [tokens/s: 161165.619] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 200 |
+
[2025-10-28 07:48:49][logger:171][INFO] [step: 1572864000] [train_eval/train_token_count: 1572864000] [train_eval/train_batch_count: 750] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 10963.046] [train_eval/train_update_time: 6043.979] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.485] [train_eval/perplexity_len_2048: 241.052] [train_eval/loss_avg_len_1024: 5.509] [train_eval/perplexity_len_1024: 246.929] [train_eval/loss_avg_len_512: 5.548] [train_eval/perplexity_len_512: 256.828]
|
| 201 |
+
[2025-10-28 07:49:59][utils:57][INFO] [P: 76.00%] [S: 1593835520/2097152000] [T: 3:03:53] [ETA: 0:58:04] [loss: 5.435] [tokens/s: 177556.099] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 202 |
+
[2025-10-28 07:49:59][train:194][INFO] Running validation...
|
| 203 |
+
[2025-10-28 07:51:58][logger:171][INFO] [step: 1593835520] [val/train_token_count: 1593835520] [val/train_batch_count: 760] [val/train_flop_count: 0] [val/train_total_time: 11033.109] [val/train_update_time: 6113.922] [val/loss: 5.474] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.073] [val/val_tokens_per_second: 343991.270] [val/loss_avg_len_2048: 5.474] [val/perplexity_len_2048: 238.456] [val/loss_avg_len_1024: 5.501] [val/perplexity_len_1024: 244.888] [val/loss_avg_len_512: 5.542] [val/perplexity_len_512: 255.240]
|
| 204 |
+
[2025-10-28 07:53:08][utils:57][INFO] [P: 77.00%] [S: 1614807040/2097152000] [T: 3:07:02] [ETA: 0:55:52] [loss: 5.515] [tokens/s: 161147.907] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 205 |
+
[2025-10-28 07:54:18][utils:57][INFO] [P: 78.00%] [S: 1635778560/2097152000] [T: 3:08:12] [ETA: 0:53:05] [loss: 5.426] [tokens/s: 177461.509] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 206 |
+
[2025-10-28 07:54:18][train:194][INFO] Running validation...
|
| 207 |
+
[2025-10-28 07:56:16][logger:171][INFO] [step: 1635778560] [val/train_token_count: 1635778560] [val/train_batch_count: 780] [val/train_flop_count: 0] [val/train_total_time: 11292.313] [val/train_update_time: 6253.814] [val/loss: 5.468] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.412] [val/val_tokens_per_second: 345911.180] [val/loss_avg_len_2048: 5.468] [val/perplexity_len_2048: 237.079] [val/loss_avg_len_1024: 5.495] [val/perplexity_len_1024: 243.509] [val/loss_avg_len_512: 5.537] [val/perplexity_len_512: 253.897]
|
| 208 |
+
[2025-10-28 07:57:26][utils:57][INFO] [P: 79.00%] [S: 1656750080/2097152000] [T: 3:11:20] [ETA: 0:50:51] [loss: 5.461] [tokens/s: 161153.387] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 209 |
+
[2025-10-28 07:58:36][utils:57][INFO] [P: 80.00%] [S: 1677721600/2097152000] [T: 3:12:30] [ETA: 0:48:07] [loss: 5.436] [tokens/s: 177570.720] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 210 |
+
[2025-10-28 07:58:36][logger:171][INFO] [step: 1677721600] [train_eval/train_token_count: 1677721600] [train_eval/train_batch_count: 800] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 11550.860] [train_eval/train_update_time: 6393.711] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.469] [train_eval/perplexity_len_2048: 237.322] [train_eval/loss_avg_len_1024: 5.496] [train_eval/perplexity_len_1024: 243.612] [train_eval/loss_avg_len_512: 5.535] [train_eval/perplexity_len_512: 253.363]
|
| 211 |
+
[2025-10-28 07:58:36][train:194][INFO] Running validation...
|
| 212 |
+
[2025-10-28 08:00:35][logger:171][INFO] [step: 1677721600] [val/train_token_count: 1677721600] [val/train_batch_count: 800] [val/train_flop_count: 0] [val/train_total_time: 11550.860] [val/train_update_time: 6393.711] [val/loss: 5.464] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.559] [val/val_tokens_per_second: 345481.948] [val/loss_avg_len_2048: 5.464] [val/perplexity_len_2048: 235.953] [val/loss_avg_len_1024: 5.491] [val/perplexity_len_1024: 242.389] [val/loss_avg_len_512: 5.533] [val/perplexity_len_512: 252.785]
|
| 213 |
+
[2025-10-28 08:00:35][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001677721600.pt...
|
| 214 |
+
[2025-10-28 08:00:36][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001677721600.pt.
|
| 215 |
+
[2025-10-28 08:00:36][logger:171][INFO] [step: 1677721600] [checkpoint/checkpoint_time: 0.549]
|
| 216 |
+
[2025-10-28 08:01:46][utils:57][INFO] [P: 81.00%] [S: 1698693120/2097152000] [T: 3:15:40] [ETA: 0:45:53] [loss: 5.406] [tokens/s: 161154.567] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 217 |
+
[2025-10-28 08:02:56][utils:57][INFO] [P: 82.00%] [S: 1719664640/2097152000] [T: 3:16:50] [ETA: 0:43:12] [loss: 5.442] [tokens/s: 177553.144] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 218 |
+
[2025-10-28 08:02:56][train:194][INFO] Running validation...
|
| 219 |
+
[2025-10-28 08:04:54][logger:171][INFO] [step: 1719664640] [val/train_token_count: 1719664640] [val/train_batch_count: 820] [val/train_flop_count: 0] [val/train_total_time: 11810.108] [val/train_update_time: 6533.614] [val/loss: 5.459] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.309] [val/val_tokens_per_second: 346211.923] [val/loss_avg_len_2048: 5.459] [val/perplexity_len_2048: 234.963] [val/loss_avg_len_1024: 5.487] [val/perplexity_len_1024: 241.416] [val/loss_avg_len_512: 5.529] [val/perplexity_len_512: 251.811]
|
| 220 |
+
[2025-10-28 08:06:04][utils:57][INFO] [P: 83.00%] [S: 1740636160/2097152000] [T: 3:19:58] [ETA: 0:40:57] [loss: 5.469] [tokens/s: 161239.906] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 221 |
+
[2025-10-28 08:07:14][utils:57][INFO] [P: 84.00%] [S: 1761607680/2097152000] [T: 3:21:08] [ETA: 0:38:18] [loss: 5.416] [tokens/s: 177669.766] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 222 |
+
[2025-10-28 08:07:14][train:194][INFO] Running validation...
|
| 223 |
+
[2025-10-28 08:09:13][logger:171][INFO] [step: 1761607680] [val/train_token_count: 1761607680] [val/train_batch_count: 840] [val/train_flop_count: 0] [val/train_total_time: 12068.559] [val/train_update_time: 6673.514] [val/loss: 5.456] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.485] [val/val_tokens_per_second: 345697.539] [val/loss_avg_len_2048: 5.456] [val/perplexity_len_2048: 234.114] [val/loss_avg_len_1024: 5.483] [val/perplexity_len_1024: 240.583] [val/loss_avg_len_512: 5.526] [val/perplexity_len_512: 251.032]
|
| 224 |
+
[2025-10-28 08:10:23][utils:57][INFO] [P: 85.00%] [S: 1782579200/2097152000] [T: 3:24:17] [ETA: 0:36:03] [loss: 5.501] [tokens/s: 161313.112] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 225 |
+
[2025-10-28 08:10:23][logger:171][INFO] [step: 1782579200] [train_eval/train_token_count: 1782579200] [train_eval/train_batch_count: 850] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 12257.112] [train_eval/train_update_time: 6743.462] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.451] [train_eval/perplexity_len_2048: 232.879] [train_eval/loss_avg_len_1024: 5.470] [train_eval/perplexity_len_1024: 237.552] [train_eval/loss_avg_len_512: 5.510] [train_eval/perplexity_len_512: 247.272]
|
| 226 |
+
[2025-10-28 08:11:33][utils:57][INFO] [P: 86.00%] [S: 1803550720/2097152000] [T: 3:25:27] [ETA: 0:33:26] [loss: 5.469] [tokens/s: 177757.939] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 227 |
+
[2025-10-28 08:11:33][train:194][INFO] Running validation...
|
| 228 |
+
[2025-10-28 08:13:31][logger:171][INFO] [step: 1803550720] [val/train_token_count: 1803550720] [val/train_batch_count: 860] [val/train_flop_count: 0] [val/train_total_time: 12327.183] [val/train_update_time: 6813.415] [val/loss: 5.453] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.348] [val/val_tokens_per_second: 346097.247] [val/loss_avg_len_2048: 5.453] [val/perplexity_len_2048: 233.464] [val/loss_avg_len_1024: 5.480] [val/perplexity_len_1024: 239.935] [val/loss_avg_len_512: 5.523] [val/perplexity_len_512: 250.383]
|
| 229 |
+
[2025-10-28 08:14:41][utils:57][INFO] [P: 87.00%] [S: 1824522240/2097152000] [T: 3:28:35] [ETA: 0:31:10] [loss: 5.406] [tokens/s: 161403.516] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 230 |
+
[2025-10-28 08:15:51][utils:57][INFO] [P: 88.00%] [S: 1845493760/2097152000] [T: 3:29:45] [ETA: 0:28:36] [loss: 5.437] [tokens/s: 177768.473] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 231 |
+
[2025-10-28 08:15:51][train:194][INFO] Running validation...
|
| 232 |
+
[2025-10-28 08:17:50][logger:171][INFO] [step: 1845493760] [val/train_token_count: 1845493760] [val/train_batch_count: 880] [val/train_flop_count: 0] [val/train_total_time: 12585.657] [val/train_update_time: 6953.311] [val/loss: 5.451] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.068] [val/val_tokens_per_second: 344004.024] [val/loss_avg_len_2048: 5.451] [val/perplexity_len_2048: 232.918] [val/loss_avg_len_1024: 5.478] [val/perplexity_len_1024: 239.395] [val/loss_avg_len_512: 5.521] [val/perplexity_len_512: 249.859]
|
| 233 |
+
[2025-10-28 08:19:00][utils:57][INFO] [P: 89.00%] [S: 1866465280/2097152000] [T: 3:32:54] [ETA: 0:26:18] [loss: 5.503] [tokens/s: 161318.513] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 234 |
+
[2025-10-28 08:20:10][utils:57][INFO] [P: 90.00%] [S: 1887436800/2097152000] [T: 3:34:04] [ETA: 0:23:47] [loss: 5.398] [tokens/s: 177769.956] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 235 |
+
[2025-10-28 08:20:10][logger:171][INFO] [step: 1887436800] [train_eval/train_token_count: 1887436800] [train_eval/train_batch_count: 900] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 12844.892] [train_eval/train_update_time: 7093.223] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.450] [train_eval/perplexity_len_2048: 232.751] [train_eval/loss_avg_len_1024: 5.474] [train_eval/perplexity_len_1024: 238.384] [train_eval/loss_avg_len_512: 5.515] [train_eval/perplexity_len_512: 248.290]
|
| 236 |
+
[2025-10-28 08:20:11][train:194][INFO] Running validation...
|
| 237 |
+
[2025-10-28 08:22:10][logger:171][INFO] [step: 1887436800] [val/train_token_count: 1887436800] [val/train_batch_count: 900] [val/train_flop_count: 0] [val/train_total_time: 12844.892] [val/train_update_time: 7093.223] [val/loss: 5.449] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.423] [val/val_tokens_per_second: 342982.810] [val/loss_avg_len_2048: 5.449] [val/perplexity_len_2048: 232.502] [val/loss_avg_len_1024: 5.476] [val/perplexity_len_1024: 238.977] [val/loss_avg_len_512: 5.519] [val/perplexity_len_512: 249.452]
|
| 238 |
+
[2025-10-28 08:22:10][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001887436800.pt...
|
| 239 |
+
[2025-10-28 08:22:10][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_3_4_256/checkpoints/step-000001887436800.pt.
|
| 240 |
+
[2025-10-28 08:22:10][logger:171][INFO] [step: 1887436800] [checkpoint/checkpoint_time: 0.558]
|
| 241 |
+
[2025-10-28 08:23:21][utils:57][INFO] [P: 91.00%] [S: 1908408320/2097152000] [T: 3:37:14] [ETA: 0:21:29] [loss: 5.444] [tokens/s: 161203.830] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 242 |
+
[2025-10-28 08:24:31][utils:57][INFO] [P: 92.00%] [S: 1929379840/2097152000] [T: 3:38:25] [ETA: 0:18:59] [loss: 5.479] [tokens/s: 177510.209] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 243 |
+
[2025-10-28 08:24:31][train:194][INFO] Running validation...
|
| 244 |
+
[2025-10-28 08:26:30][logger:171][INFO] [step: 1929379840] [val/train_token_count: 1929379840] [val/train_batch_count: 920] [val/train_flop_count: 0] [val/train_total_time: 13105.045] [val/train_update_time: 7233.126] [val/loss: 5.448] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.240] [val/val_tokens_per_second: 343509.294] [val/loss_avg_len_2048: 5.448] [val/perplexity_len_2048: 232.214] [val/loss_avg_len_1024: 5.475] [val/perplexity_len_1024: 238.695] [val/loss_avg_len_512: 5.518] [val/perplexity_len_512: 249.166]
|
| 245 |
+
[2025-10-28 08:27:40][utils:57][INFO] [P: 93.00%] [S: 1950351360/2097152000] [T: 3:41:34] [ETA: 0:16:40] [loss: 5.456] [tokens/s: 161084.541] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 246 |
+
[2025-10-28 08:28:50][utils:57][INFO] [P: 94.00%] [S: 1971322880/2097152000] [T: 3:42:44] [ETA: 0:14:13] [loss: 5.403] [tokens/s: 177392.839] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 247 |
+
[2025-10-28 08:28:50][train:194][INFO] Running validation...
|
| 248 |
+
[2025-10-28 08:30:49][logger:171][INFO] [step: 1971322880] [val/train_token_count: 1971322880] [val/train_batch_count: 940] [val/train_flop_count: 0] [val/train_total_time: 13364.449] [val/train_update_time: 7373.020] [val/loss: 5.447] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.281] [val/val_tokens_per_second: 343392.107] [val/loss_avg_len_2048: 5.447] [val/perplexity_len_2048: 232.003] [val/loss_avg_len_1024: 5.474] [val/perplexity_len_1024: 238.482] [val/loss_avg_len_512: 5.517] [val/perplexity_len_512: 248.962]
|
| 249 |
+
[2025-10-28 08:31:59][utils:57][INFO] [P: 95.00%] [S: 1992294400/2097152000] [T: 3:45:53] [ETA: 0:11:53] [loss: 5.440] [tokens/s: 160983.051] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 250 |
+
[2025-10-28 08:31:59][logger:171][INFO] [step: 1992294400] [train_eval/train_token_count: 1992294400] [train_eval/train_batch_count: 950] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 13553.814] [train_eval/train_update_time: 7442.970] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.440] [train_eval/perplexity_len_2048: 230.442] [train_eval/loss_avg_len_1024: 5.467] [train_eval/perplexity_len_1024: 236.693] [train_eval/loss_avg_len_512: 5.507] [train_eval/perplexity_len_512: 246.395]
|
| 251 |
+
[2025-10-28 08:33:10][utils:57][INFO] [P: 96.00%] [S: 2013265920/2097152000] [T: 3:47:03] [ETA: 0:09:27] [loss: 5.426] [tokens/s: 177246.769] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 252 |
+
[2025-10-28 08:33:10][train:194][INFO] Running validation...
|
| 253 |
+
[2025-10-28 08:35:08][logger:171][INFO] [step: 2013265920] [val/train_token_count: 2013265920] [val/train_batch_count: 960] [val/train_flop_count: 0] [val/train_total_time: 13623.900] [val/train_update_time: 7512.918] [val/loss: 5.446] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 118.658] [val/val_tokens_per_second: 345194.412] [val/loss_avg_len_2048: 5.446] [val/perplexity_len_2048: 231.892] [val/loss_avg_len_1024: 5.474] [val/perplexity_len_1024: 238.373] [val/loss_avg_len_512: 5.517] [val/perplexity_len_512: 248.862]
|
| 254 |
+
[2025-10-28 08:36:18][utils:57][INFO] [P: 97.00%] [S: 2034237440/2097152000] [T: 3:50:12] [ETA: 0:07:07] [loss: 5.463] [tokens/s: 160942.421] [batches/s: 0.077] [MFU: 0.000] [TFLOPS: 0.000]
|
| 255 |
+
[2025-10-28 08:37:28][utils:57][INFO] [P: 98.00%] [S: 2055208960/2097152000] [T: 3:51:22] [ETA: 0:04:43] [loss: 5.444] [tokens/s: 177308.988] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000]
|
| 256 |
+
[2025-10-28 08:37:28][train:194][INFO] Running validation...
|
| 257 |
+
[2025-10-28 08:39:28][logger:171][INFO] [step: 2055208960] [val/train_token_count: 2055208960] [val/train_batch_count: 980] [val/train_flop_count: 0] [val/train_total_time: 13882.690] [val/train_update_time: 7652.812] [val/loss: 5.446] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 119.432] [val/val_tokens_per_second: 342955.416] [val/loss_avg_len_2048: 5.446] [val/perplexity_len_2048: 231.842] [val/loss_avg_len_1024: 5.474] [val/perplexity_len_1024: 238.322] [val/loss_avg_len_512: 5.517] [val/perplexity_len_512: 248.809]
|
| 258 |
+
[2025-10-28 08:39:28][train:854][INFO] Training finished with 2055208960 tokens!
|
metrics/jsonlines/checkpoint.jsonl
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
-
{"step": 209715200, "checkpoint/checkpoint_time": 0.
|
| 2 |
-
{"step": 419430400, "checkpoint/checkpoint_time": 0.
|
| 3 |
-
{"step": 629145600, "checkpoint/checkpoint_time": 0.
|
| 4 |
-
{"step": 838860800, "checkpoint/checkpoint_time": 0.
|
| 5 |
-
{"step": 1048576000, "checkpoint/checkpoint_time": 0.
|
| 6 |
-
{"step": 1258291200, "checkpoint/checkpoint_time": 0.
|
| 7 |
-
{"step": 1468006400, "checkpoint/checkpoint_time": 0.
|
| 8 |
-
{"step": 1677721600, "checkpoint/checkpoint_time": 0.
|
| 9 |
-
{"step": 1887436800, "checkpoint/checkpoint_time": 0.
|
|
|
|
| 1 |
+
{"step": 209715200, "checkpoint/checkpoint_time": 0.4577103740302846}
|
| 2 |
+
{"step": 419430400, "checkpoint/checkpoint_time": 0.4446265029255301}
|
| 3 |
+
{"step": 629145600, "checkpoint/checkpoint_time": 0.5438607160467654}
|
| 4 |
+
{"step": 838860800, "checkpoint/checkpoint_time": 0.6043127420125529}
|
| 5 |
+
{"step": 1048576000, "checkpoint/checkpoint_time": 0.5425283389631659}
|
| 6 |
+
{"step": 1258291200, "checkpoint/checkpoint_time": 0.5587757179746404}
|
| 7 |
+
{"step": 1468006400, "checkpoint/checkpoint_time": 0.5533553039422259}
|
| 8 |
+
{"step": 1677721600, "checkpoint/checkpoint_time": 0.5485877189785242}
|
| 9 |
+
{"step": 1887436800, "checkpoint/checkpoint_time": 0.5579239659709856}
|
metrics/jsonlines/norm.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
metrics/jsonlines/throughput.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
metrics/jsonlines/train.jsonl
CHANGED
|
@@ -1,98 +1,98 @@
|
|
| 1 |
-
{"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time":
|
| 2 |
-
{"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time":
|
| 3 |
-
{"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time":
|
| 4 |
-
{"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time":
|
| 5 |
-
{"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time":
|
| 6 |
-
{"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time":
|
| 7 |
-
{"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time":
|
| 8 |
-
{"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time":
|
| 9 |
-
{"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time":
|
| 10 |
-
{"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time":
|
| 11 |
-
{"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time":
|
| 12 |
-
{"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time":
|
| 13 |
-
{"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time":
|
| 14 |
-
{"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time":
|
| 15 |
-
{"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time":
|
| 16 |
-
{"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time":
|
| 17 |
-
{"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time":
|
| 18 |
-
{"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time":
|
| 19 |
-
{"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time":
|
| 20 |
-
{"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time":
|
| 21 |
-
{"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time":
|
| 22 |
-
{"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time":
|
| 23 |
-
{"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time":
|
| 24 |
-
{"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time":
|
| 25 |
-
{"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time":
|
| 26 |
-
{"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time":
|
| 27 |
-
{"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time":
|
| 28 |
-
{"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time":
|
| 29 |
-
{"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time":
|
| 30 |
-
{"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time":
|
| 31 |
-
{"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time":
|
| 32 |
-
{"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time":
|
| 33 |
-
{"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time":
|
| 34 |
-
{"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time":
|
| 35 |
-
{"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time":
|
| 36 |
-
{"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time":
|
| 37 |
-
{"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time":
|
| 38 |
-
{"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time":
|
| 39 |
-
{"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time":
|
| 40 |
-
{"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time":
|
| 41 |
-
{"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time":
|
| 42 |
-
{"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time":
|
| 43 |
-
{"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time":
|
| 44 |
-
{"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time":
|
| 45 |
-
{"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time":
|
| 46 |
-
{"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time":
|
| 47 |
-
{"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time":
|
| 48 |
-
{"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time":
|
| 49 |
-
{"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time":
|
| 50 |
-
{"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time":
|
| 51 |
-
{"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time":
|
| 52 |
-
{"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time":
|
| 53 |
-
{"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time":
|
| 54 |
-
{"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time":
|
| 55 |
-
{"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time":
|
| 56 |
-
{"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time":
|
| 57 |
-
{"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time":
|
| 58 |
-
{"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time":
|
| 59 |
-
{"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time":
|
| 60 |
-
{"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time":
|
| 61 |
-
{"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time":
|
| 62 |
-
{"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time":
|
| 63 |
-
{"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time":
|
| 64 |
-
{"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time":
|
| 65 |
-
{"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time":
|
| 66 |
-
{"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time":
|
| 67 |
-
{"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time":
|
| 68 |
-
{"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time":
|
| 69 |
-
{"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time":
|
| 70 |
-
{"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time":
|
| 71 |
-
{"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time":
|
| 72 |
-
{"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time":
|
| 73 |
-
{"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time":
|
| 74 |
-
{"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time":
|
| 75 |
-
{"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time":
|
| 76 |
-
{"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time":
|
| 77 |
-
{"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time":
|
| 78 |
-
{"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time":
|
| 79 |
-
{"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time":
|
| 80 |
-
{"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time":
|
| 81 |
-
{"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time":
|
| 82 |
-
{"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time":
|
| 83 |
-
{"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time":
|
| 84 |
-
{"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time":
|
| 85 |
-
{"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time":
|
| 86 |
-
{"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time":
|
| 87 |
-
{"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time":
|
| 88 |
-
{"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time":
|
| 89 |
-
{"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time":
|
| 90 |
-
{"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time":
|
| 91 |
-
{"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time":
|
| 92 |
-
{"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time":
|
| 93 |
-
{"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time":
|
| 94 |
-
{"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time":
|
| 95 |
-
{"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time":
|
| 96 |
-
{"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time":
|
| 97 |
-
{"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time":
|
| 98 |
-
{"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time":
|
|
|
|
| 1 |
+
{"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 212.5578491949709, "train/update_time": 212.33471676090267, "train/lr": 0.0009000000000000001, "train/loss": 9.77643871307373, "train/global_grad_norm": 1.2205885648727417}
|
| 2 |
+
{"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 371.2564815880032, "train/update_time": 370.74722005974036, "train/lr": 0.0009997960964140947, "train/loss": 8.174598693847656, "train/global_grad_norm": 0.9826189875602722}
|
| 3 |
+
{"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 838.406655976898, "train/update_time": 569.1370019397, "train/lr": 0.0009990914580222257, "train/loss": 7.701515197753906, "train/global_grad_norm": 0.39929941296577454}
|
| 4 |
+
{"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 989.879293700913, "train/update_time": 720.4508380297339, "train/lr": 0.0009978842768382998, "train/loss": 7.45575475692749, "train/global_grad_norm": 0.1761881709098816}
|
| 5 |
+
{"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 1454.5109360769857, "train/update_time": 899.2642310586525, "train/lr": 0.0009961757683914405, "train/loss": 7.304540157318115, "train/global_grad_norm": 0.28032752871513367}
|
| 6 |
+
{"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 1606.152420823928, "train/update_time": 1050.7343851425685, "train/lr": 0.00099396765300483, "train/loss": 7.139791488647461, "train/global_grad_norm": 0.18078835308551788}
|
| 7 |
+
{"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 2066.870162793901, "train/update_time": 1206.4585170837818, "train/lr": 0.0009912621540634887, "train/loss": 7.047938346862793, "train/global_grad_norm": 0.3235696852207184}
|
| 8 |
+
{"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 2218.4671919340035, "train/update_time": 1357.900893958984, "train/lr": 0.000988061995775515, "train/loss": 6.9367804527282715, "train/global_grad_norm": 0.257102906703949}
|
| 9 |
+
{"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 2411.522138212924, "train/update_time": 1427.838058966794, "train/lr": 0.0009843704004290394, "train/loss": 6.816991329193115, "train/global_grad_norm": 0.4563130736351013}
|
| 10 |
+
{"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 2481.5876440349966, "train/update_time": 1497.7818741976516, "train/lr": 0.0009801910851476522, "train/loss": 6.740283012390137, "train/global_grad_norm": 0.30055639147758484}
|
| 11 |
+
{"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 2670.7504822599003, "train/update_time": 1567.7054013966117, "train/lr": 0.0009755282581475768, "train/loss": 6.661011695861816, "train/global_grad_norm": 0.5079889893531799}
|
| 12 |
+
{"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 2740.80689570494, "train/update_time": 1637.629011878511, "train/lr": 0.0009703866145003512, "train/loss": 6.556557655334473, "train/global_grad_norm": 0.3374536633491516}
|
| 13 |
+
{"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 2929.6757837759797, "train/update_time": 1707.567629359779, "train/lr": 0.0009647713314052896, "train/loss": 6.5208539962768555, "train/global_grad_norm": 0.22820784151554108}
|
| 14 |
+
{"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 2999.716264599003, "train/update_time": 1777.492755148909, "train/lr": 0.0009586880629764817, "train/loss": 6.479208946228027, "train/global_grad_norm": 0.46022579073905945}
|
| 15 |
+
{"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 3188.29143246694, "train/update_time": 1847.4134167138254, "train/lr": 0.0009521429345495787, "train/loss": 6.371499538421631, "train/global_grad_norm": 0.29645684361457825}
|
| 16 |
+
{"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 3258.342541063903, "train/update_time": 1917.3395090608392, "train/lr": 0.0009451425365140996, "train/loss": 6.371063232421875, "train/global_grad_norm": 0.9106816053390503}
|
| 17 |
+
{"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 3447.5834293199005, "train/update_time": 1987.271274490864, "train/lr": 0.000937693917677468, "train/loss": 6.2678022384643555, "train/global_grad_norm": 0.36450543999671936}
|
| 18 |
+
{"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 3517.6349648769246, "train/update_time": 2057.1937958986964, "train/lr": 0.0009298045781674596, "train/loss": 6.241225242614746, "train/global_grad_norm": 0.30290958285331726}
|
| 19 |
+
{"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 3706.861982131959, "train/update_time": 2127.122993543744, "train/lr": 0.0009214824618802108, "train/loss": 6.248615264892578, "train/global_grad_norm": 0.4511157274246216}
|
| 20 |
+
{"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 3776.9122951189056, "train/update_time": 2197.046115837642, "train/lr": 0.000912735948481387, "train/loss": 6.1730451583862305, "train/global_grad_norm": 0.5304612517356873}
|
| 21 |
+
{"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 3966.7439867819194, "train/update_time": 2266.9816121864133, "train/lr": 0.0009035738449685707, "train/loss": 6.1212158203125, "train/global_grad_norm": 0.3544110357761383}
|
| 22 |
+
{"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 4036.799753597006, "train/update_time": 2336.9093162972713, "train/lr": 0.0008940053768033609, "train/loss": 6.1092681884765625, "train/global_grad_norm": 0.4344483017921448}
|
| 23 |
+
{"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 4226.125323860906, "train/update_time": 2406.836813273141, "train/lr": 0.0008840401786221159, "train/loss": 6.067781448364258, "train/global_grad_norm": 0.3917176425457001}
|
| 24 |
+
{"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 4296.169194732909, "train/update_time": 2476.7631880298723, "train/lr": 0.0008736882845346905, "train/loss": 6.019988536834717, "train/global_grad_norm": 0.422320157289505}
|
| 25 |
+
{"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 4485.399678221904, "train/update_time": 2546.7019683559192, "train/lr": 0.0008629601180209381, "train/loss": 6.009823322296143, "train/global_grad_norm": 0.37123939394950867}
|
| 26 |
+
{"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 4555.462784307892, "train/update_time": 2616.633614034741, "train/lr": 0.0008518664814351503, "train/loss": 5.976399898529053, "train/global_grad_norm": 0.415222704410553}
|
| 27 |
+
{"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 4747.504722302896, "train/update_time": 2686.5944050095277, "train/lr": 0.0008404185451290017, "train/loss": 5.970578670501709, "train/global_grad_norm": 0.41021960973739624}
|
| 28 |
+
{"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 4817.594658994931, "train/update_time": 2756.5458030648297, "train/lr": 0.0008286278362039527, "train/loss": 5.917147636413574, "train/global_grad_norm": 0.392586886882782}
|
| 29 |
+
{"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 5007.321647593984, "train/update_time": 2826.489245740697, "train/lr": 0.0008165062269044352, "train/loss": 5.883110046386719, "train/global_grad_norm": 0.40099892020225525}
|
| 30 |
+
{"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 5077.3983478549635, "train/update_time": 2896.4371623727493, "train/lr": 0.0008040659226635089, "train/loss": 5.869299411773682, "train/global_grad_norm": 0.3332286477088928}
|
| 31 |
+
{"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 5266.6355534009635, "train/update_time": 2966.368956397753, "train/lr": 0.0007913194498130252, "train/loss": 5.913437843322754, "train/global_grad_norm": 0.3435675799846649}
|
| 32 |
+
{"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 5336.696560251992, "train/update_time": 3036.3103124498157, "train/lr": 0.000778279642970672, "train/loss": 5.829973220825195, "train/global_grad_norm": 0.4446646273136139}
|
| 33 |
+
{"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 5525.4722395399585, "train/update_time": 3106.2408556328155, "train/lr": 0.0007649596321166025, "train/loss": 5.8708319664001465, "train/global_grad_norm": 0.37984520196914673}
|
| 34 |
+
{"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 5595.524489760981, "train/update_time": 3176.177008557832, "train/lr": 0.0007513728293726579, "train/loss": 5.832010746002197, "train/global_grad_norm": 0.45653313398361206}
|
| 35 |
+
{"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 5784.816746937926, "train/update_time": 3246.116620109882, "train/lr": 0.0007375329154974975, "train/loss": 5.804886817932129, "train/global_grad_norm": 0.3863084316253662}
|
| 36 |
+
{"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 5854.861399552901, "train/update_time": 3316.0491548541468, "train/lr": 0.0007234538261112341, "train/loss": 5.737117767333984, "train/global_grad_norm": 0.4480031728744507}
|
| 37 |
+
{"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 6043.423031610902, "train/update_time": 3385.989506291109, "train/lr": 0.0007091497376634464, "train/loss": 5.769798755645752, "train/global_grad_norm": 0.39961710572242737}
|
| 38 |
+
{"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 6113.463035934954, "train/update_time": 3455.9149778021965, "train/lr": 0.0006946350531586958, "train/loss": 5.74267578125, "train/global_grad_norm": 0.4710085391998291}
|
| 39 |
+
{"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 6302.139852438006, "train/update_time": 3525.8512225542217, "train/lr": 0.0006799243876539214, "train/loss": 5.759049415588379, "train/global_grad_norm": 0.44018295407295227}
|
| 40 |
+
{"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 6372.185268521891, "train/update_time": 3595.7855843111174, "train/lr": 0.0006650325535423166, "train/loss": 5.66196346282959, "train/global_grad_norm": 0.3946307301521301}
|
| 41 |
+
{"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 6561.41529466596, "train/update_time": 3665.7160565470112, "train/lr": 0.0006499745456385053, "train/loss": 5.682401657104492, "train/global_grad_norm": 0.3963249623775482}
|
| 42 |
+
{"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 6631.451806944911, "train/update_time": 3735.644923887099, "train/lr": 0.0006347655260800339, "train/loss": 5.6978607177734375, "train/global_grad_norm": 0.5203631520271301}
|
| 43 |
+
{"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 6820.405654589995, "train/update_time": 3805.5842581341276, "train/lr": 0.0006194208090603844, "train/loss": 5.692924976348877, "train/global_grad_norm": 0.3886179029941559}
|
| 44 |
+
{"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 6890.435664905934, "train/update_time": 3875.508329823031, "train/lr": 0.0006039558454088796, "train/loss": 5.726889133453369, "train/global_grad_norm": 0.4617585837841034}
|
| 45 |
+
{"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 7078.806805928936, "train/update_time": 3945.447064547101, "train/lr": 0.0005883862070330078, "train/loss": 5.670354843139648, "train/global_grad_norm": 0.42168015241622925}
|
| 46 |
+
{"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 7148.850932181929, "train/update_time": 4015.3776673960965, "train/lr": 0.0005727275712388317, "train/loss": 5.643796443939209, "train/global_grad_norm": 0.478407084941864}
|
| 47 |
+
{"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 7337.074188055005, "train/update_time": 4085.330211903318, "train/lr": 0.0005569957049452703, "train/loss": 5.656505107879639, "train/global_grad_norm": 0.3445208668708801}
|
| 48 |
+
{"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 7407.169766063918, "train/update_time": 4155.28995017719, "train/lr": 0.0005412064488081482, "train/loss": 5.64232063293457, "train/global_grad_norm": 0.5094632506370544}
|
| 49 |
+
{"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 7595.736356125912, "train/update_time": 4225.240399566013, "train/lr": 0.0005253757012699972, "train/loss": 5.636730670928955, "train/global_grad_norm": 0.37454232573509216}
|
| 50 |
+
{"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 7665.789839876001, "train/update_time": 4295.186003304552, "train/lr": 0.0005095194025516734, "train/loss": 5.6238884925842285, "train/global_grad_norm": 0.3839475214481354}
|
| 51 |
+
{"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 7854.928577505983, "train/update_time": 4365.119778911234, "train/lr": 0.0004936535186019053, "train/loss": 5.618503093719482, "train/global_grad_norm": 0.4643455743789673}
|
| 52 |
+
{"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 7924.9846870569745, "train/update_time": 4435.057907043141, "train/lr": 0.00047779402502093696, "train/loss": 5.616159915924072, "train/global_grad_norm": 0.39497897028923035}
|
| 53 |
+
{"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 8113.922514467966, "train/update_time": 4505.009558844264, "train/lr": 0.0004619568909744525, "train/loss": 5.587278842926025, "train/global_grad_norm": 0.514954686164856}
|
| 54 |
+
{"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 8183.984758203966, "train/update_time": 4574.958235046361, "train/lr": 0.00044615806311398067, "train/loss": 5.59569787979126, "train/global_grad_norm": 0.432810515165329}
|
| 55 |
+
{"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 8372.430396109005, "train/update_time": 4644.909644760657, "train/lr": 0.0004304134495199673, "train/loss": 5.539417266845703, "train/global_grad_norm": 0.3852292597293854}
|
| 56 |
+
{"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 8442.485554668936, "train/update_time": 4714.8530240497785, "train/lr": 0.0004147389036836882, "train/loss": 5.578530311584473, "train/global_grad_norm": 0.3600296974182129}
|
| 57 |
+
{"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 8631.0834149539, "train/update_time": 4784.797394883935, "train/lr": 0.0003991502085441259, "train/loss": 5.533158302307129, "train/global_grad_norm": 0.48202306032180786}
|
| 58 |
+
{"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 8701.162048408994, "train/update_time": 4854.753052946762, "train/lr": 0.0003836630605958888, "train/loss": 5.592592239379883, "train/global_grad_norm": 0.4211496114730835}
|
| 59 |
+
{"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 8889.847440016922, "train/update_time": 4924.708595188917, "train/lr": 0.00036829305408417155, "train/loss": 5.592873573303223, "train/global_grad_norm": 0.3531559407711029}
|
| 60 |
+
{"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 8959.925166409928, "train/update_time": 4994.663271273952, "train/lr": 0.000353055665302672, "train/loss": 5.583993911743164, "train/global_grad_norm": 0.379597932100296}
|
| 61 |
+
{"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 9149.922014020965, "train/update_time": 5064.615168323857, "train/lr": 0.0003379662370102746, "train/loss": 5.558979511260986, "train/global_grad_norm": 0.3696227967739105}
|
| 62 |
+
{"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 9220.005188893992, "train/update_time": 5134.568920494756, "train/lr": 0.00032303996298219405, "train/loss": 5.527538299560547, "train/global_grad_norm": 0.3672289252281189}
|
| 63 |
+
{"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 9408.833663397934, "train/update_time": 5204.52180329978, "train/lr": 0.00030829187271113034, "train/loss": 5.5501933097839355, "train/global_grad_norm": 0.4155080318450928}
|
| 64 |
+
{"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 9478.914532593917, "train/update_time": 5274.477632154711, "train/lr": 0.0002937368162738445, "train/loss": 5.538198471069336, "train/global_grad_norm": 0.4198897182941437}
|
| 65 |
+
{"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 9667.797854268923, "train/update_time": 5344.431819725665, "train/lr": 0.0002793894493783894, "train/loss": 5.504896640777588, "train/global_grad_norm": 0.4164905846118927}
|
| 66 |
+
{"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 9737.887706990936, "train/update_time": 5414.394234810607, "train/lr": 0.00026526421860705474, "train/loss": 5.531781196594238, "train/global_grad_norm": 0.35902249813079834}
|
| 67 |
+
{"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 9926.87084120093, "train/update_time": 5484.349449058645, "train/lr": 0.0002513753468698824, "train/loss": 5.486477375030518, "train/global_grad_norm": 0.3695092499256134}
|
| 68 |
+
{"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 9996.965703879949, "train/update_time": 5554.316489073681, "train/lr": 0.00023773681908340283, "train/loss": 5.501741886138916, "train/global_grad_norm": 0.38276293873786926}
|
| 69 |
+
{"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 10185.451445175917, "train/update_time": 5624.270565029001, "train/lr": 0.00022436236808900823, "train/loss": 5.51205587387085, "train/global_grad_norm": 0.34062203764915466}
|
| 70 |
+
{"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 10255.549305356923, "train/update_time": 5694.239998900797, "train/lr": 0.00021126546082514682, "train/loss": 5.508440971374512, "train/global_grad_norm": 0.33368465304374695}
|
| 71 |
+
{"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 10444.704105362995, "train/update_time": 5764.182670753566, "train/lr": 0.00019845928476725522, "train/loss": 5.496548175811768, "train/global_grad_norm": 0.30512407422065735}
|
| 72 |
+
{"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 10514.781046306947, "train/update_time": 5834.130611701286, "train/lr": 0.0001859567346490913, "train/loss": 5.488737106323242, "train/global_grad_norm": 0.3369419574737549}
|
| 73 |
+
{"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 10703.836294994922, "train/update_time": 5904.080027965596, "train/lr": 0.00017377039947882782, "train/loss": 5.5062255859375, "train/global_grad_norm": 0.3015817403793335}
|
| 74 |
+
{"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 10773.90538297291, "train/update_time": 5974.028263681685, "train/lr": 0.00016191254986299043, "train/loss": 5.500577926635742, "train/global_grad_norm": 0.3552543520927429}
|
| 75 |
+
{"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 10963.04556359991, "train/update_time": 6043.978755204822, "train/lr": 0.00015039512565099468, "train/loss": 5.495687007904053, "train/global_grad_norm": 0.3207867443561554}
|
| 76 |
+
{"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 11033.109079251997, "train/update_time": 6113.921902889037, "train/lr": 0.00013922972391273224, "train/loss": 5.435054779052734, "train/global_grad_norm": 0.3032216727733612}
|
| 77 |
+
{"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 11222.254815177992, "train/update_time": 6183.873032106203, "train/lr": 0.00012842758726130281, "train/loss": 5.514736652374268, "train/global_grad_norm": 0.2982519865036011}
|
| 78 |
+
{"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 11292.312823130982, "train/update_time": 6253.814012841205, "train/lr": 0.00011799959253265679, "train/loss": 5.426453113555908, "train/global_grad_norm": 0.2868562340736389}
|
| 79 |
+
{"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 11480.794340396998, "train/update_time": 6323.765302975196, "train/lr": 0.00010795623983354214, "train/loss": 5.461309432983398, "train/global_grad_norm": 0.28381142020225525}
|
| 80 |
+
{"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 11550.859713669983, "train/update_time": 6393.710524166352, "train/lr": 9.830764196878872e-05, "train/loss": 5.43646240234375, "train/global_grad_norm": 0.2892729640007019}
|
| 81 |
+
{"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 11740.033289006911, "train/update_time": 6463.652797408402, "train/lr": 8.906351425856951e-05, "train/loss": 5.4056525230407715, "train/global_grad_norm": 0.2663191556930542}
|
| 82 |
+
{"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 11810.107688057935, "train/update_time": 6533.614025922609, "train/lr": 8.02331647558977e-05, "train/loss": 5.44175386428833, "train/global_grad_norm": 0.27445027232170105}
|
| 83 |
+
{"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 11998.483695015893, "train/update_time": 6603.55498173763, "train/lr": 7.182548487420554e-05, "train/loss": 5.469080448150635, "train/global_grad_norm": 0.25448915362358093}
|
| 84 |
+
{"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 12068.55913664191, "train/update_time": 6673.514490786707, "train/lr": 6.384894043444556e-05, "train/loss": 5.415605545043945, "train/global_grad_norm": 0.2630438804626465}
|
| 85 |
+
{"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 12257.111987237004, "train/update_time": 6743.462062919862, "train/lr": 5.6311563140726166e-05, "train/loss": 5.500694751739502, "train/global_grad_norm": 0.2526546120643616}
|
| 86 |
+
{"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 12327.182909888914, "train/update_time": 6813.415315568796, "train/lr": 4.922094249306547e-05, "train/loss": 5.468751430511475, "train/global_grad_norm": 0.23347875475883484}
|
| 87 |
+
{"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 12515.601697899983, "train/update_time": 6883.366675395635, "train/lr": 4.2584218145409916e-05, "train/loss": 5.40645694732666, "train/global_grad_norm": 0.23998965322971344}
|
| 88 |
+
{"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 12585.657294680947, "train/update_time": 6953.310590816545, "train/lr": 3.6408072716606236e-05, "train/loss": 5.436767578125, "train/global_grad_norm": 0.23755612969398499}
|
| 89 |
+
{"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 12774.815754264942, "train/update_time": 7023.26507879165, "train/lr": 3.069872506157217e-05, "train/loss": 5.502988338470459, "train/global_grad_norm": 0.22962626814842224}
|
| 90 |
+
{"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 12844.891541385907, "train/update_time": 7093.223455801839, "train/lr": 2.5461924009435368e-05, "train/loss": 5.398458957672119, "train/global_grad_norm": 0.23118726909160614}
|
| 91 |
+
{"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 13034.968514320906, "train/update_time": 7163.176251192926, "train/lr": 2.0702942574950812e-05, "train/loss": 5.4442338943481445, "train/global_grad_norm": 0.22224336862564087}
|
| 92 |
+
{"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 13105.045270575909, "train/update_time": 7233.1264379567, "train/lr": 1.642657264902142e-05, "train/loss": 5.47895622253418, "train/global_grad_norm": 0.21664276719093323}
|
| 93 |
+
{"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 13294.375740742893, "train/update_time": 7303.080043341848, "train/lr": 1.2637120173670358e-05, "train/loss": 5.455996036529541, "train/global_grad_norm": 0.20131263136863708}
|
| 94 |
+
{"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 13364.448509541922, "train/update_time": 7373.0204275009455, "train/lr": 9.338400806321978e-06, "train/loss": 5.40268611907959, "train/global_grad_norm": 0.20658168196678162}
|
| 95 |
+
{"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 13553.81382910593, "train/update_time": 7442.970160528901, "train/lr": 6.533736077758867e-06, "train/loss": 5.440489768981934, "train/global_grad_norm": 0.19446603953838348}
|
| 96 |
+
{"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 13623.899586633896, "train/update_time": 7512.917904903879, "train/lr": 4.2259500476214406e-06, "train/loss": 5.4255051612854, "train/global_grad_norm": 0.1936364620923996}
|
| 97 |
+
{"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 13812.626531647984, "train/update_time": 7582.864485563943, "train/lr": 2.417366460819359e-06, "train/loss": 5.462730407714844, "train/global_grad_norm": 0.2014123499393463}
|
| 98 |
+
{"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 13882.689546601963, "train/update_time": 7652.812096997048, "train/lr": 1.1098064077174619e-06, "train/loss": 5.443710803985596, "train/global_grad_norm": 0.18211401998996735}
|
metrics/jsonlines/train_eval.jsonl
CHANGED
|
@@ -1,19 +1,19 @@
|
|
| 1 |
-
{"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 2 |
-
{"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 3 |
-
{"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 4 |
-
{"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 5 |
-
{"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 6 |
-
{"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 7 |
-
{"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 8 |
-
{"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 9 |
-
{"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 10 |
-
{"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 11 |
-
{"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 12 |
-
{"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 13 |
-
{"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 14 |
-
{"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 15 |
-
{"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 16 |
-
{"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 17 |
-
{"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 18 |
-
{"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 19 |
-
{"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
|
|
|
| 1 |
+
{"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1454.5109360769857, "train_eval/train_update_time": 899.2642310586525, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.402821731921286, "train_eval/perplexity_len_2048": 4459.632898714833, "train_eval/loss_avg_len_1024": 8.407352303462105, "train_eval/perplexity_len_1024": 4479.883423194555, "train_eval/loss_avg_len_512": 8.407402739301325, "train_eval/perplexity_len_512": 4480.109375572615}
|
| 2 |
+
{"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2481.5876440349966, "train_eval/train_update_time": 1497.7818741976516, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.995038702964084, "train_eval/perplexity_len_2048": 1091.2059098186799, "train_eval/loss_avg_len_1024": 7.002110293316655, "train_eval/perplexity_len_1024": 1098.9498196110778, "train_eval/loss_avg_len_512": 7.007181799188256, "train_eval/perplexity_len_512": 1104.5373065844053}
|
| 3 |
+
{"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3188.29143246694, "train_eval/train_update_time": 1847.4134167138254, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.550251329710736, "train_eval/perplexity_len_2048": 699.4199367381216, "train_eval/loss_avg_len_1024": 6.5585846149979625, "train_eval/perplexity_len_1024": 705.2727553400579, "train_eval/loss_avg_len_512": 6.569192992824828, "train_eval/perplexity_len_512": 712.7943807832622}
|
| 4 |
+
{"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3776.9122951189056, "train_eval/train_update_time": 2197.046115837642, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.272974732764633, "train_eval/perplexity_len_2048": 530.0517974267286, "train_eval/loss_avg_len_1024": 6.284432089452021, "train_eval/perplexity_len_1024": 536.1597134027296, "train_eval/loss_avg_len_512": 6.298539984831004, "train_eval/perplexity_len_512": 543.7774070109691}
|
| 5 |
+
{"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4485.399678221904, "train_eval/train_update_time": 2546.7019683559192, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.0823833193690735, "train_eval/perplexity_len_2048": 438.0720170587346, "train_eval/loss_avg_len_1024": 6.09325327728664, "train_eval/perplexity_len_1024": 442.85981589278384, "train_eval/loss_avg_len_512": 6.109832875879656, "train_eval/perplexity_len_512": 450.2634589947275}
|
| 6 |
+
{"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5077.3983478549635, "train_eval/train_update_time": 2896.4371623727493, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.950598046767519, "train_eval/perplexity_len_2048": 383.98291014534203, "train_eval/loss_avg_len_1024": 5.961579375055372, "train_eval/perplexity_len_1024": 388.2227896861234, "train_eval/loss_avg_len_512": 5.980496873812372, "train_eval/perplexity_len_512": 395.63690094056886}
|
| 7 |
+
{"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5784.816746937926, "train_eval/train_update_time": 3246.116620109882, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.849003767906725, "train_eval/perplexity_len_2048": 346.888626699115, "train_eval/loss_avg_len_1024": 5.863815515086607, "train_eval/perplexity_len_1024": 352.0648934809872, "train_eval/loss_avg_len_512": 5.8864257618402185, "train_eval/perplexity_len_512": 360.1158415743008}
|
| 8 |
+
{"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6372.185268521891, "train_eval/train_update_time": 3595.7855843111174, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.767166235738806, "train_eval/perplexity_len_2048": 319.6306900556352, "train_eval/loss_avg_len_1024": 5.7811188802827385, "train_eval/perplexity_len_1024": 324.1216409233059, "train_eval/loss_avg_len_512": 5.805235980615107, "train_eval/perplexity_len_512": 332.03353757775665}
|
| 9 |
+
{"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7078.806805928936, "train_eval/train_update_time": 3945.447064547101, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.699684085610935, "train_eval/perplexity_len_2048": 298.77299936686927, "train_eval/loss_avg_len_1024": 5.717875908720817, "train_eval/perplexity_len_1024": 304.2579644118213, "train_eval/loss_avg_len_512": 5.747566951586341, "train_eval/perplexity_len_512": 313.42714840543255}
|
| 10 |
+
{"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7665.789839876001, "train_eval/train_update_time": 4295.186003304552, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.646992232510201, "train_eval/perplexity_len_2048": 283.4376678519485, "train_eval/loss_avg_len_1024": 5.66428641970735, "train_eval/perplexity_len_1024": 288.3821238771646, "train_eval/loss_avg_len_512": 5.695631070602831, "train_eval/perplexity_len_512": 297.5645185661324}
|
| 11 |
+
{"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8372.430396109005, "train_eval/train_update_time": 4644.909644760657, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.59726160423892, "train_eval/perplexity_len_2048": 269.68688591179045, "train_eval/loss_avg_len_1024": 5.613586209967179, "train_eval/perplexity_len_1024": 274.1255491357345, "train_eval/loss_avg_len_512": 5.643964380473335, "train_eval/perplexity_len_512": 282.5807584840765}
|
| 12 |
+
{"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8959.925166409928, "train_eval/train_update_time": 4994.663271273952, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.558144126648113, "train_eval/perplexity_len_2048": 259.3410852175999, "train_eval/loss_avg_len_1024": 5.575471486838069, "train_eval/perplexity_len_1024": 263.87393940131795, "train_eval/loss_avg_len_512": 5.6089897783461495, "train_eval/perplexity_len_512": 272.8684411123516}
|
| 13 |
+
{"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9667.797854268923, "train_eval/train_update_time": 5344.431819725665, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.5347687173953455, "train_eval/perplexity_len_2048": 253.3491856010016, "train_eval/loss_avg_len_1024": 5.557624726276845, "train_eval/perplexity_len_1024": 259.20641833769395, "train_eval/loss_avg_len_512": 5.592303494717344, "train_eval/perplexity_len_512": 268.3530581593862}
|
| 14 |
+
{"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 10255.549305356923, "train_eval/train_update_time": 5694.239998900797, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.504252203528103, "train_eval/perplexity_len_2048": 245.7346274669015, "train_eval/loss_avg_len_1024": 5.52741995374381, "train_eval/perplexity_len_1024": 251.49420656403856, "train_eval/loss_avg_len_512": 5.566598175862454, "train_eval/perplexity_len_512": 261.5428613637092}
|
| 15 |
+
{"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 10963.04556359991, "train_eval/train_update_time": 6043.978755204822, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.4850125069963545, "train_eval/perplexity_len_2048": 241.0519588151443, "train_eval/loss_avg_len_1024": 5.509102286305788, "train_eval/perplexity_len_1024": 246.92935567481706, "train_eval/loss_avg_len_512": 5.548407077461597, "train_eval/perplexity_len_512": 256.82812258960223}
|
| 16 |
+
{"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 11550.859713669983, "train_eval/train_update_time": 6393.710524166352, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.46941686344535, "train_eval/perplexity_len_2048": 237.32176140860722, "train_eval/loss_avg_len_1024": 5.495577427410535, "train_eval/perplexity_len_1024": 243.61215389080553, "train_eval/loss_avg_len_512": 5.534824562655558, "train_eval/perplexity_len_512": 253.36333434726268}
|
| 17 |
+
{"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 12257.111987237004, "train_eval/train_update_time": 6743.462062919862, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.450519698281187, "train_eval/perplexity_len_2048": 232.87916136425875, "train_eval/loss_avg_len_1024": 5.47038564050632, "train_eval/perplexity_len_1024": 237.55178468977783, "train_eval/loss_avg_len_512": 5.510487729436864, "train_eval/perplexity_len_512": 247.27169934858398}
|
| 18 |
+
{"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 12844.891541385907, "train_eval/train_update_time": 7093.223455801839, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.449968136312364, "train_eval/perplexity_len_2048": 232.7507494923267, "train_eval/loss_avg_len_1024": 5.473881935751124, "train_eval/perplexity_len_1024": 238.3837894846464, "train_eval/loss_avg_len_512": 5.514598703888769, "train_eval/perplexity_len_512": 248.29031931304468}
|
| 19 |
+
{"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 13553.81382910593, "train_eval/train_update_time": 7442.970160528901, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.439999618460788, "train_eval/perplexity_len_2048": 230.4420955379297, "train_eval/loss_avg_len_1024": 5.46676403799378, "train_eval/perplexity_len_1024": 236.69302253570154, "train_eval/loss_avg_len_512": 5.506935359557975, "train_eval/perplexity_len_512": 246.39485716788857}
|
metrics/jsonlines/val.jsonl
CHANGED
|
@@ -1,49 +1,49 @@
|
|
| 1 |
-
{"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time":
|
| 2 |
-
{"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time":
|
| 3 |
-
{"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time":
|
| 4 |
-
{"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time":
|
| 5 |
-
{"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time":
|
| 6 |
-
{"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time":
|
| 7 |
-
{"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time":
|
| 8 |
-
{"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time":
|
| 9 |
-
{"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time":
|
| 10 |
-
{"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time":
|
| 11 |
-
{"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time":
|
| 12 |
-
{"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time":
|
| 13 |
-
{"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time":
|
| 14 |
-
{"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time":
|
| 15 |
-
{"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time":
|
| 16 |
-
{"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time":
|
| 17 |
-
{"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time":
|
| 18 |
-
{"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time":
|
| 19 |
-
{"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time":
|
| 20 |
-
{"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time":
|
| 21 |
-
{"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time":
|
| 22 |
-
{"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time":
|
| 23 |
-
{"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time":
|
| 24 |
-
{"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time":
|
| 25 |
-
{"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time":
|
| 26 |
-
{"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time":
|
| 27 |
-
{"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time":
|
| 28 |
-
{"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time":
|
| 29 |
-
{"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time":
|
| 30 |
-
{"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time":
|
| 31 |
-
{"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time":
|
| 32 |
-
{"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time":
|
| 33 |
-
{"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time":
|
| 34 |
-
{"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time":
|
| 35 |
-
{"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time":
|
| 36 |
-
{"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time":
|
| 37 |
-
{"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time":
|
| 38 |
-
{"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time":
|
| 39 |
-
{"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time":
|
| 40 |
-
{"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time":
|
| 41 |
-
{"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time":
|
| 42 |
-
{"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time":
|
| 43 |
-
{"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time":
|
| 44 |
-
{"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time":
|
| 45 |
-
{"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time":
|
| 46 |
-
{"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time":
|
| 47 |
-
{"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time":
|
| 48 |
-
{"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time":
|
| 49 |
-
{"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time":
|
|
|
|
| 1 |
+
{"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 371.2564815880032, "val/train_update_time": 370.74722005974036, "val/loss": 8.074574615192414, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 268.5667409999296, "val/val_tokens_per_second": 152513.30022286988, "val/loss_avg_len_2048": 8.074574615192414, "val/perplexity_len_2048": 3211.7608437878293, "val/loss_avg_len_1024": 8.077264636135101, "val/perplexity_len_1024": 3220.41217863958, "val/loss_avg_len_512": 8.07784064731598, "val/perplexity_len_512": 3222.2677064125833}
|
| 2 |
+
{"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 989.879293700913, "val/train_update_time": 720.4508380297339, "val/loss": 7.447038386821747, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 285.6286050810013, "val/val_tokens_per_second": 143403.00401069483, "val/loss_avg_len_2048": 7.447038386821747, "val/perplexity_len_2048": 1714.7771111702812, "val/loss_avg_len_1024": 7.450830672264099, "val/perplexity_len_1024": 1721.2923815200618, "val/loss_avg_len_512": 7.45331780166626, "val/perplexity_len_512": 1725.5787866243627}
|
| 3 |
+
{"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 1606.152420823928, "val/train_update_time": 1050.7343851425685, "val/loss": 7.138498369634151, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 304.81948690803256, "val/val_tokens_per_second": 134374.61107057793, "val/loss_avg_len_2048": 7.138498369634151, "val/perplexity_len_2048": 1259.5356114067852, "val/loss_avg_len_1024": 7.143596195149422, "val/perplexity_len_1024": 1265.9728983258412, "val/loss_avg_len_512": 7.148876025867462, "val/perplexity_len_512": 1272.6746975366846}
|
| 4 |
+
{"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 2218.4671919340035, "val/train_update_time": 1357.900893958984, "val/loss": 6.91851605821252, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 122.97149007802363, "val/val_tokens_per_second": 333085.3352595099, "val/loss_avg_len_2048": 6.91851605821252, "val/perplexity_len_2048": 1010.8188846480316, "val/loss_avg_len_1024": 6.924789595830441, "val/perplexity_len_1024": 1017.1802281458952, "val/loss_avg_len_512": 6.9318418387413026, "val/perplexity_len_512": 1024.378984051259}
|
| 5 |
+
{"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 2481.5876440349966, "val/train_update_time": 1497.7818741976516, "val/loss": 6.731239570814371, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.63907289097551, "val/val_tokens_per_second": 345248.82066164305, "val/loss_avg_len_2048": 6.731239570814371, "val/perplexity_len_2048": 838.1856126915511, "val/loss_avg_len_1024": 6.738856630378962, "val/perplexity_len_1024": 844.5944998806121, "val/loss_avg_len_512": 6.748001024723053, "val/perplexity_len_512": 852.3532254060005}
|
| 6 |
+
{"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 2740.80689570494, "val/train_update_time": 1637.629011878511, "val/loss": 6.573862080945075, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.79705565003678, "val/val_tokens_per_second": 344789.6900800615, "val/loss_avg_len_2048": 6.573862080945075, "val/perplexity_len_2048": 716.1302622607554, "val/loss_avg_len_1024": 6.582782233029604, "val/perplexity_len_1024": 722.5468289411266, "val/loss_avg_len_512": 6.593902683746815, "val/perplexity_len_512": 730.626718081672}
|
| 7 |
+
{"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 2999.716264599003, "val/train_update_time": 1777.492755148909, "val/loss": 6.449963170717657, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.52784067601897, "val/val_tokens_per_second": 345572.8187266909, "val/loss_avg_len_2048": 6.449963170717657, "val/perplexity_len_2048": 632.6789912699647, "val/loss_avg_len_1024": 6.4598856550842525, "val/perplexity_len_1024": 638.9879873645075, "val/loss_avg_len_512": 6.472617136281729, "val/perplexity_len_512": 647.1752583647682}
|
| 8 |
+
{"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 3258.342541063903, "val/train_update_time": 1917.3395090608392, "val/loss": 6.350446031192691, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.16394796001259, "val/val_tokens_per_second": 343728.1216441805, "val/loss_avg_len_2048": 6.350446031192691, "val/perplexity_len_2048": 572.7481155748915, "val/loss_avg_len_1024": 6.361272199109942, "val/perplexity_len_1024": 578.982469035299, "val/loss_avg_len_512": 6.375198934513331, "val/perplexity_len_512": 587.1022142184726}
|
| 9 |
+
{"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 3517.6349648769246, "val/train_update_time": 2057.1937958986964, "val/loss": 6.245061218327843, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.16180873999838, "val/val_tokens_per_second": 343734.2923299484, "val/loss_avg_len_2048": 6.245061218327843, "val/perplexity_len_2048": 515.460779607975, "val/loss_avg_len_1024": 6.256822197538241, "val/perplexity_len_1024": 521.5588927204132, "val/loss_avg_len_512": 6.2725997325658795, "val/perplexity_len_512": 529.8530651619924}
|
| 10 |
+
{"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 3776.9122951189056, "val/train_update_time": 2197.046115837642, "val/loss": 6.165311972237379, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.31773394800257, "val/val_tokens_per_second": 343285.09807142284, "val/loss_avg_len_2048": 6.165311972237379, "val/perplexity_len_2048": 475.9496028389593, "val/loss_avg_len_1024": 6.1779335115781056, "val/perplexity_len_1024": 481.99488963534884, "val/loss_avg_len_512": 6.19536230119653, "val/perplexity_len_512": 490.4691103574319}
|
| 11 |
+
{"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 4036.799753597006, "val/train_update_time": 2336.9093162972713, "val/loss": 6.09605717837629, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.27474389097188, "val/val_tokens_per_second": 343408.8279195235, "val/loss_avg_len_2048": 6.09605717837629, "val/perplexity_len_2048": 444.10329349255574, "val/loss_avg_len_1024": 6.1096146122057, "val/perplexity_len_1024": 450.1651935621965, "val/loss_avg_len_512": 6.128595473396592, "val/perplexity_len_512": 458.7913233213098}
|
| 12 |
+
{"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 4296.169194732909, "val/train_update_time": 2476.7631880298723, "val/loss": 6.035459959003074, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.14929955196567, "val/val_tokens_per_second": 343770.3801366935, "val/loss_avg_len_2048": 6.035459959003074, "val/perplexity_len_2048": 417.99102416430844, "val/loss_avg_len_1024": 6.0497336251823235, "val/perplexity_len_1024": 424.00007205896117, "val/loss_avg_len_512": 6.069915484577604, "val/perplexity_len_512": 432.6441149291548}
|
| 13 |
+
{"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 4555.462784307892, "val/train_update_time": 2616.633614034741, "val/loss": 5.978946462819375, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 121.9430886899354, "val/val_tokens_per_second": 335894.39500051504, "val/loss_avg_len_2048": 5.978946462819375, "val/perplexity_len_2048": 395.02397640536174, "val/loss_avg_len_1024": 5.994191604967321, "val/perplexity_len_1024": 401.0923118656187, "val/loss_avg_len_512": 6.015902790261898, "val/perplexity_len_512": 409.8957217942098}
|
| 14 |
+
{"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 4817.594658994931, "val/train_update_time": 2756.5458030648297, "val/loss": 5.9318083871576475, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.64603293093387, "val/val_tokens_per_second": 342343.15168346884, "val/loss_avg_len_2048": 5.9318083871576475, "val/perplexity_len_2048": 376.8353622500592, "val/loss_avg_len_1024": 5.947726577504678, "val/perplexity_len_1024": 382.88189654128325, "val/loss_avg_len_512": 5.970375245298259, "val/perplexity_len_512": 391.6526089792945}
|
| 15 |
+
{"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 5077.3983478549635, "val/train_update_time": 2896.4371623727493, "val/loss": 5.8920511382135325, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.62661699997261, "val/val_tokens_per_second": 345285.0720678434, "val/loss_avg_len_2048": 5.8920511382135325, "val/perplexity_len_2048": 362.1473373280599, "val/loss_avg_len_1024": 5.908480526011203, "val/perplexity_len_1024": 368.1463414158124, "val/loss_avg_len_512": 5.932198676339816, "val/perplexity_len_512": 376.98246571998305}
|
| 16 |
+
{"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 5336.696560251992, "val/train_update_time": 3036.3103124498157, "val/loss": 5.855954391597583, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.72400588693563, "val/val_tokens_per_second": 345001.83593036287, "val/loss_avg_len_2048": 5.855954391597583, "val/perplexity_len_2048": 349.30811775115666, "val/loss_avg_len_1024": 5.873065555681661, "val/perplexity_len_1024": 355.3366164948888, "val/loss_avg_len_512": 5.897885296325106, "val/perplexity_len_512": 364.26633743411116}
|
| 17 |
+
{"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 5595.524489760981, "val/train_update_time": 3176.177008557832, "val/loss": 5.820256409952801, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.2323824360501, "val/val_tokens_per_second": 343530.83586137986, "val/loss_avg_len_2048": 5.820256409952801, "val/perplexity_len_2048": 337.0584676966778, "val/loss_avg_len_1024": 5.837889617230394, "val/perplexity_len_1024": 343.0545996773094, "val/loss_avg_len_512": 5.863783256886014, "val/perplexity_len_512": 352.05353668420764}
|
| 18 |
+
{"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 5854.861399552901, "val/train_update_time": 3316.0491548541468, "val/loss": 5.786057632419194, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.49763245903887, "val/val_tokens_per_second": 345660.91448416625, "val/loss_avg_len_2048": 5.786057632419194, "val/perplexity_len_2048": 325.72635675500464, "val/loss_avg_len_1024": 5.804369036847027, "val/perplexity_len_1024": 331.74580791230153, "val/loss_avg_len_512": 5.831280016766, "val/perplexity_len_512": 340.794622835555}
|
| 19 |
+
{"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 6113.463035934954, "val/train_update_time": 3455.9149778021965, "val/loss": 5.759885396619764, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.62306807097048, "val/val_tokens_per_second": 345295.40220199176, "val/loss_avg_len_2048": 5.759885396619764, "val/perplexity_len_2048": 317.3119618105809, "val/loss_avg_len_1024": 5.778556486419566, "val/perplexity_len_1024": 323.29217678044284, "val/loss_avg_len_512": 5.806458696108102, "val/perplexity_len_512": 332.43976843011467}
|
| 20 |
+
{"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 6372.185268521891, "val/train_update_time": 3595.7855843111174, "val/loss": 5.7300529724414755, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.57840889098588, "val/val_tokens_per_second": 345425.4478794386, "val/loss_avg_len_2048": 5.7300529724414755, "val/perplexity_len_2048": 307.98558269355703, "val/loss_avg_len_1024": 5.7494624780176675, "val/perplexity_len_1024": 314.0218212805507, "val/loss_avg_len_512": 5.778485274661239, "val/perplexity_len_512": 323.2691553957875}
|
| 21 |
+
{"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 6631.451806944911, "val/train_update_time": 3735.644923887099, "val/loss": 5.7072238631833025, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.90358249703422, "val/val_tokens_per_second": 344480.78972743865, "val/loss_avg_len_2048": 5.7072238631833025, "val/perplexity_len_2048": 301.03419507097897, "val/loss_avg_len_1024": 5.727206208104669, "val/perplexity_len_1024": 307.1100671050411, "val/loss_avg_len_512": 5.757021965130791, "val/perplexity_len_512": 316.40466036500106}
|
| 22 |
+
{"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 6890.435664905934, "val/train_update_time": 3875.508329823031, "val/loss": 5.683767916635869, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.32042055693455, "val/val_tokens_per_second": 346178.6207926, "val/loss_avg_len_2048": 5.683767916635869, "val/perplexity_len_2048": 294.0553210975227, "val/loss_avg_len_1024": 5.704273862084427, "val/perplexity_len_1024": 300.1474524536417, "val/loss_avg_len_512": 5.735127307290689, "val/perplexity_len_512": 309.5523765252151}
|
| 23 |
+
{"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 7148.850932181929, "val/train_update_time": 4015.3776673960965, "val/loss": 5.66337805530862, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.15536998491734, "val/val_tokens_per_second": 346662.1957616365, "val/loss_avg_len_2048": 5.66337805530862, "val/perplexity_len_2048": 288.12028676236497, "val/loss_avg_len_1024": 5.684271093870421, "val/perplexity_len_1024": 294.2033202725829, "val/loss_avg_len_512": 5.7158169909281895, "val/perplexity_len_512": 303.6321667294071}
|
| 24 |
+
{"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 7407.169766063918, "val/train_update_time": 4155.28995017719, "val/loss": 5.643267921690471, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.50208243296947, "val/val_tokens_per_second": 345647.93427296064, "val/loss_avg_len_2048": 5.643267921690471, "val/perplexity_len_2048": 282.3840211506362, "val/loss_avg_len_1024": 5.664606115816004, "val/perplexity_len_1024": 288.47433325867826, "val/loss_avg_len_512": 5.697027105129534, "val/perplexity_len_512": 297.9802190065426}
|
| 25 |
+
{"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 7665.789839876001, "val/train_update_time": 4295.186003304552, "val/loss": 5.622699441415019, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.53706486790907, "val/val_tokens_per_second": 345545.927306733, "val/loss_avg_len_2048": 5.622699441415019, "val/perplexity_len_2048": 276.63513656770886, "val/loss_avg_len_1024": 5.6445637688343355, "val/perplexity_len_1024": 282.7501848727848, "val/loss_avg_len_512": 5.677907623612683, "val/perplexity_len_512": 292.3371102776823}
|
| 26 |
+
{"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 7924.9846870569745, "val/train_update_time": 4435.057907043141, "val/loss": 5.604714242819535, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.8660318760667, "val/val_tokens_per_second": 344589.61364762415, "val/loss_avg_len_2048": 5.604714242819535, "val/perplexity_len_2048": 271.7042728930096, "val/loss_avg_len_1024": 5.627045304850158, "val/perplexity_len_1024": 277.83997121976165, "val/loss_avg_len_512": 5.661085981074465, "val/perplexity_len_512": 287.4606499338183}
|
| 27 |
+
{"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 8183.984758203966, "val/train_update_time": 4574.958235046361, "val/loss": 5.590083757191172, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.37259386607911, "val/val_tokens_per_second": 346026.04084472556, "val/loss_avg_len_2048": 5.590083757191172, "val/perplexity_len_2048": 267.7580454362757, "val/loss_avg_len_1024": 5.612942422863492, "val/perplexity_len_1024": 273.9491274375101, "val/loss_avg_len_512": 5.647777539000603, "val/perplexity_len_512": 283.66034071391516}
|
| 28 |
+
{"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 8442.485554668936, "val/train_update_time": 4714.8530240497785, "val/loss": 5.574322423379664, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.52899952605367, "val/val_tokens_per_second": 345569.4400845478, "val/loss_avg_len_2048": 5.574322423379664, "val/perplexity_len_2048": 263.5709056357824, "val/loss_avg_len_1024": 5.597507244674646, "val/perplexity_len_1024": 269.75314005297037, "val/loss_avg_len_512": 5.633031649286917, "val/perplexity_len_512": 279.50820531578756}
|
| 29 |
+
{"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 8701.162048408994, "val/train_update_time": 4854.753052946762, "val/loss": 5.560131643292463, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.59739387896843, "val/val_tokens_per_second": 345370.1524149906, "val/loss_avg_len_2048": 5.560131643292463, "val/perplexity_len_2048": 259.85704250803946, "val/loss_avg_len_1024": 5.583764273371181, "val/perplexity_len_1024": 266.0712881317231, "val/loss_avg_len_512": 5.620282325731626, "val/perplexity_len_512": 275.96728490305725}
|
| 30 |
+
{"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 8959.925166409928, "val/train_update_time": 4994.663271273952, "val/loss": 5.546720407564999, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.35096273303498, "val/val_tokens_per_second": 343189.5232518534, "val/loss_avg_len_2048": 5.546720407564999, "val/perplexity_len_2048": 256.39530344075405, "val/loss_avg_len_1024": 5.570826970474656, "val/perplexity_len_1024": 262.6512142533728, "val/loss_avg_len_512": 5.607915740726924, "val/perplexity_len_512": 272.57552746979076}
|
| 31 |
+
{"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 9220.005188893992, "val/train_update_time": 5134.568920494756, "val/loss": 5.53471503632843, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.7409812399419, "val/val_tokens_per_second": 344952.5140543637, "val/loss_avg_len_2048": 5.53471503632843, "val/perplexity_len_2048": 253.33558591144265, "val/loss_avg_len_1024": 5.559162426136544, "val/perplexity_len_1024": 259.60530661743275, "val/loss_avg_len_512": 5.596828284146019, "val/perplexity_len_512": 269.5700504807311}
|
| 32 |
+
{"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 9478.914532593917, "val/train_update_time": 5274.477632154711, "val/loss": 5.523900157300186, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.79370030597784, "val/val_tokens_per_second": 344799.4287112786, "val/loss_avg_len_2048": 5.523900157300186, "val/perplexity_len_2048": 250.61055420070002, "val/loss_avg_len_1024": 5.5487858315113705, "val/perplexity_len_1024": 256.92541570504636, "val/loss_avg_len_512": 5.5871342391912835, "val/perplexity_len_512": 266.96945181888873}
|
| 33 |
+
{"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 9737.887706990936, "val/train_update_time": 5414.394234810607, "val/loss": 5.513743215830738, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.89046923699789, "val/val_tokens_per_second": 344518.784919166, "val/loss_avg_len_2048": 5.513743215830738, "val/perplexity_len_2048": 248.07800074089812, "val/loss_avg_len_1024": 5.538833538455277, "val/perplexity_len_1024": 254.38110056013733, "val/loss_avg_len_512": 5.577749973074766, "val/perplexity_len_512": 264.47585801169674}
|
| 34 |
+
{"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 9996.965703879949, "val/train_update_time": 5554.316489073681, "val/loss": 5.503565442725533, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.39389265398495, "val/val_tokens_per_second": 345963.7915589842, "val/loss_avg_len_2048": 5.503565442725533, "val/perplexity_len_2048": 245.56592449284855, "val/loss_avg_len_1024": 5.5291210433123865, "val/perplexity_len_1024": 251.92238481690993, "val/loss_avg_len_512": 5.568761392938602, "val/perplexity_len_512": 262.1092477350233}
|
| 35 |
+
{"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 10255.549305356923, "val/train_update_time": 5694.239998900797, "val/loss": 5.495382670491731, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.52405338105746, "val/val_tokens_per_second": 345583.86109452986, "val/loss_avg_len_2048": 5.495382670491731, "val/perplexity_len_2048": 243.56471335816636, "val/loss_avg_len_1024": 5.521260392003751, "val/perplexity_len_1024": 249.9498735613677, "val/loss_avg_len_512": 5.5613877866359545, "val/perplexity_len_512": 260.1836653018214}
|
| 36 |
+
{"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 10514.781046306947, "val/train_update_time": 5834.130611701286, "val/loss": 5.48703303927928, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.9761282489635, "val/val_tokens_per_second": 344270.7423987537, "val/loss_avg_len_2048": 5.48703303927928, "val/perplexity_len_2048": 241.53950446478578, "val/loss_avg_len_1024": 5.513113558725175, "val/perplexity_len_1024": 247.9218458321809, "val/loss_avg_len_512": 5.553624488961476, "val/perplexity_len_512": 258.17160228548784}
|
| 37 |
+
{"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 10773.90538297291, "val/train_update_time": 5974.028263681685, "val/loss": 5.480334459244821, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.07169013505336, "val/val_tokens_per_second": 343994.44530889246, "val/loss_avg_len_2048": 5.480334459244821, "val/perplexity_len_2048": 239.9269397393566, "val/loss_avg_len_1024": 5.506674018040026, "val/perplexity_len_1024": 246.33047237547763, "val/loss_avg_len_512": 5.547568697430181, "val/perplexity_len_512": 256.61289325471205}
|
| 38 |
+
{"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 11033.109079251997, "val/train_update_time": 6113.921902889037, "val/loss": 5.474184480784086, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.07278911396861, "val/val_tokens_per_second": 343991.2704219584, "val/loss_avg_len_2048": 5.474184480784086, "val/perplexity_len_2048": 238.4559222272448, "val/loss_avg_len_1024": 5.5007993760133225, "val/perplexity_len_1024": 244.88761132568, "val/loss_avg_len_512": 5.542205744937714, "val/perplexity_len_512": 255.24037416617912}
|
| 39 |
+
{"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 11292.312823130982, "val/train_update_time": 6253.814012841205, "val/loss": 5.468393029721306, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.41189982998185, "val/val_tokens_per_second": 345911.1800318311, "val/loss_avg_len_2048": 5.468393029721306, "val/perplexity_len_2048": 237.07890772793579, "val/loss_avg_len_1024": 5.495152589550743, "val/perplexity_len_1024": 243.50868020605293, "val/loss_avg_len_512": 5.5369287875737765, "val/perplexity_len_512": 253.8970290987337}
|
| 40 |
+
{"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 11550.859713669983, "val/train_update_time": 6393.710524166352, "val/loss": 5.463633170905521, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.55901667405851, "val/val_tokens_per_second": 345481.94771728665, "val/loss_avg_len_2048": 5.463633170905521, "val/perplexity_len_2048": 235.95312700310126, "val/loss_avg_len_1024": 5.49054511497196, "val/perplexity_len_1024": 242.38930088842343, "val/loss_avg_len_512": 5.532540454801778, "val/perplexity_len_512": 252.78528557959896}
|
| 41 |
+
{"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 11810.107688057935, "val/train_update_time": 6533.614025922609, "val/loss": 5.45942693974147, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.30903941800352, "val/val_tokens_per_second": 346211.9226180359, "val/loss_avg_len_2048": 5.45942693974147, "val/perplexity_len_2048": 234.9627379698483, "val/loss_avg_len_1024": 5.486521510563948, "val/perplexity_len_1024": 241.41598166778738, "val/loss_avg_len_512": 5.528677167296701, "val/perplexity_len_512": 251.8105873264233}
|
| 42 |
+
{"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 12068.55913664191, "val/train_update_time": 6673.514490786707, "val/loss": 5.455808992347491, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.4850783480797, "val/val_tokens_per_second": 345697.539057785, "val/loss_avg_len_2048": 5.455808992347491, "val/perplexity_len_2048": 234.11419106894314, "val/loss_avg_len_1024": 5.483065307796094, "val/perplexity_len_1024": 240.58303932171194, "val/loss_avg_len_512": 5.525581023516459, "val/perplexity_len_512": 251.03215123954533}
|
| 43 |
+
{"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 12327.182909888914, "val/train_update_time": 6813.415315568796, "val/loss": 5.453029937104025, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.3482397699263, "val/val_tokens_per_second": 346097.2472394002, "val/loss_avg_len_2048": 5.453029937104025, "val/perplexity_len_2048": 233.46447801107593, "val/loss_avg_len_1024": 5.480366042145022, "val/perplexity_len_1024": 239.93451744761228, "val/loss_avg_len_512": 5.522991253441363, "val/perplexity_len_512": 250.38287678504003}
|
| 44 |
+
{"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 12585.657294680947, "val/train_update_time": 6953.310590816545, "val/loss": 5.450687895292766, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.0683747169096, "val/val_tokens_per_second": 344004.02371649264, "val/loss_avg_len_2048": 5.450687895292766, "val/perplexity_len_2048": 232.918334237547, "val/loss_avg_len_1024": 5.478113070084702, "val/perplexity_len_1024": 239.39456016622754, "val/loss_avg_len_512": 5.520896892670367, "val/perplexity_len_512": 249.85903346010718}
|
| 45 |
+
{"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 12844.891541385907, "val/train_update_time": 7093.223455801839, "val/loss": 5.4488968286233375, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.42289469996467, "val/val_tokens_per_second": 342982.8099788316, "val/loss_avg_len_2048": 5.4488968286233375, "val/perplexity_len_2048": 232.5015353411223, "val/loss_avg_len_1024": 5.4763672707772235, "val/perplexity_len_1024": 238.97698991186206, "val/loss_avg_len_512": 5.519267403276684, "val/perplexity_len_512": 249.45222235237557}
|
| 46 |
+
{"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 13105.045270575909, "val/train_update_time": 7233.1264379567, "val/loss": 5.447661420757801, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.2398596740095, "val/val_tokens_per_second": 343509.293888644, "val/loss_avg_len_2048": 5.447661420757801, "val/perplexity_len_2048": 232.2144784682815, "val/loss_avg_len_1024": 5.475185846520326, "val/perplexity_len_1024": 238.69482341113377, "val/loss_avg_len_512": 5.51811999600099, "val/perplexity_len_512": 249.16616320206958}
|
| 47 |
+
{"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 13364.448509541922, "val/train_update_time": 7373.0204275009455, "val/loss": 5.446748636367329, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.28055162995588, "val/val_tokens_per_second": 343392.1074331567, "val/loss_avg_len_2048": 5.446748636367329, "val/perplexity_len_2048": 232.00261342535612, "val/loss_avg_len_1024": 5.474293929352122, "val/perplexity_len_1024": 238.48202231475486, "val/loss_avg_len_512": 5.517301595225371, "val/perplexity_len_512": 248.96232884132354}
|
| 48 |
+
{"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 13623.899586633896, "val/train_update_time": 7512.917904903879, "val/loss": 5.446271972442274, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 118.65777256898582, "val/val_tokens_per_second": 345194.4117372208, "val/loss_avg_len_2048": 5.446271972442274, "val/perplexity_len_2048": 231.89205250131306, "val/loss_avg_len_1024": 5.473838324508781, "val/perplexity_len_1024": 238.3733934981252, "val/loss_avg_len_512": 5.516900151270465, "val/perplexity_len_512": 248.86240447776873}
|
| 49 |
+
{"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 13882.689546601963, "val/train_update_time": 7652.812096997048, "val/loss": 5.446056043829591, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 119.43243380391505, "val/val_tokens_per_second": 342955.41583995847, "val/loss_avg_len_2048": 5.446056043829591, "val/perplexity_len_2048": 231.84198577773796, "val/loss_avg_len_1024": 5.473623460931872, "val/perplexity_len_1024": 238.32218124017967, "val/loss_avg_len_512": 5.516683546149684, "val/perplexity_len_512": 248.80850544420304}
|
metrics/npz/train_eval/step-000000104857600.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ebe2af4f21c7e0d61033e499ab1f560171c17e9433e1840361ec83d995a4e693
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000209715200.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b27be315fb8b50e4a1294c44349c97e37ecee415e1d751c416e137a7f019b91d
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000314572800.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:43650bcb683fadeca0bc902154b85c1478ddf4ba8c964c9a7c8e60133a2d4543
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000419430400.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:24675daf5b5a30e28d02aeab0a297fc6340ddc472810d7c76fa77831f9b13628
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000524288000.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e06512f1cc6a99bd126fb9509e9ec8b4e71c245058ce2c5afa7079be6c88168c
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000629145600.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e569fd791b0d40a459be09259ecf03f658ef5a7548626494d29f910625cd9e0c
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000734003200.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:db454625a77c3c13a01875a67b4346c05c4c22769385e9abff513a392ed03fda
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000838860800.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cfe8ed28d202e36594e637d3e893ec7bc8a9804f862d4cc10d386991c35f5427
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000943718400.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:29de7c0b6c6d0a60fb84f581d2a82c5d6a2e3bce3ba2d427c983ff440bb33873
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001048576000.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ca5679212a6611bb7be7ed25d531e2f54a00a24248d69d75ef32be1a83597fc7
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001153433600.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ce50ba7c9f1f4a503794de94d3311305a62ee74cd23ccedfd16d877a5c6fcffd
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001258291200.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ee2dce443eb1c38a07e162379ba0638092c32d778cf325c7bfd8cc45fae6012a
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001363148800.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7bb7178988be17dd5cf70feb006d231677c1d56581bfe5daaabb72d7097c03c4
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001468006400.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e5e1750313e400d5c97e86d9af029477b578f97cec1a54d15525003e9ff97d42
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001572864000.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:48dc7d3a4adb3884b80a0e04cf96d710d871d6690b4ff0475e98776bea507cc6
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001677721600.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c44931dbae1f81becc0e142591963ae32aec2c1342603c2ac058471f14a7330d
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001782579200.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a87dabfb6d488cadb9789cc73d79fd5d94feedf35803f6dfcf616f7b12cb6c6c
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001887436800.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:52e60241e961a954d8c28410846a794df492a465f85a8df23a2b8cf2b56efff3
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001992294400.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e2f00d1745798d0cf989730009f629f34788c0e91ab471c5a5a711ba8c71337
|
| 3 |
size 20540
|
metrics/npz/val/step-000000041943040.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:417b767d4fe37036ea1b72a1b7b75bacc99cdee5a8335eb67a2a9dc0ba618302
|
| 3 |
size 21142
|
metrics/npz/val/step-000000083886080.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6bcc085766a45046f7ef30e48b25da2421887464722c53f3ea61024b334b6512
|
| 3 |
size 21142
|
metrics/npz/val/step-000000125829120.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eddf97a53501003d621a22d3fab9049de2ef47e828e4936b93287e65f15fa940
|
| 3 |
size 21142
|
metrics/npz/val/step-000000167772160.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:19b40daa279004faa8e35d4ef549e09b15b591cd201f1a64d31b0ab4ef236683
|
| 3 |
size 21142
|
metrics/npz/val/step-000000209715200.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e3b720c531041ae771dd09fbb8f987216e302645e85bf5d7fa11d8109595c6a
|
| 3 |
size 21142
|
metrics/npz/val/step-000000251658240.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:46738cb86d77f2dfeb34c5240c99d0994b2c21c3e628e8deaa6808052bc6c392
|
| 3 |
size 21142
|
metrics/npz/val/step-000000293601280.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:017553bdfcefb2ab3bc3d6874c2f7b842f3dab8362c02bb94f4e4e2d0aacee1f
|
| 3 |
size 21142
|
metrics/npz/val/step-000000335544320.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d7674fbb1508dda2c7a9fb9eb273312bdc7ff3f13e49dae287c9700442308fdb
|
| 3 |
size 21142
|
metrics/npz/val/step-000000377487360.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:da5eaa606f68afeac16287d3060ddb68697071ca07d06999c4651fabb7f73302
|
| 3 |
size 21142
|
metrics/npz/val/step-000000419430400.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:96a7fd68774ed286c16d89b1c1daf90290d97796f98c646687923a9a9f7af869
|
| 3 |
size 21142
|
metrics/npz/val/step-000000461373440.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:016c8f904334cf8409d2b094468f842db1a95d80d73508dfd685cf4acca35e68
|
| 3 |
size 21142
|
metrics/npz/val/step-000000503316480.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d1f7312f50a260dc0c7896b1b4b470e42caebfc963de6e73d582cb8265711181
|
| 3 |
size 21142
|
metrics/npz/val/step-000000545259520.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:40b8ab66312b12f7ec52e87910df1bbbcd2e1e96fb0484c66a80964a67145eff
|
| 3 |
size 21142
|
metrics/npz/val/step-000000587202560.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2257f97e00c17e884cf0dbdccd5fb80af23c64d3f14ef51ab59848646e2c7e9c
|
| 3 |
size 21142
|
metrics/npz/val/step-000000629145600.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a32f40fcc0a41995711d030ac71f9f82170d19a8b66436eb8e5361659bcd0f71
|
| 3 |
size 21142
|