add remote code + model files
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- checkpoints/step-000000209715200.pt +1 -1
- checkpoints/step-000000419430400.pt +1 -1
- checkpoints/step-000000629145600.pt +1 -1
- checkpoints/step-000000838860800.pt +1 -1
- checkpoints/step-000001048576000.pt +1 -1
- checkpoints/step-000001258291200.pt +1 -1
- checkpoints/step-000001468006400.pt +1 -1
- checkpoints/step-000001677721600.pt +1 -1
- checkpoints/step-000001887436800.pt +1 -1
- logs/2025-10-25_23-30-02.log +336 -0
- metrics/jsonlines/checkpoint.jsonl +9 -9
- metrics/jsonlines/throughput.jsonl +0 -0
- metrics/jsonlines/train.jsonl +98 -98
- metrics/jsonlines/train_eval.jsonl +19 -19
- metrics/jsonlines/val.jsonl +49 -49
- metrics/npz/train_eval/step-000000104857600.npz +1 -1
- metrics/npz/train_eval/step-000000209715200.npz +1 -1
- metrics/npz/train_eval/step-000000314572800.npz +1 -1
- metrics/npz/train_eval/step-000000419430400.npz +1 -1
- metrics/npz/train_eval/step-000000524288000.npz +1 -1
- metrics/npz/train_eval/step-000000629145600.npz +1 -1
- metrics/npz/train_eval/step-000000734003200.npz +1 -1
- metrics/npz/train_eval/step-000000838860800.npz +1 -1
- metrics/npz/train_eval/step-000000943718400.npz +1 -1
- metrics/npz/train_eval/step-000001048576000.npz +1 -1
- metrics/npz/train_eval/step-000001153433600.npz +1 -1
- metrics/npz/train_eval/step-000001258291200.npz +1 -1
- metrics/npz/train_eval/step-000001363148800.npz +1 -1
- metrics/npz/train_eval/step-000001468006400.npz +1 -1
- metrics/npz/train_eval/step-000001572864000.npz +1 -1
- metrics/npz/train_eval/step-000001677721600.npz +1 -1
- metrics/npz/train_eval/step-000001782579200.npz +1 -1
- metrics/npz/train_eval/step-000001887436800.npz +1 -1
- metrics/npz/train_eval/step-000001992294400.npz +1 -1
- metrics/npz/val/step-000000041943040.npz +1 -1
- metrics/npz/val/step-000000083886080.npz +1 -1
- metrics/npz/val/step-000000125829120.npz +1 -1
- metrics/npz/val/step-000000167772160.npz +1 -1
- metrics/npz/val/step-000000209715200.npz +1 -1
- metrics/npz/val/step-000000251658240.npz +1 -1
- metrics/npz/val/step-000000293601280.npz +1 -1
- metrics/npz/val/step-000000335544320.npz +1 -1
- metrics/npz/val/step-000000377487360.npz +1 -1
- metrics/npz/val/step-000000419430400.npz +1 -1
- metrics/npz/val/step-000000461373440.npz +1 -1
- metrics/npz/val/step-000000503316480.npz +1 -1
- metrics/npz/val/step-000000545259520.npz +1 -1
- metrics/npz/val/step-000000587202560.npz +1 -1
- metrics/npz/val/step-000000629145600.npz +1 -1
- metrics/npz/val/step-000000671088640.npz +1 -1
checkpoints/step-000000209715200.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 329410370
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c76bafdf260002e33ccc2496056b2ec692050dffb45c2ab145deda8b0c47e849
|
| 3 |
size 329410370
|
checkpoints/step-000000419430400.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 329410370
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c48a7b84490d2d0075b35bb0e305586cadb5fbcec67d648b918f38745f445831
|
| 3 |
size 329410370
|
checkpoints/step-000000629145600.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 329410370
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9118a021df5fba9797e8ea3041dfb88c3d060b28c2c2049c7c63003395da40a9
|
| 3 |
size 329410370
|
checkpoints/step-000000838860800.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 329410370
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bc5afe5e3a7c1be2237bc906cd232b7dcb544c5378e5a75ef84d5a558cece1a7
|
| 3 |
size 329410370
|
checkpoints/step-000001048576000.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 329410370
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2dcf9d8d189648809a8d7edf99c39245f16b33442591cc11bc79462e4152357b
|
| 3 |
size 329410370
|
checkpoints/step-000001258291200.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 329410370
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6c16a5b164d28e547c299dffe98636483e32b8c04c0c926cf9c71d8e1d5215cf
|
| 3 |
size 329410370
|
checkpoints/step-000001468006400.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 329410370
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e7ac2eb93bb1ef7c0bc025c2855a12385ab5e1e3d5b34e96a0ccd15ae587a74e
|
| 3 |
size 329410370
|
checkpoints/step-000001677721600.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 329410370
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0a2884dd89eb6202c9f0b572e5eaaec3aea12732f0df686fe07dbebe031aad3c
|
| 3 |
size 329410370
|
checkpoints/step-000001887436800.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 329410370
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b36a74cfc772001dd3cb75c15e507d806eaaf651063a658b23c93a08c580b57c
|
| 3 |
size 329410370
|
logs/2025-10-25_23-30-02.log
ADDED
|
@@ -0,0 +1,336 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2025-10-25 23:30:02][train:372][INFO] All outputs will be saved to `/workspace/forgetting-transformer/alibi_2_4_256`
|
| 2 |
+
[2025-10-25 23:30:02][train:375][INFO] Configuration:
|
| 3 |
+
[2025-10-25 23:30:02][train:380][INFO] Configuration saved to /workspace/forgetting-transformer/alibi_2_4_256/config.yaml.
|
| 4 |
+
[2025-10-25 23:30:02][train:387][INFO] creating datamodule
|
| 5 |
+
[2025-10-25 23:30:02][train:419][INFO] creating model
|
| 6 |
+
[2025-10-25 23:30:03][train:440][INFO] creating optimizer
|
| 7 |
+
[2025-10-25 23:30:03][checkpoint:39][INFO] Not resuming. Deleting existing checkpoints...
|
| 8 |
+
[2025-10-25 23:30:03][logger:256][INFO] Setting up wandb logger...
|
| 9 |
+
[2025-10-25 23:30:03][logger:272][INFO] Not resuming. Creating a new wandb run.
|
| 10 |
+
[2025-10-25 23:30:04][logger:288][INFO] wandb initialized. Run id: t14066ds
|
| 11 |
+
[2025-10-25 23:30:04][logger:186][INFO] Setting up jsonlines logger...
|
| 12 |
+
[2025-10-25 23:30:04][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/resume.jsonl since we are not resuming
|
| 13 |
+
[2025-10-25 23:30:04][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/train_data_info.jsonl since we are not resuming
|
| 14 |
+
[2025-10-25 23:30:04][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/val_data_info.jsonl since we are not resuming
|
| 15 |
+
[2025-10-25 23:30:04][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/model_info.jsonl since we are not resuming
|
| 16 |
+
[2025-10-25 23:30:04][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/train.jsonl since we are not resuming
|
| 17 |
+
[2025-10-25 23:30:04][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/throughput.jsonl since we are not resuming
|
| 18 |
+
[2025-10-25 23:30:04][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/norm.jsonl since we are not resuming
|
| 19 |
+
[2025-10-25 23:30:04][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/val.jsonl since we are not resuming
|
| 20 |
+
[2025-10-25 23:30:04][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/train_eval.jsonl since we are not resuming
|
| 21 |
+
[2025-10-25 23:30:04][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/checkpoint.jsonl since we are not resuming
|
| 22 |
+
[2025-10-25 23:30:04][logger:113][INFO] Setting up npz logger...
|
| 23 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000041943040.npz since we are not resuming
|
| 24 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000083886080.npz since we are not resuming
|
| 25 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000125829120.npz since we are not resuming
|
| 26 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000167772160.npz since we are not resuming
|
| 27 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000209715200.npz since we are not resuming
|
| 28 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000251658240.npz since we are not resuming
|
| 29 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000293601280.npz since we are not resuming
|
| 30 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000335544320.npz since we are not resuming
|
| 31 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000377487360.npz since we are not resuming
|
| 32 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000419430400.npz since we are not resuming
|
| 33 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000461373440.npz since we are not resuming
|
| 34 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000503316480.npz since we are not resuming
|
| 35 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000545259520.npz since we are not resuming
|
| 36 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000587202560.npz since we are not resuming
|
| 37 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000629145600.npz since we are not resuming
|
| 38 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000671088640.npz since we are not resuming
|
| 39 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000713031680.npz since we are not resuming
|
| 40 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000754974720.npz since we are not resuming
|
| 41 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000796917760.npz since we are not resuming
|
| 42 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000838860800.npz since we are not resuming
|
| 43 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000880803840.npz since we are not resuming
|
| 44 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000922746880.npz since we are not resuming
|
| 45 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000964689920.npz since we are not resuming
|
| 46 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001006632960.npz since we are not resuming
|
| 47 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001048576000.npz since we are not resuming
|
| 48 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001090519040.npz since we are not resuming
|
| 49 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001132462080.npz since we are not resuming
|
| 50 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001174405120.npz since we are not resuming
|
| 51 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001216348160.npz since we are not resuming
|
| 52 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001258291200.npz since we are not resuming
|
| 53 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001300234240.npz since we are not resuming
|
| 54 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001342177280.npz since we are not resuming
|
| 55 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001384120320.npz since we are not resuming
|
| 56 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001426063360.npz since we are not resuming
|
| 57 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001468006400.npz since we are not resuming
|
| 58 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001509949440.npz since we are not resuming
|
| 59 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001551892480.npz since we are not resuming
|
| 60 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001593835520.npz since we are not resuming
|
| 61 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001635778560.npz since we are not resuming
|
| 62 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001677721600.npz since we are not resuming
|
| 63 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001719664640.npz since we are not resuming
|
| 64 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001761607680.npz since we are not resuming
|
| 65 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001803550720.npz since we are not resuming
|
| 66 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001845493760.npz since we are not resuming
|
| 67 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001887436800.npz since we are not resuming
|
| 68 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001929379840.npz since we are not resuming
|
| 69 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001971322880.npz since we are not resuming
|
| 70 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000002013265920.npz since we are not resuming
|
| 71 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000002055208960.npz since we are not resuming
|
| 72 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000000104857600.npz since we are not resuming
|
| 73 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000000209715200.npz since we are not resuming
|
| 74 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000000314572800.npz since we are not resuming
|
| 75 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000000419430400.npz since we are not resuming
|
| 76 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000000524288000.npz since we are not resuming
|
| 77 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000000629145600.npz since we are not resuming
|
| 78 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000000734003200.npz since we are not resuming
|
| 79 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000000838860800.npz since we are not resuming
|
| 80 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000000943718400.npz since we are not resuming
|
| 81 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000001048576000.npz since we are not resuming
|
| 82 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000001153433600.npz since we are not resuming
|
| 83 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000001258291200.npz since we are not resuming
|
| 84 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000001363148800.npz since we are not resuming
|
| 85 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000001468006400.npz since we are not resuming
|
| 86 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000001572864000.npz since we are not resuming
|
| 87 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000001677721600.npz since we are not resuming
|
| 88 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000001782579200.npz since we are not resuming
|
| 89 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000001887436800.npz since we are not resuming
|
| 90 |
+
[2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000001992294400.npz since we are not resuming
|
| 91 |
+
[2025-10-25 23:30:04][logger:171][INFO] [step: 0] [train_data_info/vocab_size: 50277] [train_data_info/global_tokens_per_batch: 2097152] [train_data_info/local_tokens_per_batch: 2097152] [train_data_info/batch_len: 2048] [train_data_info/seq_len: 2048] [train_data_info/total_tokens: 2055208960] [train_data_info/global_batch_size: 1024] [train_data_info/local_batch_size: 1024]
|
| 92 |
+
[2025-10-25 23:30:04][logger:171][INFO] [step: 0] [val_data_info/vocab_size: 50277] [val_data_info/global_tokens_per_batch: 2048] [val_data_info/local_tokens_per_batch: 2048] [val_data_info/batch_len: 2048] [val_data_info/seq_len: 2048] [val_data_info/total_tokens: 2147483648] [val_data_info/global_batch_size: 1] [val_data_info/local_batch_size: 1]
|
| 93 |
+
[2025-10-25 23:30:04][logger:171][INFO] [step: 0] [model_info/total_params: 27447040] [model_info/trainable_params: 27447040] [model_info/embedding_params: 12870912] [model_info/flops_per_token: 0] [model_info/non_embedding_params: 14576128]
|
| 94 |
+
[2025-10-25 23:31:17][utils:57][INFO] [P: 1.00%] [S: 20971520/2097152000] [T: 0:01:13] [ETA: 2:00:37] [loss: 9.762] [tokens/s: 302926.081] [batches/s: 0.144] [MFU: 0.000] [TFLOPS: 0.000]
|
| 95 |
+
[2025-10-25 23:32:26][utils:57][INFO] [P: 2.00%] [S: 41943040/2097152000] [T: 0:02:22] [ETA: 1:56:11] [loss: 8.127] [tokens/s: 303066.016] [batches/s: 0.145] [MFU: 0.000] [TFLOPS: 0.000]
|
| 96 |
+
[2025-10-25 23:32:26][train:194][INFO] Running validation...
|
| 97 |
+
[2025-10-25 23:33:57][logger:171][INFO] [step: 41943040] [val/train_token_count: 41943040] [val/train_batch_count: 20] [val/train_flop_count: 0] [val/train_total_time: 142.274] [val/train_update_time: 141.950] [val/loss: 8.017] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.296] [val/val_tokens_per_second: 453619.578] [val/loss_avg_len_2048: 8.017] [val/perplexity_len_2048: 3033.047] [val/loss_avg_len_1024: 8.016] [val/perplexity_len_1024: 3029.391] [val/loss_avg_len_512: 8.017] [val/perplexity_len_512: 3030.800]
|
| 98 |
+
[2025-10-25 23:35:06][utils:57][INFO] [P: 3.00%] [S: 62914560/2097152000] [T: 0:05:01] [ETA: 2:42:35] [loss: 7.520] [tokens/s: 209051.322] [batches/s: 0.100] [MFU: 0.000] [TFLOPS: 0.000]
|
| 99 |
+
[2025-10-25 23:36:15][utils:57][INFO] [P: 4.00%] [S: 83886080/2097152000] [T: 0:06:10] [ETA: 2:28:20] [loss: 7.193] [tokens/s: 227164.362] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000]
|
| 100 |
+
[2025-10-25 23:36:15][train:194][INFO] Running validation...
|
| 101 |
+
[2025-10-25 23:37:45][logger:171][INFO] [step: 83886080] [val/train_token_count: 83886080] [val/train_batch_count: 40] [val/train_flop_count: 0] [val/train_total_time: 370.841] [val/train_update_time: 279.983] [val/loss: 7.169] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.330] [val/val_tokens_per_second: 453450.719] [val/loss_avg_len_2048: 7.169] [val/perplexity_len_2048: 1298.380] [val/loss_avg_len_1024: 7.169] [val/perplexity_len_1024: 1298.934] [val/loss_avg_len_512: 7.173] [val/perplexity_len_512: 1303.240]
|
| 102 |
+
[2025-10-25 23:38:54][utils:57][INFO] [P: 5.00%] [S: 104857600/2097152000] [T: 0:08:50] [ETA: 2:47:55] [loss: 6.947] [tokens/s: 197807.885] [batches/s: 0.094] [MFU: 0.000] [TFLOPS: 0.000]
|
| 103 |
+
[2025-10-25 23:38:54][logger:171][INFO] [step: 104857600] [train_eval/train_token_count: 104857600] [train_eval/train_batch_count: 50] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 530.295] [train_eval/train_update_time: 348.975] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 8.263] [train_eval/perplexity_len_2048: 3876.799] [train_eval/loss_avg_len_1024: 8.264] [train_eval/perplexity_len_1024: 3880.087] [train_eval/loss_avg_len_512: 8.264] [train_eval/perplexity_len_512: 3883.218]
|
| 104 |
+
[2025-10-25 23:40:03][utils:57][INFO] [P: 6.00%] [S: 125829120/2097152000] [T: 0:09:59] [ETA: 2:36:30] [loss: 6.680] [tokens/s: 210212.973] [batches/s: 0.100] [MFU: 0.000] [TFLOPS: 0.000]
|
| 105 |
+
[2025-10-25 23:40:03][train:194][INFO] Running validation...
|
| 106 |
+
[2025-10-25 23:41:34][logger:171][INFO] [step: 125829120] [val/train_token_count: 125829120] [val/train_batch_count: 60] [val/train_flop_count: 0] [val/train_total_time: 599.401] [val/train_update_time: 417.947] [val/loss: 6.680] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.348] [val/val_tokens_per_second: 453356.595] [val/loss_avg_len_2048: 6.680] [val/perplexity_len_2048: 796.683] [val/loss_avg_len_1024: 6.682] [val/perplexity_len_1024: 797.888] [val/loss_avg_len_512: 6.688] [val/perplexity_len_512: 802.766]
|
| 107 |
+
[2025-10-25 23:42:43][utils:57][INFO] [P: 7.00%] [S: 146800640/2097152000] [T: 0:12:38] [ETA: 2:48:02] [loss: 6.480] [tokens/s: 193430.688] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
|
| 108 |
+
[2025-10-25 23:43:52][utils:57][INFO] [P: 8.00%] [S: 167772160/2097152000] [T: 0:13:47] [ETA: 2:38:41] [loss: 6.282] [tokens/s: 202736.609] [batches/s: 0.097] [MFU: 0.000] [TFLOPS: 0.000]
|
| 109 |
+
[2025-10-25 23:43:52][train:194][INFO] Running validation...
|
| 110 |
+
[2025-10-25 23:45:22][logger:171][INFO] [step: 167772160] [val/train_token_count: 167772160] [val/train_batch_count: 80] [val/train_flop_count: 0] [val/train_total_time: 827.992] [val/train_update_time: 555.907] [val/loss: 6.256] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.297] [val/val_tokens_per_second: 453613.474] [val/loss_avg_len_2048: 6.256] [val/perplexity_len_2048: 521.387] [val/loss_avg_len_1024: 6.259] [val/perplexity_len_1024: 522.894] [val/loss_avg_len_512: 6.268] [val/perplexity_len_512: 527.534]
|
| 111 |
+
[2025-10-25 23:46:31][utils:57][INFO] [P: 9.00%] [S: 188743680/2097152000] [T: 0:16:27] [ETA: 2:46:24] [loss: 6.123] [tokens/s: 191111.243] [batches/s: 0.091] [MFU: 0.000] [TFLOPS: 0.000]
|
| 112 |
+
[2025-10-25 23:47:41][utils:57][INFO] [P: 10.00%] [S: 209715200/2097152000] [T: 0:17:36] [ETA: 2:38:28] [loss: 5.972] [tokens/s: 198537.821] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 113 |
+
[2025-10-25 23:47:41][logger:171][INFO] [step: 209715200] [train_eval/train_token_count: 209715200] [train_eval/train_batch_count: 100] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1056.534] [train_eval/train_update_time: 693.881] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.399] [train_eval/perplexity_len_2048: 601.304] [train_eval/loss_avg_len_1024: 6.403] [train_eval/perplexity_len_1024: 603.875] [train_eval/loss_avg_len_512: 6.410] [train_eval/perplexity_len_512: 607.701]
|
| 114 |
+
[2025-10-25 23:47:41][train:194][INFO] Running validation...
|
| 115 |
+
[2025-10-25 23:49:11][logger:171][INFO] [step: 209715200] [val/train_token_count: 209715200] [val/train_batch_count: 100] [val/train_flop_count: 0] [val/train_total_time: 1056.534] [val/train_update_time: 693.881] [val/loss: 5.960] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.285] [val/val_tokens_per_second: 453676.331] [val/loss_avg_len_2048: 5.960] [val/perplexity_len_2048: 387.490] [val/loss_avg_len_1024: 5.964] [val/perplexity_len_1024: 389.067] [val/loss_avg_len_512: 5.975] [val/perplexity_len_512: 393.380]
|
| 116 |
+
[2025-10-25 23:49:11][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000209715200.pt...
|
| 117 |
+
[2025-10-25 23:49:11][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000209715200.pt.
|
| 118 |
+
[2025-10-25 23:49:11][logger:171][INFO] [step: 209715200] [checkpoint/checkpoint_time: 0.425]
|
| 119 |
+
[2025-10-25 23:50:20][utils:57][INFO] [P: 11.00%] [S: 230686720/2097152000] [T: 0:20:16] [ETA: 2:44:01] [loss: 5.850] [tokens/s: 182711.177] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 120 |
+
[2025-10-25 23:51:29][utils:57][INFO] [P: 12.00%] [S: 251658240/2097152000] [T: 0:21:25] [ETA: 2:37:06] [loss: 5.718] [tokens/s: 198492.689] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 121 |
+
[2025-10-25 23:51:29][train:194][INFO] Running validation...
|
| 122 |
+
[2025-10-25 23:53:00][logger:171][INFO] [step: 251658240] [val/train_token_count: 251658240] [val/train_batch_count: 120] [val/train_flop_count: 0] [val/train_total_time: 1285.478] [val/train_update_time: 831.841] [val/loss: 5.730] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.295] [val/val_tokens_per_second: 453622.098] [val/loss_avg_len_2048: 5.730] [val/perplexity_len_2048: 307.853] [val/loss_avg_len_1024: 5.735] [val/perplexity_len_1024: 309.410] [val/loss_avg_len_512: 5.747] [val/perplexity_len_512: 313.342]
|
| 123 |
+
[2025-10-25 23:54:09][utils:57][INFO] [P: 13.00%] [S: 272629760/2097152000] [T: 0:24:04] [ETA: 2:41:09] [loss: 5.644] [tokens/s: 182721.034] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 124 |
+
[2025-10-25 23:55:18][utils:57][INFO] [P: 14.00%] [S: 293601280/2097152000] [T: 0:25:14] [ETA: 2:35:00] [loss: 5.570] [tokens/s: 198502.780] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 125 |
+
[2025-10-25 23:55:18][train:194][INFO] Running validation...
|
| 126 |
+
[2025-10-25 23:56:49][logger:171][INFO] [step: 293601280] [val/train_token_count: 293601280] [val/train_batch_count: 140] [val/train_flop_count: 0] [val/train_total_time: 1514.021] [val/train_update_time: 969.814] [val/loss: 5.542] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.563] [val/val_tokens_per_second: 452284.018] [val/loss_avg_len_2048: 5.542] [val/perplexity_len_2048: 255.165] [val/loss_avg_len_1024: 5.548] [val/perplexity_len_1024: 256.708] [val/loss_avg_len_512: 5.562] [val/perplexity_len_512: 260.306]
|
| 127 |
+
[2025-10-25 23:57:58][utils:57][INFO] [P: 15.00%] [S: 314572800/2097152000] [T: 0:27:53] [ETA: 2:38:04] [loss: 5.445] [tokens/s: 182680.714] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 128 |
+
[2025-10-25 23:57:58][logger:171][INFO] [step: 314572800] [train_eval/train_token_count: 314572800] [train_eval/train_batch_count: 150] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1673.720] [train_eval/train_update_time: 1038.809] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.693] [train_eval/perplexity_len_2048: 296.814] [train_eval/loss_avg_len_1024: 5.699] [train_eval/perplexity_len_1024: 298.566] [train_eval/loss_avg_len_512: 5.711] [train_eval/perplexity_len_512: 302.082]
|
| 129 |
+
[2025-10-25 23:59:07][utils:57][INFO] [P: 16.00%] [S: 335544320/2097152000] [T: 0:29:02] [ETA: 2:32:29] [loss: 5.405] [tokens/s: 198459.104] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 130 |
+
[2025-10-25 23:59:07][train:194][INFO] Running validation...
|
| 131 |
+
[2025-10-26 00:00:37][logger:171][INFO] [step: 335544320] [val/train_token_count: 335544320] [val/train_batch_count: 160] [val/train_flop_count: 0] [val/train_total_time: 1742.828] [val/train_update_time: 1107.798] [val/loss: 5.396] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.408] [val/val_tokens_per_second: 453055.432] [val/loss_avg_len_2048: 5.396] [val/perplexity_len_2048: 220.467] [val/loss_avg_len_1024: 5.403] [val/perplexity_len_1024: 222.034] [val/loss_avg_len_512: 5.418] [val/perplexity_len_512: 225.426]
|
| 132 |
+
[2025-10-26 00:01:46][utils:57][INFO] [P: 17.00%] [S: 356515840/2097152000] [T: 0:31:42] [ETA: 2:34:47] [loss: 5.298] [tokens/s: 182674.312] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 133 |
+
[2025-10-26 00:02:55][utils:57][INFO] [P: 18.00%] [S: 377487360/2097152000] [T: 0:32:51] [ETA: 2:29:41] [loss: 5.267] [tokens/s: 198435.832] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 134 |
+
[2025-10-26 00:02:55][train:194][INFO] Running validation...
|
| 135 |
+
[2025-10-26 00:04:26][logger:171][INFO] [step: 377487360] [val/train_token_count: 377487360] [val/train_batch_count: 180] [val/train_flop_count: 0] [val/train_total_time: 1971.492] [val/train_update_time: 1245.788] [val/loss: 5.258] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.322] [val/val_tokens_per_second: 453486.621] [val/loss_avg_len_2048: 5.258] [val/perplexity_len_2048: 192.005] [val/loss_avg_len_1024: 5.266] [val/perplexity_len_1024: 193.543] [val/loss_avg_len_512: 5.282] [val/perplexity_len_512: 196.771]
|
| 136 |
+
[2025-10-26 00:05:35][utils:57][INFO] [P: 19.00%] [S: 398458880/2097152000] [T: 0:35:30] [ETA: 2:31:24] [loss: 5.241] [tokens/s: 182665.333] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 137 |
+
[2025-10-26 00:06:44][utils:57][INFO] [P: 20.00%] [S: 419430400/2097152000] [T: 0:36:40] [ETA: 2:26:40] [loss: 5.149] [tokens/s: 198504.975] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 138 |
+
[2025-10-26 00:06:44][logger:171][INFO] [step: 419430400] [train_eval/train_token_count: 419430400] [train_eval/train_batch_count: 200] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2200.083] [train_eval/train_update_time: 1383.789] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.297] [train_eval/perplexity_len_2048: 199.721] [train_eval/loss_avg_len_1024: 5.305] [train_eval/perplexity_len_1024: 201.409] [train_eval/loss_avg_len_512: 5.320] [train_eval/perplexity_len_512: 204.484]
|
| 139 |
+
[2025-10-26 00:06:44][train:194][INFO] Running validation...
|
| 140 |
+
[2025-10-26 00:08:15][logger:171][INFO] [step: 419430400] [val/train_token_count: 419430400] [val/train_batch_count: 200] [val/train_flop_count: 0] [val/train_total_time: 2200.083] [val/train_update_time: 1383.789] [val/loss: 5.151] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.410] [val/val_tokens_per_second: 453049.139] [val/loss_avg_len_2048: 5.151] [val/perplexity_len_2048: 172.553] [val/loss_avg_len_1024: 5.159] [val/perplexity_len_1024: 174.054] [val/loss_avg_len_512: 5.177] [val/perplexity_len_512: 177.220]
|
| 141 |
+
[2025-10-26 00:08:15][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000419430400.pt...
|
| 142 |
+
[2025-10-26 00:08:15][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000419430400.pt.
|
| 143 |
+
[2025-10-26 00:08:15][logger:171][INFO] [step: 419430400] [checkpoint/checkpoint_time: 0.421]
|
| 144 |
+
[2025-10-26 00:09:24][utils:57][INFO] [P: 21.00%] [S: 440401920/2097152000] [T: 0:39:20] [ETA: 2:27:58] [loss: 5.102] [tokens/s: 182638.988] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 145 |
+
[2025-10-26 00:10:33][utils:57][INFO] [P: 22.00%] [S: 461373440/2097152000] [T: 0:40:29] [ETA: 2:23:32] [loss: 5.073] [tokens/s: 198400.131] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 146 |
+
[2025-10-26 00:10:33][train:194][INFO] Running validation...
|
| 147 |
+
[2025-10-26 00:12:04][logger:171][INFO] [step: 461373440] [val/train_token_count: 461373440] [val/train_batch_count: 220] [val/train_flop_count: 0] [val/train_total_time: 2429.164] [val/train_update_time: 1521.777] [val/loss: 5.064] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.792] [val/val_tokens_per_second: 451142.338] [val/loss_avg_len_2048: 5.064] [val/perplexity_len_2048: 158.159] [val/loss_avg_len_1024: 5.073] [val/perplexity_len_1024: 159.659] [val/loss_avg_len_512: 5.092] [val/perplexity_len_512: 162.754]
|
| 148 |
+
[2025-10-26 00:13:13][utils:57][INFO] [P: 23.00%] [S: 482344960/2097152000] [T: 0:43:09] [ETA: 2:24:27] [loss: 5.013] [tokens/s: 182557.417] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 149 |
+
[2025-10-26 00:14:22][utils:57][INFO] [P: 24.00%] [S: 503316480/2097152000] [T: 0:44:18] [ETA: 2:20:17] [loss: 4.965] [tokens/s: 198351.326] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 150 |
+
[2025-10-26 00:14:22][train:194][INFO] Running validation...
|
| 151 |
+
[2025-10-26 00:15:53][logger:171][INFO] [step: 503316480] [val/train_token_count: 503316480] [val/train_batch_count: 240] [val/train_flop_count: 0] [val/train_total_time: 2658.232] [val/train_update_time: 1659.785] [val/loss: 4.985] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 91.045] [val/val_tokens_per_second: 449885.595] [val/loss_avg_len_2048: 4.985] [val/perplexity_len_2048: 146.276] [val/loss_avg_len_1024: 4.996] [val/perplexity_len_1024: 147.785] [val/loss_avg_len_512: 5.016] [val/perplexity_len_512: 150.831]
|
| 152 |
+
[2025-10-26 00:17:02][utils:57][INFO] [P: 25.00%] [S: 524288000/2097152000] [T: 0:46:58] [ETA: 2:20:55] [loss: 4.962] [tokens/s: 182476.527] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 153 |
+
[2025-10-26 00:17:02][logger:171][INFO] [step: 524288000] [train_eval/train_token_count: 524288000] [train_eval/train_batch_count: 250] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2818.422] [train_eval/train_update_time: 1728.795] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.045] [train_eval/perplexity_len_2048: 155.292] [train_eval/loss_avg_len_1024: 5.053] [train_eval/perplexity_len_1024: 156.556] [train_eval/loss_avg_len_512: 5.071] [train_eval/perplexity_len_512: 159.272]
|
| 154 |
+
[2025-10-26 00:18:12][utils:57][INFO] [P: 26.00%] [S: 545259520/2097152000] [T: 0:48:07] [ETA: 2:16:58] [loss: 4.912] [tokens/s: 198222.713] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 155 |
+
[2025-10-26 00:18:12][train:194][INFO] Running validation...
|
| 156 |
+
[2025-10-26 00:19:42][logger:171][INFO] [step: 545259520] [val/train_token_count: 545259520] [val/train_batch_count: 260] [val/train_flop_count: 0] [val/train_total_time: 2887.564] [val/train_update_time: 1797.818] [val/loss: 4.916] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.709] [val/val_tokens_per_second: 451552.828] [val/loss_avg_len_2048: 4.916] [val/perplexity_len_2048: 136.521] [val/loss_avg_len_1024: 4.927] [val/perplexity_len_1024: 137.983] [val/loss_avg_len_512: 4.948] [val/perplexity_len_512: 140.922]
|
| 157 |
+
[2025-10-26 00:20:51][utils:57][INFO] [P: 27.00%] [S: 566231040/2097152000] [T: 0:50:47] [ETA: 2:17:19] [loss: 4.898] [tokens/s: 182422.584] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 158 |
+
[2025-10-26 00:22:01][utils:57][INFO] [P: 28.00%] [S: 587202560/2097152000] [T: 0:51:56] [ETA: 2:13:33] [loss: 4.849] [tokens/s: 198153.306] [batches/s: 0.094] [MFU: 0.000] [TFLOPS: 0.000]
|
| 159 |
+
[2025-10-26 00:22:01][train:194][INFO] Running validation...
|
| 160 |
+
[2025-10-26 00:23:31][logger:171][INFO] [step: 587202560] [val/train_token_count: 587202560] [val/train_batch_count: 280] [val/train_flop_count: 0] [val/train_total_time: 3116.515] [val/train_update_time: 1935.813] [val/loss: 4.863] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.400] [val/val_tokens_per_second: 453094.867] [val/loss_avg_len_2048: 4.863] [val/perplexity_len_2048: 129.424] [val/loss_avg_len_1024: 4.874] [val/perplexity_len_1024: 130.908] [val/loss_avg_len_512: 4.897] [val/perplexity_len_512: 133.851]
|
| 161 |
+
[2025-10-26 00:24:40][utils:57][INFO] [P: 29.00%] [S: 608174080/2097152000] [T: 0:54:36] [ETA: 2:13:40] [loss: 4.817] [tokens/s: 182416.353] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 162 |
+
[2025-10-26 00:25:49][utils:57][INFO] [P: 30.00%] [S: 629145600/2097152000] [T: 0:55:45] [ETA: 2:10:05] [loss: 4.797] [tokens/s: 198238.394] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 163 |
+
[2025-10-26 00:25:49][logger:171][INFO] [step: 629145600] [train_eval/train_token_count: 629145600] [train_eval/train_batch_count: 300] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3345.162] [train_eval/train_update_time: 2073.822] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.881] [train_eval/perplexity_len_2048: 131.799] [train_eval/loss_avg_len_1024: 4.889] [train_eval/perplexity_len_1024: 132.870] [train_eval/loss_avg_len_512: 4.908] [train_eval/perplexity_len_512: 135.408]
|
| 164 |
+
[2025-10-26 00:25:49][train:194][INFO] Running validation...
|
| 165 |
+
[2025-10-26 00:27:19][logger:171][INFO] [step: 629145600] [val/train_token_count: 629145600] [val/train_batch_count: 300] [val/train_flop_count: 0] [val/train_total_time: 3345.162] [val/train_update_time: 2073.822] [val/loss: 4.812] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.229] [val/val_tokens_per_second: 453957.229] [val/loss_avg_len_2048: 4.812] [val/perplexity_len_2048: 122.919] [val/loss_avg_len_1024: 4.823] [val/perplexity_len_1024: 124.363] [val/loss_avg_len_512: 4.846] [val/perplexity_len_512: 127.252]
|
| 166 |
+
[2025-10-26 00:27:19][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000629145600.pt...
|
| 167 |
+
[2025-10-26 00:27:20][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000629145600.pt.
|
| 168 |
+
[2025-10-26 00:27:20][logger:171][INFO] [step: 629145600] [checkpoint/checkpoint_time: 0.417]
|
| 169 |
+
[2025-10-26 00:28:29][utils:57][INFO] [P: 31.00%] [S: 650117120/2097152000] [T: 0:58:24] [ETA: 2:10:01] [loss: 4.811] [tokens/s: 182446.882] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 170 |
+
[2025-10-26 00:29:38][utils:57][INFO] [P: 32.00%] [S: 671088640/2097152000] [T: 0:59:34] [ETA: 2:06:34] [loss: 4.743] [tokens/s: 198268.603] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 171 |
+
[2025-10-26 00:29:38][train:194][INFO] Running validation...
|
| 172 |
+
[2025-10-26 00:31:09][logger:171][INFO] [step: 671088640] [val/train_token_count: 671088640] [val/train_batch_count: 320] [val/train_flop_count: 0] [val/train_total_time: 3574.040] [val/train_update_time: 2211.816] [val/loss: 4.761] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.520] [val/val_tokens_per_second: 452498.071] [val/loss_avg_len_2048: 4.761] [val/perplexity_len_2048: 116.815] [val/loss_avg_len_1024: 4.773] [val/perplexity_len_1024: 118.254] [val/loss_avg_len_512: 4.797] [val/perplexity_len_512: 121.108]
|
| 173 |
+
[2025-10-26 00:32:18][utils:57][INFO] [P: 33.00%] [S: 692060160/2097152000] [T: 1:02:13] [ETA: 2:06:20] [loss: 4.760] [tokens/s: 182492.029] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 174 |
+
[2025-10-26 00:33:27][utils:57][INFO] [P: 34.00%] [S: 713031680/2097152000] [T: 1:03:22] [ETA: 2:03:01] [loss: 4.725] [tokens/s: 198368.660] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 175 |
+
[2025-10-26 00:33:27][train:194][INFO] Running validation...
|
| 176 |
+
[2025-10-26 00:34:57][logger:171][INFO] [step: 713031680] [val/train_token_count: 713031680] [val/train_batch_count: 340] [val/train_flop_count: 0] [val/train_total_time: 3802.836] [val/train_update_time: 2349.827] [val/loss: 4.719] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.552] [val/val_tokens_per_second: 452336.201] [val/loss_avg_len_2048: 4.719] [val/perplexity_len_2048: 112.082] [val/loss_avg_len_1024: 4.732] [val/perplexity_len_1024: 113.528] [val/loss_avg_len_512: 4.757] [val/perplexity_len_512: 116.347]
|
| 177 |
+
[2025-10-26 00:36:07][utils:57][INFO] [P: 35.00%] [S: 734003200/2097152000] [T: 1:06:02] [ETA: 2:02:38] [loss: 4.704] [tokens/s: 182574.869] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 178 |
+
[2025-10-26 00:36:07][logger:171][INFO] [step: 734003200] [train_eval/train_token_count: 734003200] [train_eval/train_batch_count: 350] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3962.515] [train_eval/train_update_time: 2418.827] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.753] [train_eval/perplexity_len_2048: 115.879] [train_eval/loss_avg_len_1024: 4.764] [train_eval/perplexity_len_1024: 117.193] [train_eval/loss_avg_len_512: 4.786] [train_eval/perplexity_len_512: 119.779]
|
| 179 |
+
[2025-10-26 00:37:16][utils:57][INFO] [P: 36.00%] [S: 754974720/2097152000] [T: 1:07:11] [ETA: 1:59:27] [loss: 4.631] [tokens/s: 198404.891] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 180 |
+
[2025-10-26 00:37:16][train:194][INFO] Running validation...
|
| 181 |
+
[2025-10-26 00:38:46][logger:171][INFO] [step: 754974720] [val/train_token_count: 754974720] [val/train_batch_count: 360] [val/train_flop_count: 0] [val/train_total_time: 4031.641] [val/train_update_time: 2487.827] [val/loss: 4.676] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.490] [val/val_tokens_per_second: 452647.181] [val/loss_avg_len_2048: 4.676] [val/perplexity_len_2048: 107.379] [val/loss_avg_len_1024: 4.690] [val/perplexity_len_1024: 108.835] [val/loss_avg_len_512: 4.715] [val/perplexity_len_512: 111.660]
|
| 182 |
+
[2025-10-26 00:39:55][utils:57][INFO] [P: 37.00%] [S: 775946240/2097152000] [T: 1:09:51] [ETA: 1:58:56] [loss: 4.656] [tokens/s: 182612.630] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 183 |
+
[2025-10-26 00:41:04][utils:57][INFO] [P: 38.00%] [S: 796917760/2097152000] [T: 1:11:00] [ETA: 1:55:51] [loss: 4.634] [tokens/s: 198387.213] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 184 |
+
[2025-10-26 00:41:04][train:194][INFO] Running validation...
|
| 185 |
+
[2025-10-26 00:42:35][logger:171][INFO] [step: 796917760] [val/train_token_count: 796917760] [val/train_batch_count: 380] [val/train_flop_count: 0] [val/train_total_time: 4260.374] [val/train_update_time: 2625.819] [val/loss: 4.640] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.780] [val/val_tokens_per_second: 451199.036] [val/loss_avg_len_2048: 4.640] [val/perplexity_len_2048: 103.564] [val/loss_avg_len_1024: 4.654] [val/perplexity_len_1024: 105.014] [val/loss_avg_len_512: 4.680] [val/perplexity_len_512: 107.815]
|
| 186 |
+
[2025-10-26 00:43:44][utils:57][INFO] [P: 39.00%] [S: 817889280/2097152000] [T: 1:13:40] [ETA: 1:55:13] [loss: 4.640] [tokens/s: 182547.275] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 187 |
+
[2025-10-26 00:44:53][utils:57][INFO] [P: 40.00%] [S: 838860800/2097152000] [T: 1:14:49] [ETA: 1:52:14] [loss: 4.548] [tokens/s: 198354.370] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 188 |
+
[2025-10-26 00:44:53][logger:171][INFO] [step: 838860800] [train_eval/train_token_count: 838860800] [train_eval/train_batch_count: 400] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4489.443] [train_eval/train_update_time: 2763.837] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.653] [train_eval/perplexity_len_2048: 104.915] [train_eval/loss_avg_len_1024: 4.664] [train_eval/perplexity_len_1024: 106.073] [train_eval/loss_avg_len_512: 4.688] [train_eval/perplexity_len_512: 108.620]
|
| 189 |
+
[2025-10-26 00:44:53][train:194][INFO] Running validation...
|
| 190 |
+
[2025-10-26 00:46:24][logger:171][INFO] [step: 838860800] [val/train_token_count: 838860800] [val/train_batch_count: 400] [val/train_flop_count: 0] [val/train_total_time: 4489.443] [val/train_update_time: 2763.837] [val/loss: 4.608] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.745] [val/val_tokens_per_second: 451376.404] [val/loss_avg_len_2048: 4.608] [val/perplexity_len_2048: 100.291] [val/loss_avg_len_1024: 4.623] [val/perplexity_len_1024: 101.767] [val/loss_avg_len_512: 4.650] [val/perplexity_len_512: 104.609]
|
| 191 |
+
[2025-10-26 00:46:24][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000838860800.pt...
|
| 192 |
+
[2025-10-26 00:46:25][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000838860800.pt.
|
| 193 |
+
[2025-10-26 00:46:25][logger:171][INFO] [step: 838860800] [checkpoint/checkpoint_time: 0.428]
|
| 194 |
+
[2025-10-26 00:47:34][utils:57][INFO] [P: 41.00%] [S: 859832320/2097152000] [T: 1:17:29] [ETA: 1:51:31] [loss: 4.572] [tokens/s: 182455.996] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 195 |
+
[2025-10-26 00:48:43][utils:57][INFO] [P: 42.00%] [S: 880803840/2097152000] [T: 1:18:38] [ETA: 1:48:36] [loss: 4.565] [tokens/s: 198225.170] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 196 |
+
[2025-10-26 00:48:43][train:194][INFO] Running validation...
|
| 197 |
+
[2025-10-26 00:50:14][logger:171][INFO] [step: 880803840] [val/train_token_count: 880803840] [val/train_batch_count: 420] [val/train_flop_count: 0] [val/train_total_time: 4718.872] [val/train_update_time: 2901.840] [val/loss: 4.577] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.859] [val/val_tokens_per_second: 450810.091] [val/loss_avg_len_2048: 4.577] [val/perplexity_len_2048: 97.256] [val/loss_avg_len_1024: 4.593] [val/perplexity_len_1024: 98.753] [val/loss_avg_len_512: 4.621] [val/perplexity_len_512: 101.602]
|
| 198 |
+
[2025-10-26 00:51:23][utils:57][INFO] [P: 43.00%] [S: 901775360/2097152000] [T: 1:21:18] [ETA: 1:47:47] [loss: 4.560] [tokens/s: 182399.733] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 199 |
+
[2025-10-26 00:52:32][utils:57][INFO] [P: 44.00%] [S: 922746880/2097152000] [T: 1:22:27] [ETA: 1:44:57] [loss: 4.587] [tokens/s: 198169.280] [batches/s: 0.094] [MFU: 0.000] [TFLOPS: 0.000]
|
| 200 |
+
[2025-10-26 00:52:32][train:194][INFO] Running validation...
|
| 201 |
+
[2025-10-26 00:54:03][logger:171][INFO] [step: 922746880] [val/train_token_count: 922746880] [val/train_batch_count: 440] [val/train_flop_count: 0] [val/train_total_time: 4947.998] [val/train_update_time: 3039.860] [val/loss: 4.550] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.510] [val/val_tokens_per_second: 452545.653] [val/loss_avg_len_2048: 4.550] [val/perplexity_len_2048: 94.613] [val/loss_avg_len_1024: 4.566] [val/perplexity_len_1024: 96.111] [val/loss_avg_len_512: 4.595] [val/perplexity_len_512: 98.972]
|
| 202 |
+
[2025-10-26 00:55:12][utils:57][INFO] [P: 45.00%] [S: 943718400/2097152000] [T: 1:25:07] [ETA: 1:44:02] [loss: 4.528] [tokens/s: 182405.880] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 203 |
+
[2025-10-26 00:55:12][logger:171][INFO] [step: 943718400] [train_eval/train_token_count: 943718400] [train_eval/train_batch_count: 450] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5107.660] [train_eval/train_update_time: 3108.876] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.569] [train_eval/perplexity_len_2048: 96.495] [train_eval/loss_avg_len_1024: 4.584] [train_eval/perplexity_len_1024: 97.918] [train_eval/loss_avg_len_512: 4.612] [train_eval/perplexity_len_512: 100.712]
|
| 204 |
+
[2025-10-26 00:56:21][utils:57][INFO] [P: 46.00%] [S: 964689920/2097152000] [T: 1:26:16] [ETA: 1:41:17] [loss: 4.497] [tokens/s: 198142.855] [batches/s: 0.094] [MFU: 0.000] [TFLOPS: 0.000]
|
| 205 |
+
[2025-10-26 00:56:21][train:194][INFO] Running validation...
|
| 206 |
+
[2025-10-26 00:57:52][logger:171][INFO] [step: 964689920] [val/train_token_count: 964689920] [val/train_batch_count: 460] [val/train_flop_count: 0] [val/train_total_time: 5176.881] [val/train_update_time: 3177.900] [val/loss: 4.520] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.758] [val/val_tokens_per_second: 451311.936] [val/loss_avg_len_2048: 4.520] [val/perplexity_len_2048: 91.875] [val/loss_avg_len_1024: 4.537] [val/perplexity_len_1024: 93.391] [val/loss_avg_len_512: 4.567] [val/perplexity_len_512: 96.279]
|
| 207 |
+
[2025-10-26 00:59:01][utils:57][INFO] [P: 47.00%] [S: 985661440/2097152000] [T: 1:28:56] [ETA: 1:40:18] [loss: 4.519] [tokens/s: 182344.566] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 208 |
+
[2025-10-26 01:00:10][utils:57][INFO] [P: 48.00%] [S: 1006632960/2097152000] [T: 1:30:05] [ETA: 1:37:36] [loss: 4.495] [tokens/s: 198140.860] [batches/s: 0.094] [MFU: 0.000] [TFLOPS: 0.000]
|
| 209 |
+
[2025-10-26 01:00:10][train:194][INFO] Running validation...
|
| 210 |
+
[2025-10-26 01:01:41][logger:171][INFO] [step: 1006632960] [val/train_token_count: 1006632960] [val/train_batch_count: 480] [val/train_flop_count: 0] [val/train_total_time: 5405.913] [val/train_update_time: 3315.925] [val/loss: 4.493] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.736] [val/val_tokens_per_second: 451418.210] [val/loss_avg_len_2048: 4.493] [val/perplexity_len_2048: 89.360] [val/loss_avg_len_1024: 4.510] [val/perplexity_len_1024: 90.887] [val/loss_avg_len_512: 4.541] [val/perplexity_len_512: 93.798]
|
| 211 |
+
[2025-10-26 01:02:50][utils:57][INFO] [P: 49.00%] [S: 1027604480/2097152000] [T: 1:32:45] [ETA: 1:36:32] [loss: 4.491] [tokens/s: 182349.959] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 212 |
+
[2025-10-26 01:03:59][utils:57][INFO] [P: 50.00%] [S: 1048576000/2097152000] [T: 1:33:54] [ETA: 1:33:54] [loss: 4.464] [tokens/s: 198227.583] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 213 |
+
[2025-10-26 01:03:59][logger:171][INFO] [step: 1048576000] [train_eval/train_token_count: 1048576000] [train_eval/train_batch_count: 500] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5634.922] [train_eval/train_update_time: 3453.965] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.500] [train_eval/perplexity_len_2048: 90.027] [train_eval/loss_avg_len_1024: 4.515] [train_eval/perplexity_len_1024: 91.341] [train_eval/loss_avg_len_512: 4.545] [train_eval/perplexity_len_512: 94.162]
|
| 214 |
+
[2025-10-26 01:03:59][train:194][INFO] Running validation...
|
| 215 |
+
[2025-10-26 01:05:29][logger:171][INFO] [step: 1048576000] [val/train_token_count: 1048576000] [val/train_batch_count: 500] [val/train_flop_count: 0] [val/train_total_time: 5634.922] [val/train_update_time: 3453.965] [val/loss: 4.469] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.435] [val/val_tokens_per_second: 452922.884] [val/loss_avg_len_2048: 4.469] [val/perplexity_len_2048: 87.242] [val/loss_avg_len_1024: 4.486] [val/perplexity_len_1024: 88.788] [val/loss_avg_len_512: 4.519] [val/perplexity_len_512: 91.727]
|
| 216 |
+
[2025-10-26 01:05:29][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001048576000.pt...
|
| 217 |
+
[2025-10-26 01:05:30][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001048576000.pt.
|
| 218 |
+
[2025-10-26 01:05:30][logger:171][INFO] [step: 1048576000] [checkpoint/checkpoint_time: 0.413]
|
| 219 |
+
[2025-10-26 01:06:39][utils:57][INFO] [P: 51.00%] [S: 1069547520/2097152000] [T: 1:36:34] [ETA: 1:32:47] [loss: 4.463] [tokens/s: 182402.852] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 220 |
+
[2025-10-26 01:07:48][utils:57][INFO] [P: 52.00%] [S: 1090519040/2097152000] [T: 1:37:44] [ETA: 1:30:12] [loss: 4.457] [tokens/s: 198223.784] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 221 |
+
[2025-10-26 01:07:48][train:194][INFO] Running validation...
|
| 222 |
+
[2025-10-26 01:09:19][logger:171][INFO] [step: 1090519040] [val/train_token_count: 1090519040] [val/train_batch_count: 520] [val/train_flop_count: 0] [val/train_total_time: 5864.051] [val/train_update_time: 3591.998] [val/loss: 4.446] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.546] [val/val_tokens_per_second: 452366.332] [val/loss_avg_len_2048: 4.446] [val/perplexity_len_2048: 85.290] [val/loss_avg_len_1024: 4.464] [val/perplexity_len_1024: 86.871] [val/loss_avg_len_512: 4.498] [val/perplexity_len_512: 89.857]
|
| 223 |
+
[2025-10-26 01:10:28][utils:57][INFO] [P: 53.00%] [S: 1111490560/2097152000] [T: 1:40:23] [ETA: 1:29:01] [loss: 4.414] [tokens/s: 182452.528] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 224 |
+
[2025-10-26 01:11:37][utils:57][INFO] [P: 54.00%] [S: 1132462080/2097152000] [T: 1:41:32] [ETA: 1:26:30] [loss: 4.424] [tokens/s: 198219.905] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 225 |
+
[2025-10-26 01:11:37][train:194][INFO] Running validation...
|
| 226 |
+
[2025-10-26 01:13:07][logger:171][INFO] [step: 1132462080] [val/train_token_count: 1132462080] [val/train_batch_count: 540] [val/train_flop_count: 0] [val/train_total_time: 6092.851] [val/train_update_time: 3730.021] [val/loss: 4.420] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.329] [val/val_tokens_per_second: 453455.804] [val/loss_avg_len_2048: 4.420] [val/perplexity_len_2048: 83.101] [val/loss_avg_len_1024: 4.439] [val/perplexity_len_1024: 84.710] [val/loss_avg_len_512: 4.474] [val/perplexity_len_512: 87.723]
|
| 227 |
+
[2025-10-26 01:14:16][utils:57][INFO] [P: 55.00%] [S: 1153433600/2097152000] [T: 1:44:12] [ETA: 1:25:15] [loss: 4.370] [tokens/s: 182483.006] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 228 |
+
[2025-10-26 01:14:16][logger:171][INFO] [step: 1153433600] [train_eval/train_token_count: 1153433600] [train_eval/train_batch_count: 550] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6252.331] [train_eval/train_update_time: 3799.045] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.434] [train_eval/perplexity_len_2048: 84.302] [train_eval/loss_avg_len_1024: 4.449] [train_eval/perplexity_len_1024: 85.517] [train_eval/loss_avg_len_512: 4.480] [train_eval/perplexity_len_512: 88.204]
|
| 229 |
+
[2025-10-26 01:15:25][utils:57][INFO] [P: 56.00%] [S: 1174405120/2097152000] [T: 1:45:21] [ETA: 1:22:46] [loss: 4.414] [tokens/s: 198315.814] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 230 |
+
[2025-10-26 01:15:25][train:194][INFO] Running validation...
|
| 231 |
+
[2025-10-26 01:16:56][logger:171][INFO] [step: 1174405120] [val/train_token_count: 1174405120] [val/train_batch_count: 560] [val/train_flop_count: 0] [val/train_total_time: 6321.474] [val/train_update_time: 3868.057] [val/loss: 4.398] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.346] [val/val_tokens_per_second: 453369.764] [val/loss_avg_len_2048: 4.398] [val/perplexity_len_2048: 81.305] [val/loss_avg_len_1024: 4.418] [val/perplexity_len_1024: 82.939] [val/loss_avg_len_512: 4.454] [val/perplexity_len_512: 86.012]
|
| 232 |
+
[2025-10-26 01:18:05][utils:57][INFO] [P: 57.00%] [S: 1195376640/2097152000] [T: 1:48:00] [ETA: 1:21:29] [loss: 4.362] [tokens/s: 182558.830] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 233 |
+
[2025-10-26 01:19:14][utils:57][INFO] [P: 58.00%] [S: 1216348160/2097152000] [T: 1:49:10] [ETA: 1:19:03] [loss: 4.410] [tokens/s: 198388.831] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 234 |
+
[2025-10-26 01:19:14][train:194][INFO] Running validation...
|
| 235 |
+
[2025-10-26 01:20:44][logger:171][INFO] [step: 1216348160] [val/train_token_count: 1216348160] [val/train_batch_count: 580] [val/train_flop_count: 0] [val/train_total_time: 6550.098] [val/train_update_time: 4006.073] [val/loss: 4.376] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.329] [val/val_tokens_per_second: 453451.140] [val/loss_avg_len_2048: 4.376] [val/perplexity_len_2048: 79.530] [val/loss_avg_len_1024: 4.397] [val/perplexity_len_1024: 81.202] [val/loss_avg_len_512: 4.435] [val/perplexity_len_512: 84.355]
|
| 236 |
+
[2025-10-26 01:21:54][utils:57][INFO] [P: 59.00%] [S: 1237319680/2097152000] [T: 1:51:49] [ETA: 1:17:42] [loss: 4.391] [tokens/s: 182619.459] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 237 |
+
[2025-10-26 01:23:03][utils:57][INFO] [P: 60.00%] [S: 1258291200/2097152000] [T: 1:52:58] [ETA: 1:15:19] [loss: 4.391] [tokens/s: 198482.788] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 238 |
+
[2025-10-26 01:23:03][logger:171][INFO] [step: 1258291200] [train_eval/train_token_count: 1258291200] [train_eval/train_batch_count: 600] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6778.730] [train_eval/train_update_time: 4144.121] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.379] [train_eval/perplexity_len_2048: 79.720] [train_eval/loss_avg_len_1024: 4.394] [train_eval/perplexity_len_1024: 80.957] [train_eval/loss_avg_len_512: 4.429] [train_eval/perplexity_len_512: 83.854]
|
| 239 |
+
[2025-10-26 01:23:03][train:194][INFO] Running validation...
|
| 240 |
+
[2025-10-26 01:24:33][logger:171][INFO] [step: 1258291200] [val/train_token_count: 1258291200] [val/train_batch_count: 600] [val/train_flop_count: 0] [val/train_total_time: 6778.730] [val/train_update_time: 4144.121] [val/loss: 4.356] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.371] [val/val_tokens_per_second: 453242.896] [val/loss_avg_len_2048: 4.356] [val/perplexity_len_2048: 77.911] [val/loss_avg_len_1024: 4.377] [val/perplexity_len_1024: 79.617] [val/loss_avg_len_512: 4.417] [val/perplexity_len_512: 82.825]
|
| 241 |
+
[2025-10-26 01:24:33][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001258291200.pt...
|
| 242 |
+
[2025-10-26 01:24:34][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001258291200.pt.
|
| 243 |
+
[2025-10-26 01:24:34][logger:171][INFO] [step: 1258291200] [checkpoint/checkpoint_time: 0.425]
|
| 244 |
+
[2025-10-26 01:25:43][utils:57][INFO] [P: 61.00%] [S: 1279262720/2097152000] [T: 1:55:38] [ETA: 1:13:56] [loss: 4.355] [tokens/s: 182630.663] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 245 |
+
[2025-10-26 01:26:52][utils:57][INFO] [P: 62.00%] [S: 1300234240/2097152000] [T: 1:56:47] [ETA: 1:11:35] [loss: 4.330] [tokens/s: 198437.564] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 246 |
+
[2025-10-26 01:26:52][train:194][INFO] Running validation...
|
| 247 |
+
[2025-10-26 01:28:22][logger:171][INFO] [step: 1300234240] [val/train_token_count: 1300234240] [val/train_batch_count: 620] [val/train_flop_count: 0] [val/train_total_time: 7007.788] [val/train_update_time: 4282.136] [val/loss: 4.335] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.333] [val/val_tokens_per_second: 453432.242] [val/loss_avg_len_2048: 4.335] [val/perplexity_len_2048: 76.355] [val/loss_avg_len_1024: 4.358] [val/perplexity_len_1024: 78.108] [val/loss_avg_len_512: 4.399] [val/perplexity_len_512: 81.393]
|
| 248 |
+
[2025-10-26 01:29:31][utils:57][INFO] [P: 63.00%] [S: 1321205760/2097152000] [T: 1:59:27] [ETA: 1:10:09] [loss: 4.340] [tokens/s: 182663.673] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 249 |
+
[2025-10-26 01:30:40][utils:57][INFO] [P: 64.00%] [S: 1342177280/2097152000] [T: 2:00:36] [ETA: 1:07:50] [loss: 4.330] [tokens/s: 198432.802] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 250 |
+
[2025-10-26 01:30:40][train:194][INFO] Running validation...
|
| 251 |
+
[2025-10-26 01:32:11][logger:171][INFO] [step: 1342177280] [val/train_token_count: 1342177280] [val/train_batch_count: 640] [val/train_flop_count: 0] [val/train_total_time: 7236.399] [val/train_update_time: 4420.153] [val/loss: 4.316] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.292] [val/val_tokens_per_second: 453637.946] [val/loss_avg_len_2048: 4.316] [val/perplexity_len_2048: 74.908] [val/loss_avg_len_1024: 4.340] [val/perplexity_len_1024: 76.702] [val/loss_avg_len_512: 4.383] [val/perplexity_len_512: 80.063]
|
| 252 |
+
[2025-10-26 01:33:20][utils:57][INFO] [P: 65.00%] [S: 1363148800/2097152000] [T: 2:03:15] [ETA: 1:06:22] [loss: 4.304] [tokens/s: 182657.918] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 253 |
+
[2025-10-26 01:33:20][logger:171][INFO] [step: 1363148800] [train_eval/train_token_count: 1363148800] [train_eval/train_batch_count: 650] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7395.903] [train_eval/train_update_time: 4489.238] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.333] [train_eval/perplexity_len_2048: 76.161] [train_eval/loss_avg_len_1024: 4.355] [train_eval/perplexity_len_1024: 77.844] [train_eval/loss_avg_len_512: 4.394] [train_eval/perplexity_len_512: 80.986]
|
| 254 |
+
[2025-10-26 01:34:29][utils:57][INFO] [P: 66.00%] [S: 1384120320/2097152000] [T: 2:04:25] [ETA: 1:04:05] [loss: 4.326] [tokens/s: 198431.420] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 255 |
+
[2025-10-26 01:34:29][train:194][INFO] Running validation...
|
| 256 |
+
[2025-10-26 01:35:59][logger:171][INFO] [step: 1384120320] [val/train_token_count: 1384120320] [val/train_batch_count: 660] [val/train_flop_count: 0] [val/train_total_time: 7465.047] [val/train_update_time: 4558.262] [val/loss: 4.300] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.423] [val/val_tokens_per_second: 452981.250] [val/loss_avg_len_2048: 4.300] [val/perplexity_len_2048: 73.676] [val/loss_avg_len_1024: 4.324] [val/perplexity_len_1024: 75.497] [val/loss_avg_len_512: 4.368] [val/perplexity_len_512: 78.908]
|
| 257 |
+
[2025-10-26 01:37:09][utils:57][INFO] [P: 67.00%] [S: 1405091840/2097152000] [T: 2:07:04] [ETA: 1:02:35] [loss: 4.270] [tokens/s: 182644.323] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 258 |
+
[2025-10-26 01:38:18][utils:57][INFO] [P: 68.00%] [S: 1426063360/2097152000] [T: 2:08:13] [ETA: 1:00:20] [loss: 4.284] [tokens/s: 198413.163] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 259 |
+
[2025-10-26 01:38:18][train:194][INFO] Running validation...
|
| 260 |
+
[2025-10-26 01:39:48][logger:171][INFO] [step: 1426063360] [val/train_token_count: 1426063360] [val/train_batch_count: 680] [val/train_flop_count: 0] [val/train_total_time: 7693.756] [val/train_update_time: 4696.292] [val/loss: 4.283] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.484] [val/val_tokens_per_second: 452677.817] [val/loss_avg_len_2048: 4.283] [val/perplexity_len_2048: 72.475] [val/loss_avg_len_1024: 4.308] [val/perplexity_len_1024: 74.326] [val/loss_avg_len_512: 4.354] [val/perplexity_len_512: 77.797]
|
| 261 |
+
[2025-10-26 01:40:57][utils:57][INFO] [P: 69.00%] [S: 1447034880/2097152000] [T: 2:10:53] [ETA: 0:58:48] [loss: 4.285] [tokens/s: 182623.466] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 262 |
+
[2025-10-26 01:42:07][utils:57][INFO] [P: 70.00%] [S: 1468006400/2097152000] [T: 2:12:02] [ETA: 0:56:35] [loss: 4.280] [tokens/s: 198479.390] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 263 |
+
[2025-10-26 01:42:07][logger:171][INFO] [step: 1468006400] [train_eval/train_token_count: 1468006400] [train_eval/train_batch_count: 700] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7922.504] [train_eval/train_update_time: 4834.296] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.286] [train_eval/perplexity_len_2048: 72.709] [train_eval/loss_avg_len_1024: 4.310] [train_eval/perplexity_len_1024: 74.408] [train_eval/loss_avg_len_512: 4.354] [train_eval/perplexity_len_512: 77.806]
|
| 264 |
+
[2025-10-26 01:42:07][train:194][INFO] Running validation...
|
| 265 |
+
[2025-10-26 01:43:37][logger:171][INFO] [step: 1468006400] [val/train_token_count: 1468006400] [val/train_batch_count: 700] [val/train_flop_count: 0] [val/train_total_time: 7922.504] [val/train_update_time: 4834.296] [val/loss: 4.268] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.493] [val/val_tokens_per_second: 452631.090] [val/loss_avg_len_2048: 4.268] [val/perplexity_len_2048: 71.372] [val/loss_avg_len_1024: 4.294] [val/perplexity_len_1024: 73.263] [val/loss_avg_len_512: 4.341] [val/perplexity_len_512: 76.801]
|
| 266 |
+
[2025-10-26 01:43:37][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001468006400.pt...
|
| 267 |
+
[2025-10-26 01:43:37][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001468006400.pt.
|
| 268 |
+
[2025-10-26 01:43:37][logger:171][INFO] [step: 1468006400] [checkpoint/checkpoint_time: 0.418]
|
| 269 |
+
[2025-10-26 01:44:47][utils:57][INFO] [P: 71.00%] [S: 1488977920/2097152000] [T: 2:14:42] [ETA: 0:55:01] [loss: 4.276] [tokens/s: 182603.460] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 270 |
+
[2025-10-26 01:45:56][utils:57][INFO] [P: 72.00%] [S: 1509949440/2097152000] [T: 2:15:51] [ETA: 0:52:50] [loss: 4.252] [tokens/s: 198359.914] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 271 |
+
[2025-10-26 01:45:56][train:194][INFO] Running validation...
|
| 272 |
+
[2025-10-26 01:47:26][logger:171][INFO] [step: 1509949440] [val/train_token_count: 1509949440] [val/train_batch_count: 720] [val/train_flop_count: 0] [val/train_total_time: 8151.724] [val/train_update_time: 4972.330] [val/loss: 4.254] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.510] [val/val_tokens_per_second: 452548.331] [val/loss_avg_len_2048: 4.254] [val/perplexity_len_2048: 70.376] [val/loss_avg_len_1024: 4.281] [val/perplexity_len_1024: 72.310] [val/loss_avg_len_512: 4.330] [val/perplexity_len_512: 75.922]
|
| 273 |
+
[2025-10-26 01:48:35][utils:57][INFO] [P: 73.00%] [S: 1530920960/2097152000] [T: 2:18:31] [ETA: 0:51:14] [loss: 4.270] [tokens/s: 182564.512] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 274 |
+
[2025-10-26 01:49:45][utils:57][INFO] [P: 74.00%] [S: 1551892480/2097152000] [T: 2:19:40] [ETA: 0:49:04] [loss: 4.255] [tokens/s: 198260.601] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 275 |
+
[2025-10-26 01:49:45][train:194][INFO] Running validation...
|
| 276 |
+
[2025-10-26 01:51:16][logger:171][INFO] [step: 1551892480] [val/train_token_count: 1551892480] [val/train_batch_count: 740] [val/train_flop_count: 0] [val/train_total_time: 8380.819] [val/train_update_time: 5110.375] [val/loss: 4.241] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.755] [val/val_tokens_per_second: 451326.849] [val/loss_avg_len_2048: 4.241] [val/perplexity_len_2048: 69.511] [val/loss_avg_len_1024: 4.269] [val/perplexity_len_1024: 71.462] [val/loss_avg_len_512: 4.319] [val/perplexity_len_512: 75.112]
|
| 277 |
+
[2025-10-26 01:52:25][utils:57][INFO] [P: 75.00%] [S: 1572864000/2097152000] [T: 2:22:20] [ETA: 0:47:26] [loss: 4.237] [tokens/s: 182421.846] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 278 |
+
[2025-10-26 01:52:25][logger:171][INFO] [step: 1572864000] [train_eval/train_token_count: 1572864000] [train_eval/train_batch_count: 750] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 8540.947] [train_eval/train_update_time: 5179.398] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.249] [train_eval/perplexity_len_2048: 70.062] [train_eval/loss_avg_len_1024: 4.274] [train_eval/perplexity_len_1024: 71.799] [train_eval/loss_avg_len_512: 4.322] [train_eval/perplexity_len_512: 75.318]
|
| 279 |
+
[2025-10-26 01:53:34][utils:57][INFO] [P: 76.00%] [S: 1593835520/2097152000] [T: 2:23:30] [ETA: 0:45:19] [loss: 4.199] [tokens/s: 198122.402] [batches/s: 0.094] [MFU: 0.000] [TFLOPS: 0.000]
|
| 280 |
+
[2025-10-26 01:53:34][train:194][INFO] Running validation...
|
| 281 |
+
[2025-10-26 01:55:05][logger:171][INFO] [step: 1593835520] [val/train_token_count: 1593835520] [val/train_batch_count: 760] [val/train_flop_count: 0] [val/train_total_time: 8610.334] [val/train_update_time: 5248.407] [val/loss: 4.231] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.346] [val/val_tokens_per_second: 453369.123] [val/loss_avg_len_2048: 4.231] [val/perplexity_len_2048: 68.757] [val/loss_avg_len_1024: 4.259] [val/perplexity_len_1024: 70.759] [val/loss_avg_len_512: 4.311] [val/perplexity_len_512: 74.496]
|
| 282 |
+
[2025-10-26 01:56:14][utils:57][INFO] [P: 77.00%] [S: 1614807040/2097152000] [T: 2:26:09] [ETA: 0:43:39] [loss: 4.263] [tokens/s: 182369.045] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 283 |
+
[2025-10-26 01:57:23][utils:57][INFO] [P: 78.00%] [S: 1635778560/2097152000] [T: 2:27:19] [ETA: 0:41:33] [loss: 4.185] [tokens/s: 198115.776] [batches/s: 0.094] [MFU: 0.000] [TFLOPS: 0.000]
|
| 284 |
+
[2025-10-26 01:57:23][train:194][INFO] Running validation...
|
| 285 |
+
[2025-10-26 01:58:54][logger:171][INFO] [step: 1635778560] [val/train_token_count: 1635778560] [val/train_batch_count: 780] [val/train_flop_count: 0] [val/train_total_time: 8839.132] [val/train_update_time: 5386.446] [val/loss: 4.220] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.385] [val/val_tokens_per_second: 453173.950] [val/loss_avg_len_2048: 4.220] [val/perplexity_len_2048: 68.063] [val/loss_avg_len_1024: 4.249] [val/perplexity_len_1024: 70.070] [val/loss_avg_len_512: 4.302] [val/perplexity_len_512: 73.822]
|
| 286 |
+
[2025-10-26 02:00:03][utils:57][INFO] [P: 79.00%] [S: 1656750080/2097152000] [T: 2:29:58] [ETA: 0:39:52] [loss: 4.214] [tokens/s: 182381.375] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 287 |
+
[2025-10-26 02:01:12][utils:57][INFO] [P: 80.00%] [S: 1677721600/2097152000] [T: 2:31:07] [ETA: 0:37:46] [loss: 4.192] [tokens/s: 198205.351] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 288 |
+
[2025-10-26 02:01:12][logger:171][INFO] [step: 1677721600] [train_eval/train_token_count: 1677721600] [train_eval/train_batch_count: 800] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 9067.838] [train_eval/train_update_time: 5524.482] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.222] [train_eval/perplexity_len_2048: 68.188] [train_eval/loss_avg_len_1024: 4.250] [train_eval/perplexity_len_1024: 70.083] [train_eval/loss_avg_len_512: 4.300] [train_eval/perplexity_len_512: 73.695]
|
| 289 |
+
[2025-10-26 02:01:12][train:194][INFO] Running validation...
|
| 290 |
+
[2025-10-26 02:02:42][logger:171][INFO] [step: 1677721600] [val/train_token_count: 1677721600] [val/train_batch_count: 800] [val/train_flop_count: 0] [val/train_total_time: 9067.838] [val/train_update_time: 5524.482] [val/loss: 4.212] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.442] [val/val_tokens_per_second: 452885.529] [val/loss_avg_len_2048: 4.212] [val/perplexity_len_2048: 67.497] [val/loss_avg_len_1024: 4.242] [val/perplexity_len_1024: 69.530] [val/loss_avg_len_512: 4.295] [val/perplexity_len_512: 73.334]
|
| 291 |
+
[2025-10-26 02:02:42][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001677721600.pt...
|
| 292 |
+
[2025-10-26 02:02:43][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001677721600.pt.
|
| 293 |
+
[2025-10-26 02:02:43][logger:171][INFO] [step: 1677721600] [checkpoint/checkpoint_time: 0.421]
|
| 294 |
+
[2025-10-26 02:03:52][utils:57][INFO] [P: 81.00%] [S: 1698693120/2097152000] [T: 2:33:47] [ETA: 0:36:04] [loss: 4.168] [tokens/s: 182382.490] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 295 |
+
[2025-10-26 02:05:01][utils:57][INFO] [P: 82.00%] [S: 1719664640/2097152000] [T: 2:34:56] [ETA: 0:34:00] [loss: 4.179] [tokens/s: 198141.293] [batches/s: 0.094] [MFU: 0.000] [TFLOPS: 0.000]
|
| 296 |
+
[2025-10-26 02:05:01][train:194][INFO] Running validation...
|
| 297 |
+
[2025-10-26 02:06:31][logger:171][INFO] [step: 1719664640] [val/train_token_count: 1719664640] [val/train_batch_count: 820] [val/train_flop_count: 0] [val/train_total_time: 9296.993] [val/train_update_time: 5662.514] [val/loss: 4.205] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.358] [val/val_tokens_per_second: 453306.293] [val/loss_avg_len_2048: 4.205] [val/perplexity_len_2048: 66.989] [val/loss_avg_len_1024: 4.234] [val/perplexity_len_1024: 69.027] [val/loss_avg_len_512: 4.288] [val/perplexity_len_512: 72.843]
|
| 298 |
+
[2025-10-26 02:07:41][utils:57][INFO] [P: 83.00%] [S: 1740636160/2097152000] [T: 2:37:36] [ETA: 0:32:16] [loss: 4.212] [tokens/s: 182420.625] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 299 |
+
[2025-10-26 02:08:50][utils:57][INFO] [P: 84.00%] [S: 1761607680/2097152000] [T: 2:38:45] [ETA: 0:30:14] [loss: 4.161] [tokens/s: 198271.199] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 300 |
+
[2025-10-26 02:08:50][train:194][INFO] Running validation...
|
| 301 |
+
[2025-10-26 02:10:20][logger:171][INFO] [step: 1761607680] [val/train_token_count: 1761607680] [val/train_batch_count: 840] [val/train_flop_count: 0] [val/train_total_time: 9525.670] [val/train_update_time: 5800.553] [val/loss: 4.198] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.729] [val/val_tokens_per_second: 451454.995] [val/loss_avg_len_2048: 4.198] [val/perplexity_len_2048: 66.576] [val/loss_avg_len_1024: 4.229] [val/perplexity_len_1024: 68.627] [val/loss_avg_len_512: 4.283] [val/perplexity_len_512: 72.466]
|
| 302 |
+
[2025-10-26 02:11:30][utils:57][INFO] [P: 85.00%] [S: 1782579200/2097152000] [T: 2:41:25] [ETA: 0:28:29] [loss: 4.230] [tokens/s: 182491.313] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 303 |
+
[2025-10-26 02:11:30][logger:171][INFO] [step: 1782579200] [train_eval/train_token_count: 1782579200] [train_eval/train_batch_count: 850] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 9685.575] [train_eval/train_update_time: 5869.583] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.195] [train_eval/perplexity_len_2048: 66.370] [train_eval/loss_avg_len_1024: 4.218] [train_eval/perplexity_len_1024: 67.917] [train_eval/loss_avg_len_512: 4.271] [train_eval/perplexity_len_512: 71.572]
|
| 304 |
+
[2025-10-26 02:12:39][utils:57][INFO] [P: 86.00%] [S: 1803550720/2097152000] [T: 2:42:34] [ETA: 0:26:27] [loss: 4.209] [tokens/s: 198284.923] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 305 |
+
[2025-10-26 02:12:39][train:194][INFO] Running validation...
|
| 306 |
+
[2025-10-26 02:14:09][logger:171][INFO] [step: 1803550720] [val/train_token_count: 1803550720] [val/train_batch_count: 860] [val/train_flop_count: 0] [val/train_total_time: 9754.736] [val/train_update_time: 5938.595] [val/loss: 4.193] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.749] [val/val_tokens_per_second: 451353.559] [val/loss_avg_len_2048: 4.193] [val/perplexity_len_2048: 66.246] [val/loss_avg_len_1024: 4.224] [val/perplexity_len_1024: 68.309] [val/loss_avg_len_512: 4.279] [val/perplexity_len_512: 72.173]
|
| 307 |
+
[2025-10-26 02:15:19][utils:57][INFO] [P: 87.00%] [S: 1824522240/2097152000] [T: 2:45:14] [ETA: 0:24:41] [loss: 4.155] [tokens/s: 182484.674] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 308 |
+
[2025-10-26 02:16:28][utils:57][INFO] [P: 88.00%] [S: 1845493760/2097152000] [T: 2:46:23] [ETA: 0:22:41] [loss: 4.173] [tokens/s: 198233.899] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 309 |
+
[2025-10-26 02:16:28][train:194][INFO] Running validation...
|
| 310 |
+
[2025-10-26 02:17:58][logger:171][INFO] [step: 1845493760] [val/train_token_count: 1845493760] [val/train_batch_count: 880] [val/train_flop_count: 0] [val/train_total_time: 9983.788] [val/train_update_time: 6076.625] [val/loss: 4.189] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.679] [val/val_tokens_per_second: 451703.453] [val/loss_avg_len_2048: 4.189] [val/perplexity_len_2048: 65.985] [val/loss_avg_len_1024: 4.220] [val/perplexity_len_1024: 68.047] [val/loss_avg_len_512: 4.276] [val/perplexity_len_512: 71.917]
|
| 311 |
+
[2025-10-26 02:19:08][utils:57][INFO] [P: 89.00%] [S: 1866465280/2097152000] [T: 2:49:03] [ETA: 0:20:53] [loss: 4.228] [tokens/s: 182438.062] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 312 |
+
[2025-10-26 02:20:17][utils:57][INFO] [P: 90.00%] [S: 1887436800/2097152000] [T: 2:50:12] [ETA: 0:18:54] [loss: 4.143] [tokens/s: 198271.009] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 313 |
+
[2025-10-26 02:20:17][logger:171][INFO] [step: 1887436800] [train_eval/train_token_count: 1887436800] [train_eval/train_batch_count: 900] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 10212.780] [train_eval/train_update_time: 6214.660] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.189] [train_eval/perplexity_len_2048: 65.932] [train_eval/loss_avg_len_1024: 4.215] [train_eval/perplexity_len_1024: 67.699] [train_eval/loss_avg_len_512: 4.269] [train_eval/perplexity_len_512: 71.441]
|
| 314 |
+
[2025-10-26 02:20:17][train:194][INFO] Running validation...
|
| 315 |
+
[2025-10-26 02:21:48][logger:171][INFO] [step: 1887436800] [val/train_token_count: 1887436800] [val/train_batch_count: 900] [val/train_flop_count: 0] [val/train_total_time: 10212.780] [val/train_update_time: 6214.660] [val/loss: 4.186] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.996] [val/val_tokens_per_second: 450130.993] [val/loss_avg_len_2048: 4.186] [val/perplexity_len_2048: 65.792] [val/loss_avg_len_1024: 4.218] [val/perplexity_len_1024: 67.865] [val/loss_avg_len_512: 4.273] [val/perplexity_len_512: 71.752]
|
| 316 |
+
[2025-10-26 02:21:48][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001887436800.pt...
|
| 317 |
+
[2025-10-26 02:21:48][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001887436800.pt.
|
| 318 |
+
[2025-10-26 02:21:48][logger:171][INFO] [step: 1887436800] [checkpoint/checkpoint_time: 0.417]
|
| 319 |
+
[2025-10-26 02:22:57][utils:57][INFO] [P: 91.00%] [S: 1908408320/2097152000] [T: 2:52:53] [ETA: 0:17:05] [loss: 4.178] [tokens/s: 182349.471] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 320 |
+
[2025-10-26 02:24:07][utils:57][INFO] [P: 92.00%] [S: 1929379840/2097152000] [T: 2:54:02] [ETA: 0:15:08] [loss: 4.206] [tokens/s: 198067.474] [batches/s: 0.094] [MFU: 0.000] [TFLOPS: 0.000]
|
| 321 |
+
[2025-10-26 02:24:07][train:194][INFO] Running validation...
|
| 322 |
+
[2025-10-26 02:25:37][logger:171][INFO] [step: 1929379840] [val/train_token_count: 1929379840] [val/train_batch_count: 920] [val/train_flop_count: 0] [val/train_total_time: 10442.504] [val/train_update_time: 6352.693] [val/loss: 4.184] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.351] [val/val_tokens_per_second: 453340.782] [val/loss_avg_len_2048: 4.184] [val/perplexity_len_2048: 65.651] [val/loss_avg_len_1024: 4.216] [val/perplexity_len_1024: 67.731] [val/loss_avg_len_512: 4.272] [val/perplexity_len_512: 71.633]
|
| 323 |
+
[2025-10-26 02:26:46][utils:57][INFO] [P: 93.00%] [S: 1950351360/2097152000] [T: 2:56:42] [ETA: 0:13:18] [loss: 4.191] [tokens/s: 182351.845] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 324 |
+
[2025-10-26 02:27:55][utils:57][INFO] [P: 94.00%] [S: 1971322880/2097152000] [T: 2:57:51] [ETA: 0:11:21] [loss: 4.148] [tokens/s: 198138.717] [batches/s: 0.094] [MFU: 0.000] [TFLOPS: 0.000]
|
| 325 |
+
[2025-10-26 02:27:55][train:194][INFO] Running validation...
|
| 326 |
+
[2025-10-26 02:29:26][logger:171][INFO] [step: 1971322880] [val/train_token_count: 1971322880] [val/train_batch_count: 940] [val/train_flop_count: 0] [val/train_total_time: 10671.172] [val/train_update_time: 6490.743] [val/loss: 4.183] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.350] [val/val_tokens_per_second: 453345.845] [val/loss_avg_len_2048: 4.183] [val/perplexity_len_2048: 65.564] [val/loss_avg_len_1024: 4.214] [val/perplexity_len_1024: 67.646] [val/loss_avg_len_512: 4.270] [val/perplexity_len_512: 71.551]
|
| 327 |
+
[2025-10-26 02:30:35][utils:57][INFO] [P: 95.00%] [S: 1992294400/2097152000] [T: 3:00:30] [ETA: 0:09:30] [loss: 4.170] [tokens/s: 182410.554] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 328 |
+
[2025-10-26 02:30:35][logger:171][INFO] [step: 1992294400] [train_eval/train_token_count: 1992294400] [train_eval/train_batch_count: 950] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 10830.689] [train_eval/train_update_time: 6559.780] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.176] [train_eval/perplexity_len_2048: 65.117] [train_eval/loss_avg_len_1024: 4.206] [train_eval/perplexity_len_1024: 67.086] [train_eval/loss_avg_len_512: 4.259] [train_eval/perplexity_len_512: 70.759]
|
| 329 |
+
[2025-10-26 02:31:44][utils:57][INFO] [P: 96.00%] [S: 2013265920/2097152000] [T: 3:01:39] [ETA: 0:07:34] [loss: 4.169] [tokens/s: 198210.554] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 330 |
+
[2025-10-26 02:31:44][train:194][INFO] Running validation...
|
| 331 |
+
[2025-10-26 02:33:14][logger:171][INFO] [step: 2013265920] [val/train_token_count: 2013265920] [val/train_batch_count: 960] [val/train_flop_count: 0] [val/train_total_time: 10899.878] [val/train_update_time: 6628.827] [val/loss: 4.182] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.304] [val/val_tokens_per_second: 453579.534] [val/loss_avg_len_2048: 4.182] [val/perplexity_len_2048: 65.512] [val/loss_avg_len_1024: 4.214] [val/perplexity_len_1024: 67.593] [val/loss_avg_len_512: 4.270] [val/perplexity_len_512: 71.498]
|
| 332 |
+
[2025-10-26 02:34:23][utils:57][INFO] [P: 97.00%] [S: 2034237440/2097152000] [T: 3:04:19] [ETA: 0:05:42] [loss: 4.193] [tokens/s: 182477.451] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
|
| 333 |
+
[2025-10-26 02:35:33][utils:57][INFO] [P: 98.00%] [S: 2055208960/2097152000] [T: 3:05:28] [ETA: 0:03:47] [loss: 4.168] [tokens/s: 198277.075] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
|
| 334 |
+
[2025-10-26 02:35:33][train:194][INFO] Running validation...
|
| 335 |
+
[2025-10-26 02:37:03][logger:171][INFO] [step: 2055208960] [val/train_token_count: 2055208960] [val/train_batch_count: 980] [val/train_flop_count: 0] [val/train_total_time: 11128.508] [val/train_update_time: 6766.881] [val/loss: 4.182] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.112] [val/val_tokens_per_second: 454544.122] [val/loss_avg_len_2048: 4.182] [val/perplexity_len_2048: 65.491] [val/loss_avg_len_1024: 4.213] [val/perplexity_len_1024: 67.574] [val/loss_avg_len_512: 4.269] [val/perplexity_len_512: 71.481]
|
| 336 |
+
[2025-10-26 02:37:03][train:854][INFO] Training finished with 2055208960 tokens!
|
metrics/jsonlines/checkpoint.jsonl
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
-
{"step": 209715200, "checkpoint/checkpoint_time": 0.
|
| 2 |
-
{"step": 419430400, "checkpoint/checkpoint_time": 0.
|
| 3 |
-
{"step": 629145600, "checkpoint/checkpoint_time": 0.
|
| 4 |
-
{"step": 838860800, "checkpoint/checkpoint_time": 0.
|
| 5 |
-
{"step": 1048576000, "checkpoint/checkpoint_time": 0.
|
| 6 |
-
{"step": 1258291200, "checkpoint/checkpoint_time": 0.
|
| 7 |
-
{"step": 1468006400, "checkpoint/checkpoint_time": 0.
|
| 8 |
-
{"step": 1677721600, "checkpoint/checkpoint_time": 0.
|
| 9 |
-
{"step": 1887436800, "checkpoint/checkpoint_time": 0.
|
|
|
|
| 1 |
+
{"step": 209715200, "checkpoint/checkpoint_time": 0.42509283899562433}
|
| 2 |
+
{"step": 419430400, "checkpoint/checkpoint_time": 0.4211389650008641}
|
| 3 |
+
{"step": 629145600, "checkpoint/checkpoint_time": 0.41736824897816405}
|
| 4 |
+
{"step": 838860800, "checkpoint/checkpoint_time": 0.42778749897843227}
|
| 5 |
+
{"step": 1048576000, "checkpoint/checkpoint_time": 0.41318527003750205}
|
| 6 |
+
{"step": 1258291200, "checkpoint/checkpoint_time": 0.42481468996265903}
|
| 7 |
+
{"step": 1468006400, "checkpoint/checkpoint_time": 0.4177210059715435}
|
| 8 |
+
{"step": 1677721600, "checkpoint/checkpoint_time": 0.4209941530134529}
|
| 9 |
+
{"step": 1887436800, "checkpoint/checkpoint_time": 0.41716638999059796}
|
metrics/jsonlines/throughput.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
metrics/jsonlines/train.jsonl
CHANGED
|
@@ -1,98 +1,98 @@
|
|
| 1 |
-
{"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time":
|
| 2 |
-
{"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 142.
|
| 3 |
-
{"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time":
|
| 4 |
-
{"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time":
|
| 5 |
-
{"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 530.
|
| 6 |
-
{"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 599.
|
| 7 |
-
{"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time":
|
| 8 |
-
{"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time":
|
| 9 |
-
{"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time":
|
| 10 |
-
{"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time":
|
| 11 |
-
{"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time":
|
| 12 |
-
{"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time":
|
| 13 |
-
{"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time":
|
| 14 |
-
{"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time":
|
| 15 |
-
{"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time":
|
| 16 |
-
{"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time":
|
| 17 |
-
{"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time":
|
| 18 |
-
{"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time":
|
| 19 |
-
{"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time":
|
| 20 |
-
{"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time":
|
| 21 |
-
{"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time":
|
| 22 |
-
{"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time":
|
| 23 |
-
{"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time":
|
| 24 |
-
{"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time":
|
| 25 |
-
{"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time":
|
| 26 |
-
{"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time":
|
| 27 |
-
{"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time":
|
| 28 |
-
{"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time":
|
| 29 |
-
{"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time":
|
| 30 |
-
{"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time":
|
| 31 |
-
{"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time":
|
| 32 |
-
{"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time":
|
| 33 |
-
{"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time":
|
| 34 |
-
{"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time":
|
| 35 |
-
{"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time":
|
| 36 |
-
{"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time":
|
| 37 |
-
{"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time":
|
| 38 |
-
{"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time":
|
| 39 |
-
{"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time":
|
| 40 |
-
{"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time":
|
| 41 |
-
{"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time":
|
| 42 |
-
{"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time":
|
| 43 |
-
{"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time":
|
| 44 |
-
{"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time":
|
| 45 |
-
{"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time":
|
| 46 |
-
{"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time":
|
| 47 |
-
{"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time":
|
| 48 |
-
{"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time":
|
| 49 |
-
{"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time":
|
| 50 |
-
{"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time":
|
| 51 |
-
{"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time":
|
| 52 |
-
{"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time":
|
| 53 |
-
{"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time":
|
| 54 |
-
{"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time":
|
| 55 |
-
{"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time":
|
| 56 |
-
{"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time":
|
| 57 |
-
{"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time":
|
| 58 |
-
{"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time":
|
| 59 |
-
{"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time":
|
| 60 |
-
{"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time":
|
| 61 |
-
{"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time":
|
| 62 |
-
{"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time":
|
| 63 |
-
{"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time":
|
| 64 |
-
{"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time":
|
| 65 |
-
{"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time":
|
| 66 |
-
{"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time":
|
| 67 |
-
{"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time":
|
| 68 |
-
{"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time":
|
| 69 |
-
{"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time":
|
| 70 |
-
{"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time":
|
| 71 |
-
{"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time":
|
| 72 |
-
{"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time":
|
| 73 |
-
{"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time":
|
| 74 |
-
{"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time":
|
| 75 |
-
{"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time":
|
| 76 |
-
{"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time":
|
| 77 |
-
{"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time":
|
| 78 |
-
{"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time":
|
| 79 |
-
{"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time":
|
| 80 |
-
{"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time":
|
| 81 |
-
{"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time":
|
| 82 |
-
{"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time":
|
| 83 |
-
{"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time":
|
| 84 |
-
{"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time":
|
| 85 |
-
{"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time":
|
| 86 |
-
{"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time":
|
| 87 |
-
{"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time":
|
| 88 |
-
{"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time":
|
| 89 |
-
{"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time":
|
| 90 |
-
{"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time":
|
| 91 |
-
{"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time":
|
| 92 |
-
{"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time":
|
| 93 |
-
{"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time":
|
| 94 |
-
{"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time":
|
| 95 |
-
{"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time":
|
| 96 |
-
{"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time":
|
| 97 |
-
{"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time":
|
| 98 |
-
{"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time":
|
|
|
|
| 1 |
+
{"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 73.10528613603674, "train/update_time": 72.90000317717204, "train/lr": 0.0009000000000000001, "train/loss": 9.761818885803223, "train/global_grad_norm": 1.2346482276916504}
|
| 2 |
+
{"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 142.2743778140284, "train/update_time": 141.94980711926473, "train/lr": 0.0009997960964140947, "train/loss": 8.126625061035156, "train/global_grad_norm": 0.962837278842926}
|
| 3 |
+
{"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 301.7193861359847, "train/update_time": 210.97775525326142, "train/lr": 0.0009990914580222257, "train/loss": 7.519778728485107, "train/global_grad_norm": 0.5695855021476746}
|
| 4 |
+
{"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 370.84139602299547, "train/update_time": 279.98310620122356, "train/lr": 0.0009978842768382998, "train/loss": 7.193304061889648, "train/global_grad_norm": 0.4217643439769745}
|
| 5 |
+
{"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 530.2946584849851, "train/update_time": 348.9749472962576, "train/lr": 0.0009961757683914405, "train/loss": 6.9472150802612305, "train/global_grad_norm": 0.26760002970695496}
|
| 6 |
+
{"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 599.4013552149991, "train/update_time": 417.94741392938886, "train/lr": 0.00099396765300483, "train/loss": 6.68041467666626, "train/global_grad_norm": 0.31579363346099854}
|
| 7 |
+
{"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 758.8880543989944, "train/update_time": 486.9278290383518, "train/lr": 0.0009912621540634887, "train/loss": 6.480125904083252, "train/global_grad_norm": 0.26012396812438965}
|
| 8 |
+
{"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 827.9917874370003, "train/update_time": 555.9067850944703, "train/lr": 0.000988061995775515, "train/loss": 6.281551837921143, "train/global_grad_norm": 0.39679110050201416}
|
| 9 |
+
{"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 987.436578762019, "train/update_time": 624.905722066469, "train/lr": 0.0009843704004290394, "train/loss": 6.122912406921387, "train/global_grad_norm": 1.23171067237854}
|
| 10 |
+
{"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 1056.5339453950291, "train/update_time": 693.8810286904918, "train/lr": 0.0009801910851476522, "train/loss": 5.9722723960876465, "train/global_grad_norm": 0.3574962913990021}
|
| 11 |
+
{"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 1216.3542258110247, "train/update_time": 762.8519944375148, "train/lr": 0.0009755282581475768, "train/loss": 5.849911212921143, "train/global_grad_norm": 0.38126564025878906}
|
| 12 |
+
{"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 1285.4777986719855, "train/update_time": 831.8405811304692, "train/lr": 0.0009703866145003512, "train/loss": 5.7178874015808105, "train/global_grad_norm": 0.6952179670333862}
|
| 13 |
+
{"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 1444.8941518919892, "train/update_time": 900.8191652273526, "train/lr": 0.0009647713314052896, "train/loss": 5.644232749938965, "train/global_grad_norm": 0.34717857837677}
|
| 14 |
+
{"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 1514.0213319549803, "train/update_time": 969.8139603384188, "train/lr": 0.0009586880629764817, "train/loss": 5.570384502410889, "train/global_grad_norm": 0.6765910983085632}
|
| 15 |
+
{"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 1673.719632464985, "train/update_time": 1038.808761139284, "train/lr": 0.0009521429345495787, "train/loss": 5.444611072540283, "train/global_grad_norm": 0.4169935882091522}
|
| 16 |
+
{"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 1742.8283430220326, "train/update_time": 1107.798082921363, "train/lr": 0.0009451425365140996, "train/loss": 5.40510368347168, "train/global_grad_norm": 0.709697961807251}
|
| 17 |
+
{"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 1902.3611605030019, "train/update_time": 1176.7872661272995, "train/lr": 0.000937693917677468, "train/loss": 5.298379421234131, "train/global_grad_norm": 0.35993462800979614}
|
| 18 |
+
{"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 1971.4918935780297, "train/update_time": 1245.7879514921806, "train/lr": 0.0009298045781674596, "train/loss": 5.267183303833008, "train/global_grad_norm": 0.45855849981307983}
|
| 19 |
+
{"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 2130.959315415006, "train/update_time": 1314.7911676210933, "train/lr": 0.0009214824618802108, "train/loss": 5.240725994110107, "train/global_grad_norm": 0.45877301692962646}
|
| 20 |
+
{"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 2200.08279568702, "train/update_time": 1383.788816100161, "train/lr": 0.000912735948481387, "train/loss": 5.148595809936523, "train/global_grad_norm": 0.5232999920845032}
|
| 21 |
+
{"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 2360.0452178320265, "train/update_time": 1452.785164519155, "train/lr": 0.0009035738449685707, "train/loss": 5.102267742156982, "train/global_grad_norm": 0.40673965215682983}
|
| 22 |
+
{"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 2429.163681267004, "train/update_time": 1521.7770175782498, "train/lr": 0.0008940053768033609, "train/loss": 5.072765827178955, "train/global_grad_norm": 0.540256679058075}
|
| 23 |
+
{"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 2589.090652415005, "train/update_time": 1590.7705863612937, "train/lr": 0.0008840401786221159, "train/loss": 5.013406276702881, "train/global_grad_norm": 0.4202441871166229}
|
| 24 |
+
{"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 2658.232374906016, "train/update_time": 1659.7850940762437, "train/lr": 0.0008736882845346905, "train/loss": 4.965211868286133, "train/global_grad_norm": 0.5850781798362732}
|
| 25 |
+
{"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 2818.422068757005, "train/update_time": 1728.7950774162891, "train/lr": 0.0008629601180209381, "train/loss": 4.961833477020264, "train/global_grad_norm": 0.6340895295143127}
|
| 26 |
+
{"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 2887.563738073979, "train/update_time": 1797.8180341873667, "train/lr": 0.0008518664814351503, "train/loss": 4.912302017211914, "train/global_grad_norm": 0.5044277310371399}
|
| 27 |
+
{"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 3047.4022633209825, "train/update_time": 1866.8200742353802, "train/lr": 0.0008404185451290017, "train/loss": 4.897612571716309, "train/global_grad_norm": 0.4688912034034729}
|
| 28 |
+
{"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 3116.5153146539815, "train/update_time": 1935.8127364134416, "train/lr": 0.0008286278362039527, "train/loss": 4.848834037780762, "train/global_grad_norm": 0.6365319490432739}
|
| 29 |
+
{"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 3276.0374568690313, "train/update_time": 2004.8135134153417, "train/lr": 0.0008165062269044352, "train/loss": 4.8169732093811035, "train/global_grad_norm": 0.4134746789932251}
|
| 30 |
+
{"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 3345.161585650989, "train/update_time": 2073.822410382272, "train/lr": 0.0008040659226635089, "train/loss": 4.79654598236084, "train/global_grad_norm": 0.5643511414527893}
|
| 31 |
+
{"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 3504.929979883018, "train/update_time": 2142.822337330319, "train/lr": 0.0007913194498130252, "train/loss": 4.810868740081787, "train/global_grad_norm": 0.47013285756111145}
|
| 32 |
+
{"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 3574.040140778001, "train/update_time": 2211.8163213434746, "train/lr": 0.000778279642970672, "train/loss": 4.74250602722168, "train/global_grad_norm": 0.5142323970794678}
|
| 33 |
+
{"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 3733.7000970160007, "train/update_time": 2280.8237289965036, "train/lr": 0.0007649596321166025, "train/loss": 4.759753704071045, "train/global_grad_norm": 0.5028547644615173}
|
| 34 |
+
{"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 3802.836212512979, "train/update_time": 2349.8270930235158, "train/lr": 0.0007513728293726579, "train/loss": 4.724730491638184, "train/global_grad_norm": 0.5188063383102417}
|
| 35 |
+
{"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 3962.514591771993, "train/update_time": 2418.8274328135885, "train/lr": 0.0007375329154974975, "train/loss": 4.704092502593994, "train/global_grad_norm": 0.4179239571094513}
|
| 36 |
+
{"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 4031.6405439740047, "train/update_time": 2487.827474080492, "train/lr": 0.0007234538261112341, "train/loss": 4.630825042724609, "train/global_grad_norm": 0.4399227201938629}
|
| 37 |
+
{"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 4191.252879804, "train/update_time": 2556.8249263644684, "train/lr": 0.0007091497376634464, "train/loss": 4.655548095703125, "train/global_grad_norm": 0.45650508999824524}
|
| 38 |
+
{"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 4260.37410283502, "train/update_time": 2625.818530491437, "train/lr": 0.0006946350531586958, "train/loss": 4.63443660736084, "train/global_grad_norm": 0.4673406481742859}
|
| 39 |
+
{"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 4420.292360802006, "train/update_time": 2694.823069378559, "train/lr": 0.0006799243876539214, "train/loss": 4.639521598815918, "train/global_grad_norm": 0.5377744436264038}
|
| 40 |
+
{"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 4489.443470049009, "train/update_time": 2763.8374379616, "train/lr": 0.0006650325535423166, "train/loss": 4.547835826873779, "train/global_grad_norm": 0.5047109127044678}
|
| 41 |
+
{"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 4649.7579026630265, "train/update_time": 2832.8449664485524, "train/lr": 0.0006499745456385053, "train/loss": 4.572357654571533, "train/global_grad_norm": 0.6879011392593384}
|
| 42 |
+
{"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 4718.872143707995, "train/update_time": 2901.8397621414624, "train/lr": 0.0006347655260800339, "train/loss": 4.565418720245361, "train/global_grad_norm": 0.428315132856369}
|
| 43 |
+
{"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 4878.879235523986, "train/update_time": 2970.8549301693565, "train/lr": 0.0006194208090603844, "train/loss": 4.560233116149902, "train/global_grad_norm": 0.45447441935539246}
|
| 44 |
+
{"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 4947.998132903012, "train/update_time": 3039.859961154347, "train/lr": 0.0006039558454088796, "train/loss": 4.5870771408081055, "train/global_grad_norm": 0.7089611887931824}
|
| 45 |
+
{"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 5107.660124632996, "train/update_time": 3108.876222961524, "train/lr": 0.0005883862070330078, "train/loss": 4.5283427238464355, "train/global_grad_norm": 0.4208521842956543}
|
| 46 |
+
{"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 5176.88071361999, "train/update_time": 3177.899590641551, "train/lr": 0.0005727275712388317, "train/loss": 4.496908187866211, "train/global_grad_norm": 0.6397818922996521}
|
| 47 |
+
{"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 5336.778706843033, "train/update_time": 3246.908200990525, "train/lr": 0.0005569957049452703, "train/loss": 4.518903732299805, "train/global_grad_norm": 0.5339348316192627}
|
| 48 |
+
{"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 5405.913057841011, "train/update_time": 3315.925435980549, "train/lr": 0.0005412064488081482, "train/loss": 4.495401382446289, "train/global_grad_norm": 0.47157326340675354}
|
| 49 |
+
{"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 5565.781587979989, "train/update_time": 3384.9316106255865, "train/lr": 0.0005253757012699972, "train/loss": 4.490736484527588, "train/global_grad_norm": 0.5239655375480652}
|
| 50 |
+
{"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 5634.922143466014, "train/update_time": 3453.9653959476273, "train/lr": 0.0005095194025516734, "train/loss": 4.4643659591674805, "train/global_grad_norm": 0.5247243642807007}
|
| 51 |
+
{"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 5794.91904747003, "train/update_time": 3522.983512793493, "train/lr": 0.0004936535186019053, "train/loss": 4.463287353515625, "train/global_grad_norm": 0.4336317479610443}
|
| 52 |
+
{"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 5864.05088400899, "train/update_time": 3591.9979424396297, "train/lr": 0.00047779402502093696, "train/loss": 4.457107067108154, "train/global_grad_norm": 0.6947441101074219}
|
| 53 |
+
{"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 6023.732085202006, "train/update_time": 3661.0160827066866, "train/lr": 0.0004619568909744525, "train/loss": 4.4143757820129395, "train/global_grad_norm": 0.45258453488349915}
|
| 54 |
+
{"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 6092.851395011006, "train/update_time": 3730.020544492756, "train/lr": 0.00044615806311398067, "train/loss": 4.424180030822754, "train/global_grad_norm": 0.4154273271560669}
|
| 55 |
+
{"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 6252.330847234989, "train/update_time": 3799.045364828722, "train/lr": 0.0004304134495199673, "train/loss": 4.3700270652771, "train/global_grad_norm": 0.3898273706436157}
|
| 56 |
+
{"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 6321.474026971031, "train/update_time": 3868.0566598027945, "train/lr": 0.0004147389036836882, "train/loss": 4.413632869720459, "train/global_grad_norm": 0.5425747036933899}
|
| 57 |
+
{"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 6480.964678196993, "train/update_time": 3937.06809708284, "train/lr": 0.0003991502085441259, "train/loss": 4.3622026443481445, "train/global_grad_norm": 0.45439326763153076}
|
| 58 |
+
{"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 6550.097681444022, "train/update_time": 4006.072604118788, "train/lr": 0.0003836630605958888, "train/loss": 4.410221576690674, "train/global_grad_norm": 0.4280547499656677}
|
| 59 |
+
{"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 6709.588647256023, "train/update_time": 4075.0957091488526, "train/lr": 0.00036829305408417155, "train/loss": 4.391324520111084, "train/global_grad_norm": 0.42996275424957275}
|
| 60 |
+
{"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 6778.730279813986, "train/update_time": 4144.1211769738, "train/lr": 0.000353055665302672, "train/loss": 4.390552997589111, "train/global_grad_norm": 0.6177342534065247}
|
| 61 |
+
{"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 6938.661827062024, "train/update_time": 4213.1262717307545, "train/lr": 0.0003379662370102746, "train/loss": 4.355296611785889, "train/global_grad_norm": 0.445901095867157}
|
| 62 |
+
{"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 7007.788445023994, "train/update_time": 4282.135617908789, "train/lr": 0.00032303996298219405, "train/loss": 4.329927444458008, "train/global_grad_norm": 0.4848615527153015}
|
| 63 |
+
{"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 7167.26421629102, "train/update_time": 4351.144582907902, "train/lr": 0.00030829187271113034, "train/loss": 4.3402838706970215, "train/global_grad_norm": 0.42915236949920654}
|
| 64 |
+
{"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 7236.399417778011, "train/update_time": 4420.153171113925, "train/lr": 0.0002937368162738445, "train/loss": 4.330328464508057, "train/global_grad_norm": 0.44172123074531555}
|
| 65 |
+
{"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 7395.902606271033, "train/update_time": 4489.238088100974, "train/lr": 0.0002793894493783894, "train/loss": 4.3035969734191895, "train/global_grad_norm": 0.4424532651901245}
|
| 66 |
+
{"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 7465.047058130032, "train/update_time": 4558.262438459904, "train/lr": 0.00026526421860705474, "train/loss": 4.325634956359863, "train/global_grad_norm": 0.4446793496608734}
|
| 67 |
+
{"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 7624.619702999014, "train/update_time": 4627.277077015955, "train/lr": 0.0002513753468698824, "train/loss": 4.269580841064453, "train/global_grad_norm": 0.4529637098312378}
|
| 68 |
+
{"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 7693.75585325103, "train/update_time": 4696.292382065032, "train/lr": 0.00023773681908340283, "train/loss": 4.283663749694824, "train/global_grad_norm": 0.445527583360672}
|
| 69 |
+
{"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 7853.381023978989, "train/update_time": 4765.2889736949, "train/lr": 0.00022436236808900823, "train/loss": 4.284794807434082, "train/global_grad_norm": 0.37836042046546936}
|
| 70 |
+
{"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 7922.504303377005, "train/update_time": 4834.2961148990435, "train/lr": 0.00021126546082514682, "train/loss": 4.279749870300293, "train/global_grad_norm": 0.3362836241722107}
|
| 71 |
+
{"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 8082.5694638509885, "train/update_time": 4903.308512775111, "train/lr": 0.00019845928476725522, "train/loss": 4.276471138000488, "train/global_grad_norm": 0.3601376414299011}
|
| 72 |
+
{"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 8151.724266513018, "train/update_time": 4972.329616118164, "train/lr": 0.0001859567346490913, "train/loss": 4.2520365715026855, "train/global_grad_norm": 0.3764491081237793}
|
| 73 |
+
{"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 8311.420260019018, "train/update_time": 5041.351476673037, "train/lr": 0.00017377039947882782, "train/loss": 4.269729137420654, "train/global_grad_norm": 0.3962520360946655}
|
| 74 |
+
{"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 8380.818926771986, "train/update_time": 5110.374592272972, "train/lr": 0.00016191254986299043, "train/loss": 4.254550933837891, "train/global_grad_norm": 0.357697457075119}
|
| 75 |
+
{"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 8540.947155133996, "train/update_time": 5179.397799044964, "train/lr": 0.00015039512565099468, "train/loss": 4.237186431884766, "train/global_grad_norm": 0.34904253482818604}
|
| 76 |
+
{"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 8610.333553528006, "train/update_time": 5248.407137244998, "train/lr": 0.00013922972391273224, "train/loss": 4.198566436767578, "train/global_grad_norm": 0.3618724048137665}
|
| 77 |
+
{"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 8769.990720756003, "train/update_time": 5317.427211401926, "train/lr": 0.00012842758726130281, "train/loss": 4.263113975524902, "train/global_grad_norm": 0.3145442306995392}
|
| 78 |
+
{"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 8839.131734684983, "train/update_time": 5386.445536848798, "train/lr": 0.00011799959253265679, "train/loss": 4.1848530769348145, "train/global_grad_norm": 0.3598962128162384}
|
| 79 |
+
{"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 8998.676407739986, "train/update_time": 5455.463913362706, "train/lr": 0.00010795623983354214, "train/loss": 4.2140374183654785, "train/global_grad_norm": 0.3123509883880615}
|
| 80 |
+
{"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 9067.837796505017, "train/update_time": 5524.4823192786425, "train/lr": 9.830764196878872e-05, "train/loss": 4.1917405128479, "train/global_grad_norm": 0.31881648302078247}
|
| 81 |
+
{"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 9227.859726689, "train/update_time": 5593.497986814589, "train/lr": 8.906351425856951e-05, "train/loss": 4.167685508728027, "train/global_grad_norm": 0.29552316665649414}
|
| 82 |
+
{"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 9296.993160478014, "train/update_time": 5662.513914024632, "train/lr": 8.02331647558977e-05, "train/loss": 4.179322242736816, "train/global_grad_norm": 0.281093567609787}
|
| 83 |
+
{"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 9456.529325363983, "train/update_time": 5731.5381157496595, "train/lr": 7.182548487420554e-05, "train/loss": 4.211834907531738, "train/global_grad_norm": 0.29659828543663025}
|
| 84 |
+
{"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 9525.670015413023, "train/update_time": 5800.553074025724, "train/lr": 6.384894043444556e-05, "train/loss": 4.1608757972717285, "train/global_grad_norm": 0.29815351963043213}
|
| 85 |
+
{"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 9685.574564223003, "train/update_time": 5869.583213592763, "train/lr": 5.6311563140726166e-05, "train/loss": 4.230018138885498, "train/global_grad_norm": 0.2653578221797943}
|
| 86 |
+
{"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 9754.735574804014, "train/update_time": 5938.594816011784, "train/lr": 4.922094249306547e-05, "train/loss": 4.209297180175781, "train/global_grad_norm": 0.2605638802051544}
|
| 87 |
+
{"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 9914.648321971006, "train/update_time": 6007.609873749723, "train/lr": 4.2584218145409916e-05, "train/loss": 4.1548752784729, "train/global_grad_norm": 0.2570478022098541}
|
| 88 |
+
{"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 9983.788254795014, "train/update_time": 6076.624669670709, "train/lr": 3.6408072716606236e-05, "train/loss": 4.172904968261719, "train/global_grad_norm": 0.2740459740161896}
|
| 89 |
+
{"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 10143.620604291034, "train/update_time": 6145.640486821707, "train/lr": 3.069872506157217e-05, "train/loss": 4.228043079376221, "train/global_grad_norm": 0.25757673382759094}
|
| 90 |
+
{"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 10212.780399380019, "train/update_time": 6214.659892122727, "train/lr": 2.5461924009435368e-05, "train/loss": 4.143199920654297, "train/global_grad_norm": 0.2552241086959839}
|
| 91 |
+
{"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 10373.353445779998, "train/update_time": 6283.676360294805, "train/lr": 2.0702942574950812e-05, "train/loss": 4.177771091461182, "train/global_grad_norm": 0.24890665709972382}
|
| 92 |
+
{"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 10442.503988512035, "train/update_time": 6352.692734938697, "train/lr": 1.642657264902142e-05, "train/loss": 4.206305027008057, "train/global_grad_norm": 0.23305842280387878}
|
| 93 |
+
{"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 10602.015328517009, "train/update_time": 6421.7104710137355, "train/lr": 1.2637120173670358e-05, "train/loss": 4.190739154815674, "train/global_grad_norm": 0.22044338285923004}
|
| 94 |
+
{"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 10671.172117165988, "train/update_time": 6490.743322437804, "train/lr": 9.338400806321978e-06, "train/loss": 4.147926330566406, "train/global_grad_norm": 0.22512836754322052}
|
| 95 |
+
{"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 10830.689001108985, "train/update_time": 6559.779601458809, "train/lr": 6.533736077758867e-06, "train/loss": 4.170260429382324, "train/global_grad_norm": 0.22401364147663116}
|
| 96 |
+
{"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 10899.877774502034, "train/update_time": 6628.827098570764, "train/lr": 4.2259500476214406e-06, "train/loss": 4.168946266174316, "train/global_grad_norm": 0.215094193816185}
|
| 97 |
+
{"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 11059.34338616603, "train/update_time": 6697.84850771277, "train/lr": 2.417366460819359e-06, "train/loss": 4.192867755889893, "train/global_grad_norm": 0.21194864809513092}
|
| 98 |
+
{"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 11128.507790005999, "train/update_time": 6766.880580478697, "train/lr": 1.1098064077174619e-06, "train/loss": 4.168134689331055, "train/global_grad_norm": 0.20849043130874634}
|
metrics/jsonlines/train_eval.jsonl
CHANGED
|
@@ -1,19 +1,19 @@
|
|
| 1 |
-
{"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 530.
|
| 2 |
-
{"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 3 |
-
{"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 4 |
-
{"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 5 |
-
{"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 6 |
-
{"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 7 |
-
{"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 8 |
-
{"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 9 |
-
{"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 10 |
-
{"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 11 |
-
{"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 12 |
-
{"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 13 |
-
{"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 14 |
-
{"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 15 |
-
{"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 16 |
-
{"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 17 |
-
{"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 18 |
-
{"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 19 |
-
{"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
|
|
|
| 1 |
+
{"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 530.2946584849851, "train_eval/train_update_time": 348.9749472962576, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.262765104495848, "train_eval/perplexity_len_2048": 3876.7990479882474, "train_eval/loss_avg_len_1024": 8.26361274068222, "train_eval/perplexity_len_1024": 3880.086556257262, "train_eval/loss_avg_len_512": 8.264419558200608, "train_eval/perplexity_len_512": 3883.218341283336}
|
| 2 |
+
{"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1056.5339453950291, "train_eval/train_update_time": 693.8810286904918, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.399099997472659, "train_eval/perplexity_len_2048": 601.3036194924304, "train_eval/loss_avg_len_1024": 6.403366397288846, "train_eval/perplexity_len_1024": 603.8745014496265, "train_eval/loss_avg_len_512": 6.409683007578133, "train_eval/perplexity_len_512": 607.7010139099035}
|
| 3 |
+
{"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1673.719632464985, "train_eval/train_update_time": 1038.808761139284, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.693106125889135, "train_eval/perplexity_len_2048": 296.8141323485163, "train_eval/loss_avg_len_1024": 5.698990526291018, "train_eval/perplexity_len_1024": 298.5658544105799, "train_eval/loss_avg_len_512": 5.710699294427177, "train_eval/perplexity_len_512": 302.08223886516663}
|
| 4 |
+
{"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2200.08279568702, "train_eval/train_update_time": 1383.788816100161, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.296922603823786, "train_eval/perplexity_len_2048": 199.7212419010431, "train_eval/loss_avg_len_1024": 5.305337436088958, "train_eval/perplexity_len_1024": 201.40895359804367, "train_eval/loss_avg_len_512": 5.320490509328956, "train_eval/perplexity_len_512": 204.48415878511435}
|
| 5 |
+
{"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2818.422068757005, "train_eval/train_update_time": 1728.7950774162891, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.045304426316234, "train_eval/perplexity_len_2048": 155.29156684287375, "train_eval/loss_avg_len_1024": 5.053415232166262, "train_eval/perplexity_len_1024": 156.55622837075202, "train_eval/loss_avg_len_512": 5.070610678311423, "train_eval/perplexity_len_512": 159.27156133906544}
|
| 6 |
+
{"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3345.161585650989, "train_eval/train_update_time": 2073.822410382272, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.881278812076035, "train_eval/perplexity_len_2048": 131.79910244606154, "train_eval/loss_avg_len_1024": 4.889370008184379, "train_eval/perplexity_len_1024": 132.86984076618447, "train_eval/loss_avg_len_512": 4.908291251527554, "train_eval/perplexity_len_512": 135.40783867497828}
|
| 7 |
+
{"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3962.514591771993, "train_eval/train_update_time": 2418.8274328135885, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.752543167285239, "train_eval/perplexity_len_2048": 115.87860879757943, "train_eval/loss_avg_len_1024": 4.763826194274043, "train_eval/perplexity_len_1024": 117.19347414945925, "train_eval/loss_avg_len_512": 4.785651780011176, "train_eval/perplexity_len_512": 119.77940747075029}
|
| 8 |
+
{"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4489.443470049009, "train_eval/train_update_time": 2763.8374379616, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.653148743151705, "train_eval/perplexity_len_2048": 104.91481583709675, "train_eval/loss_avg_len_1024": 4.6641259991965125, "train_eval/perplexity_len_1024": 106.07283695212364, "train_eval/loss_avg_len_512": 4.687856853806879, "train_eval/perplexity_len_512": 108.62014133645553}
|
| 9 |
+
{"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5107.660124632996, "train_eval/train_update_time": 3108.876222961524, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.56949279251452, "train_eval/perplexity_len_2048": 96.49515429403105, "train_eval/loss_avg_len_1024": 4.584133220926888, "train_eval/perplexity_len_1024": 97.91827683495046, "train_eval/loss_avg_len_512": 4.612269650588205, "train_eval/perplexity_len_512": 100.7124725543258}
|
| 10 |
+
{"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5634.922143466014, "train_eval/train_update_time": 3453.9653959476273, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.500111393837432, "train_eval/perplexity_len_2048": 90.02715921272548, "train_eval/loss_avg_len_1024": 4.5146006559921075, "train_eval/perplexity_len_1024": 91.34108222421936, "train_eval/loss_avg_len_512": 4.545015140839531, "train_eval/perplexity_len_512": 94.16185288811836}
|
| 11 |
+
{"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6252.330847234989, "train_eval/train_update_time": 3799.045364828722, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.434400985772954, "train_eval/perplexity_len_2048": 84.30161189591196, "train_eval/loss_avg_len_1024": 4.448710203694063, "train_eval/perplexity_len_1024": 85.51657387892722, "train_eval/loss_avg_len_512": 4.479653784418624, "train_eval/perplexity_len_512": 88.20412974467796}
|
| 12 |
+
{"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6778.730279813986, "train_eval/train_update_time": 4144.1211769738, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.3785154093765595, "train_eval/perplexity_len_2048": 79.71959454765936, "train_eval/loss_avg_len_1024": 4.3939165947328, "train_eval/perplexity_len_1024": 80.95687412946403, "train_eval/loss_avg_len_512": 4.429078622167507, "train_eval/perplexity_len_512": 83.85411997858606}
|
| 13 |
+
{"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7395.902606271033, "train_eval/train_update_time": 4489.238088100974, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.332845308472933, "train_eval/perplexity_len_2048": 76.16067919713348, "train_eval/loss_avg_len_1024": 4.354710251906472, "train_eval/perplexity_len_1024": 77.84426684093609, "train_eval/loss_avg_len_512": 4.39427122400477, "train_eval/perplexity_len_512": 80.98558889804535}
|
| 14 |
+
{"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7922.504303377005, "train_eval/train_update_time": 4834.2961148990435, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.286469663051848, "train_eval/perplexity_len_2048": 72.70932644602816, "train_eval/loss_avg_len_1024": 4.309569447513205, "train_eval/perplexity_len_1024": 74.40844530144317, "train_eval/loss_avg_len_512": 4.354222116721867, "train_eval/perplexity_len_512": 77.80627758807101}
|
| 15 |
+
{"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8540.947155133996, "train_eval/train_update_time": 5179.397799044964, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.249375205130373, "train_eval/perplexity_len_2048": 70.06162452534807, "train_eval/loss_avg_len_1024": 4.273874875287056, "train_eval/perplexity_len_1024": 71.7993106682297, "train_eval/loss_avg_len_512": 4.321713214736082, "train_eval/perplexity_len_512": 75.31755296428902}
|
| 16 |
+
{"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9067.837796505017, "train_eval/train_update_time": 5524.4823192786425, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.2222635463871026, "train_eval/perplexity_len_2048": 68.18765565818079, "train_eval/loss_avg_len_1024": 4.249677161750205, "train_eval/perplexity_len_1024": 70.08278329102363, "train_eval/loss_avg_len_512": 4.299936973010299, "train_eval/perplexity_len_512": 73.69514876983682}
|
| 17 |
+
{"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9685.574564223003, "train_eval/train_update_time": 5869.583213592763, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.195244031412512, "train_eval/perplexity_len_2048": 66.36992594802435, "train_eval/loss_avg_len_1024": 4.218284104051746, "train_eval/perplexity_len_1024": 67.9168460075776, "train_eval/loss_avg_len_512": 4.2707039155407625, "train_eval/perplexity_len_512": 71.57199853357183}
|
| 18 |
+
{"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 10212.780399380019, "train_eval/train_update_time": 6214.659892122727, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.188622776587208, "train_eval/perplexity_len_2048": 65.93192541236388, "train_eval/loss_avg_len_1024": 4.215066284977402, "train_eval/perplexity_len_1024": 67.69865312590402, "train_eval/loss_avg_len_512": 4.268878927308142, "train_eval/perplexity_len_512": 71.4414995941971}
|
| 19 |
+
{"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 10830.689001108985, "train_eval/train_update_time": 6559.779601458809, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.176184563983006, "train_eval/perplexity_len_2048": 65.11692916224894, "train_eval/loss_avg_len_1024": 4.205971465967996, "train_eval/perplexity_len_1024": 67.08573753155041, "train_eval/loss_avg_len_512": 4.259276238732308, "train_eval/perplexity_len_512": 70.75875247262296}
|
metrics/jsonlines/val.jsonl
CHANGED
|
@@ -1,49 +1,49 @@
|
|
| 1 |
-
{"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 142.
|
| 2 |
-
{"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time":
|
| 3 |
-
{"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 599.
|
| 4 |
-
{"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time":
|
| 5 |
-
{"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time":
|
| 6 |
-
{"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time":
|
| 7 |
-
{"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time":
|
| 8 |
-
{"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time":
|
| 9 |
-
{"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time":
|
| 10 |
-
{"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time":
|
| 11 |
-
{"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time":
|
| 12 |
-
{"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time":
|
| 13 |
-
{"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time":
|
| 14 |
-
{"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time":
|
| 15 |
-
{"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time":
|
| 16 |
-
{"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time":
|
| 17 |
-
{"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time":
|
| 18 |
-
{"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time":
|
| 19 |
-
{"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time":
|
| 20 |
-
{"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time":
|
| 21 |
-
{"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time":
|
| 22 |
-
{"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time":
|
| 23 |
-
{"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time":
|
| 24 |
-
{"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time":
|
| 25 |
-
{"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time":
|
| 26 |
-
{"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time":
|
| 27 |
-
{"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time":
|
| 28 |
-
{"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time":
|
| 29 |
-
{"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time":
|
| 30 |
-
{"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time":
|
| 31 |
-
{"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time":
|
| 32 |
-
{"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time":
|
| 33 |
-
{"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time":
|
| 34 |
-
{"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time":
|
| 35 |
-
{"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time":
|
| 36 |
-
{"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time":
|
| 37 |
-
{"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time":
|
| 38 |
-
{"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time":
|
| 39 |
-
{"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time":
|
| 40 |
-
{"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time":
|
| 41 |
-
{"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time":
|
| 42 |
-
{"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time":
|
| 43 |
-
{"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time":
|
| 44 |
-
{"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time":
|
| 45 |
-
{"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time":
|
| 46 |
-
{"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time":
|
| 47 |
-
{"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time":
|
| 48 |
-
{"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time":
|
| 49 |
-
{"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time":
|
|
|
|
| 1 |
+
{"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 142.2743778140284, "val/train_update_time": 141.94980711926473, "val/loss": 8.017322944736389, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.2959264080273, "val/val_tokens_per_second": 453619.5776419728, "val/loss_avg_len_2048": 8.017322944736389, "val/perplexity_len_2048": 3033.046820927388, "val/loss_avg_len_1024": 8.016116743054521, "val/perplexity_len_1024": 3029.3905602879668, "val/loss_avg_len_512": 8.016581874255465, "val/perplexity_len_512": 3030.799952108046}
|
| 2 |
+
{"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 370.84139602299547, "val/train_update_time": 279.98310620122356, "val/loss": 7.168872293418506, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.32955138100078, "val/val_tokens_per_second": 453450.71877125703, "val/loss_avg_len_2048": 7.168872293418506, "val/perplexity_len_2048": 1298.379585700498, "val/loss_avg_len_1024": 7.169298829473462, "val/perplexity_len_1024": 1298.933509532663, "val/loss_avg_len_512": 7.17260874950029, "val/perplexity_len_512": 1303.2399987050917}
|
| 3 |
+
{"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 599.4013552149991, "val/train_update_time": 417.94741392938886, "val/loss": 6.680456670384901, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.34830521600088, "val/val_tokens_per_second": 453356.5948146407, "val/loss_avg_len_2048": 6.680456670384901, "val/perplexity_len_2048": 796.6828504192507, "val/loss_avg_len_1024": 6.681968356456887, "val/perplexity_len_1024": 797.8880955346282, "val/loss_avg_len_512": 6.6880630861138926, "val/perplexity_len_512": 802.7658569931743}
|
| 4 |
+
{"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 827.9917874370003, "val/train_update_time": 555.9067850944703, "val/loss": 6.256492450360163, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.29714147699997, "val/val_tokens_per_second": 453613.47358302725, "val/loss_avg_len_2048": 6.256492450360163, "val/perplexity_len_2048": 521.3869384996046, "val/loss_avg_len_1024": 6.25937858268139, "val/perplexity_len_1024": 522.8939037992483, "val/loss_avg_len_512": 6.268213871597686, "val/perplexity_len_512": 527.5342919101196}
|
| 5 |
+
{"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 1056.5339453950291, "val/train_update_time": 693.8810286904918, "val/loss": 5.9596897887737725, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.28463066503173, "val/val_tokens_per_second": 453676.33115726174, "val/loss_avg_len_2048": 5.9596897887737725, "val/perplexity_len_2048": 387.48990187397294, "val/loss_avg_len_1024": 5.963750460020918, "val/perplexity_len_1024": 389.0665699760066, "val/loss_avg_len_512": 5.9747771193729715, "val/perplexity_len_512": 393.38041444619915}
|
| 6 |
+
{"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 1285.4777986719855, "val/train_update_time": 831.8405811304692, "val/loss": 5.729621500730747, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.29542479297379, "val/val_tokens_per_second": 453622.09761913924, "val/loss_avg_len_2048": 5.729621500730747, "val/perplexity_len_2048": 307.8527242916948, "val/loss_avg_len_1024": 5.73466736189276, "val/perplexity_len_1024": 309.4100320720618, "val/loss_avg_len_512": 5.747293829907757, "val/perplexity_len_512": 313.34155634560165}
|
| 7 |
+
{"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 1514.0213319549803, "val/train_update_time": 969.8139603384188, "val/loss": 5.54191019657671, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.56256327196024, "val/val_tokens_per_second": 452284.01803288993, "val/loss_avg_len_2048": 5.54191019657671, "val/perplexity_len_2048": 255.1649494383086, "val/loss_avg_len_1024": 5.5479404277496975, "val/perplexity_len_1024": 256.70830177953565, "val/loss_avg_len_512": 5.5618576472472405, "val/perplexity_len_512": 260.3059440825885}
|
| 8 |
+
{"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 1742.8283430220326, "val/train_update_time": 1107.798082921363, "val/loss": 5.395747513790498, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.40836303203832, "val/val_tokens_per_second": 453055.43233301176, "val/loss_avg_len_2048": 5.395747513790498, "val/perplexity_len_2048": 220.46688755473716, "val/loss_avg_len_1024": 5.40283216586914, "val/perplexity_len_1024": 222.03436470678773, "val/loss_avg_len_512": 5.417992734318786, "val/perplexity_len_512": 225.42617783355703}
|
| 9 |
+
{"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 1971.4918935780297, "val/train_update_time": 1245.7879514921806, "val/loss": 5.257520105597726, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.32240011100657, "val/val_tokens_per_second": 453486.62070162, "val/loss_avg_len_2048": 5.257520105597726, "val/perplexity_len_2048": 192.0047489041577, "val/loss_avg_len_1024": 5.265500482419599, "val/perplexity_len_1024": 193.54314949562067, "val/loss_avg_len_512": 5.282038998350409, "val/perplexity_len_512": 196.77068168657516}
|
| 10 |
+
{"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 2200.08279568702, "val/train_update_time": 1383.788816100161, "val/loss": 5.150704617314763, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.40961887600133, "val/val_tokens_per_second": 453049.1391206669, "val/loss_avg_len_2048": 5.150704617314763, "val/perplexity_len_2048": 172.55303134546992, "val/loss_avg_len_1024": 5.1593652144801805, "val/perplexity_len_1024": 174.0539336132167, "val/loss_avg_len_512": 5.177391785788723, "val/perplexity_len_512": 177.21998000419174}
|
| 11 |
+
{"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 2429.163681267004, "val/train_update_time": 1521.7770175782498, "val/loss": 5.0635993114376445, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.79174483998213, "val/val_tokens_per_second": 451142.3375791581, "val/loss_avg_len_2048": 5.0635993114376445, "val/perplexity_len_2048": 158.15875569300152, "val/loss_avg_len_1024": 5.0730407805304045, "val/perplexity_len_1024": 159.6590781757551, "val/loss_avg_len_512": 5.092240632939898, "val/perplexity_len_512": 162.75412606608745}
|
| 12 |
+
{"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 2658.232374906016, "val/train_update_time": 1659.7850940762437, "val/loss": 4.98549556239089, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 91.04536901297979, "val/val_tokens_per_second": 449885.5948857825, "val/loss_avg_len_2048": 4.98549556239089, "val/perplexity_len_2048": 146.2760459748089, "val/loss_avg_len_1024": 4.995756369349081, "val/perplexity_len_1024": 147.78468292514813, "val/loss_avg_len_512": 5.016161771441624, "val/perplexity_len_512": 150.8312664737655}
|
| 13 |
+
{"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 2887.563738073979, "val/train_update_time": 1797.8180341873667, "val/loss": 4.916477123672562, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.70920936204493, "val/val_tokens_per_second": 451552.82785585296, "val/loss_avg_len_2048": 4.916477123672562, "val/perplexity_len_2048": 136.5208190724984, "val/loss_avg_len_1024": 4.927128413101426, "val/perplexity_len_1024": 137.98271353908035, "val/loss_avg_len_512": 4.948208645739966, "val/perplexity_len_512": 140.92229592495846}
|
| 14 |
+
{"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 3116.5153146539815, "val/train_update_time": 1935.8127364134416, "val/loss": 4.863091215804801, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.4004945270135, "val/val_tokens_per_second": 453094.86650828354, "val/loss_avg_len_2048": 4.863091215804801, "val/perplexity_len_2048": 129.42366084863215, "val/loss_avg_len_1024": 4.874493102245079, "val/perplexity_len_1024": 130.9077795303594, "val/loss_avg_len_512": 4.896728463353682, "val/perplexity_len_512": 133.85116361495074}
|
| 15 |
+
{"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 3345.161585650989, "val/train_update_time": 2073.822410382272, "val/loss": 4.811523659892753, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.22876474499935, "val/val_tokens_per_second": 453957.2287813026, "val/loss_avg_len_2048": 4.811523659892753, "val/perplexity_len_2048": 122.91876129597873, "val/loss_avg_len_1024": 4.8232065941833895, "val/perplexity_len_1024": 124.36323452041226, "val/loss_avg_len_512": 4.846166890252475, "val/perplexity_len_512": 127.2516841422241}
|
| 16 |
+
{"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 3574.040140778001, "val/train_update_time": 2211.8163213434746, "val/loss": 4.760587245357363, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.51972289703554, "val/val_tokens_per_second": 452498.07101808325, "val/loss_avg_len_2048": 4.760587245357363, "val/perplexity_len_2048": 116.8145045362375, "val/loss_avg_len_1024": 4.77283736684951, "val/perplexity_len_1024": 118.25429722128945, "val/loss_avg_len_512": 4.796683278769628, "val/perplexity_len_512": 121.10806894510807}
|
| 17 |
+
{"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 3802.836212512979, "val/train_update_time": 2349.8270930235158, "val/loss": 4.719228506370658, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.55211566400249, "val/val_tokens_per_second": 452336.2010887061, "val/loss_avg_len_2048": 4.719228506370658, "val/perplexity_len_2048": 112.08174894828639, "val/loss_avg_len_1024": 4.73204894817751, "val/perplexity_len_1024": 113.52793706523403, "val/loss_avg_len_512": 4.756577379063424, "val/perplexity_len_512": 116.3470318696743}
|
| 18 |
+
{"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 4031.6405439740047, "val/train_update_time": 2487.827474080492, "val/loss": 4.676367494543736, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.48990405898076, "val/val_tokens_per_second": 452647.18120711594, "val/loss_avg_len_2048": 4.676367494543736, "val/perplexity_len_2048": 107.37930735283909, "val/loss_avg_len_1024": 4.689829182334012, "val/perplexity_len_1024": 108.8345873493113, "val/loss_avg_len_512": 4.7154578478252525, "val/perplexity_len_512": 111.65992272494637}
|
| 19 |
+
{"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 4260.37410283502, "val/train_update_time": 2625.818530491437, "val/loss": 4.640193889026716, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.78033580299234, "val/val_tokens_per_second": 451199.03597723704, "val/loss_avg_len_2048": 4.640193889026716, "val/perplexity_len_2048": 103.56442564245071, "val/loss_avg_len_1024": 4.654089609145093, "val/perplexity_len_1024": 105.01357307089665, "val/loss_avg_len_512": 4.680419558078237, "val/perplexity_len_512": 107.81529786259468}
|
| 20 |
+
{"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 4489.443470049009, "val/train_update_time": 2763.8374379616, "val/loss": 4.608071265847772, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.74466375104384, "val/val_tokens_per_second": 451376.40393238916, "val/loss_avg_len_2048": 4.608071265847772, "val/perplexity_len_2048": 100.29052920641857, "val/loss_avg_len_1024": 4.622682944629249, "val/perplexity_len_1024": 101.76670061160816, "val/loss_avg_len_512": 4.650229472655617, "val/perplexity_len_512": 104.60898772530528}
|
| 21 |
+
{"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 4718.872143707995, "val/train_update_time": 2901.8397621414624, "val/loss": 4.577349349257373, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.85865825600922, "val/val_tokens_per_second": 450810.09103819757, "val/loss_avg_len_2048": 4.577349349257373, "val/perplexity_len_2048": 97.25625986875657, "val/loss_avg_len_1024": 4.592617217212357, "val/perplexity_len_1024": 98.75254910922418, "val/loss_avg_len_512": 4.621059415361099, "val/perplexity_len_512": 101.60161344282939}
|
| 22 |
+
{"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 4947.998132903012, "val/train_update_time": 3039.859961154347, "val/loss": 4.549797477854183, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.51020548597444, "val/val_tokens_per_second": 452545.65250486817, "val/loss_avg_len_2048": 4.549797477854183, "val/perplexity_len_2048": 94.61324509708179, "val/loss_avg_len_1024": 4.565505847024965, "val/perplexity_len_1024": 96.11119928630141, "val/loss_avg_len_512": 4.594841379802302, "val/perplexity_len_512": 98.97243527526068}
|
| 23 |
+
{"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 5176.88071361999, "val/train_update_time": 3177.899590641551, "val/loss": 4.5204342533537885, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.75762628903612, "val/val_tokens_per_second": 451311.9356995361, "val/loss_avg_len_2048": 4.5204342533537885, "val/perplexity_len_2048": 91.87548655482323, "val/loss_avg_len_1024": 4.536794685186399, "val/perplexity_len_1024": 93.39097238779135, "val/loss_avg_len_512": 4.567255290885735, "val/perplexity_len_512": 96.27948759639831}
|
| 24 |
+
{"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 5405.913057841011, "val/train_update_time": 3315.925435980549, "val/loss": 4.492667575135734, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.73625983402599, "val/val_tokens_per_second": 451418.2100399961, "val/loss_avg_len_2048": 4.492667575135734, "val/perplexity_len_2048": 89.35950140608207, "val/loss_avg_len_1024": 4.509617001681402, "val/perplexity_len_1024": 90.88700227462165, "val/loss_avg_len_512": 4.5411422349753785, "val/perplexity_len_512": 93.7978781707472}
|
| 25 |
+
{"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 5634.922143466014, "val/train_update_time": 3453.9653959476273, "val/loss": 4.4686831552135295, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.4348212070181, "val/val_tokens_per_second": 452922.88361179776, "val/loss_avg_len_2048": 4.4686831552135295, "val/perplexity_len_2048": 87.24176347672332, "val/loss_avg_len_1024": 4.486252108311607, "val/perplexity_len_1024": 88.78805350190629, "val/loss_avg_len_512": 4.51881691169506, "val/perplexity_len_512": 91.72701260192157}
|
| 26 |
+
{"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 5864.05088400899, "val/train_update_time": 3591.9979424396297, "val/loss": 4.4460520937834875, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.54608423600439, "val/val_tokens_per_second": 452366.33196902875, "val/loss_avg_len_2048": 4.4460520937834875, "val/perplexity_len_2048": 85.28956326961926, "val/loss_avg_len_1024": 4.464425405966584, "val/perplexity_len_1024": 86.87109958090244, "val/loss_avg_len_512": 4.498216118935217, "val/perplexity_len_512": 89.85669458704189}
|
| 27 |
+
{"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 6092.851395011006, "val/train_update_time": 3730.020544492756, "val/loss": 4.420050854198402, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.32853831601096, "val/val_tokens_per_second": 453455.8043738402, "val/loss_avg_len_2048": 4.420050854198402, "val/perplexity_len_2048": 83.10051126077735, "val/loss_avg_len_1024": 4.439237378784268, "val/perplexity_len_1024": 84.71031515065614, "val/loss_avg_len_512": 4.474183511526417, "val/perplexity_len_512": 87.7229463868}
|
| 28 |
+
{"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 6321.474026971031, "val/train_update_time": 3868.0566598027945, "val/loss": 4.398202258219151, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.34568081604084, "val/val_tokens_per_second": 453369.7641108214, "val/loss_avg_len_2048": 4.398202258219151, "val/perplexity_len_2048": 81.30457257597935, "val/loss_avg_len_1024": 4.418109852228035, "val/perplexity_len_1024": 82.93936944356574, "val/loss_avg_len_512": 4.454481553460378, "val/perplexity_len_512": 86.01154689490046}
|
| 29 |
+
{"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 6550.097681444022, "val/train_update_time": 4006.072604118788, "val/loss": 4.376139390771115, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.32946746400557, "val/val_tokens_per_second": 453451.1400315928, "val/loss_avg_len_2048": 4.376139390771115, "val/perplexity_len_2048": 79.53040415674566, "val/loss_avg_len_1024": 4.396944615813718, "val/perplexity_len_1024": 81.2023847690807, "val/loss_avg_len_512": 4.435031853418239, "val/perplexity_len_512": 84.35481183459801}
|
| 30 |
+
{"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 6778.730279813986, "val/train_update_time": 4144.1211769738, "val/loss": 4.355563867319981, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.37096956197638, "val/val_tokens_per_second": 453242.89645813353, "val/loss_avg_len_2048": 4.355563867319981, "val/perplexity_len_2048": 77.9107442760098, "val/loss_avg_len_1024": 4.377231672070688, "val/perplexity_len_1024": 79.61732119023712, "val/loss_avg_len_512": 4.4167310255174534, "val/perplexity_len_512": 82.82508923002905}
|
| 31 |
+
{"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 7007.788445023994, "val/train_update_time": 4282.135617908789, "val/loss": 4.335396474500792, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.33323211199604, "val/val_tokens_per_second": 453432.2424024127, "val/loss_avg_len_2048": 4.335396474500792, "val/perplexity_len_2048": 76.35522578937196, "val/loss_avg_len_1024": 4.358097750052391, "val/perplexity_len_1024": 78.10841129235861, "val/loss_avg_len_512": 4.399293633644097, "val/perplexity_len_512": 81.3933548269956}
|
| 32 |
+
{"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 7236.399417778011, "val/train_update_time": 4420.153171113925, "val/loss": 4.31625511668981, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.29227029101457, "val/val_tokens_per_second": 453637.94561798865, "val/loss_avg_len_2048": 4.31625511668981, "val/perplexity_len_2048": 74.90758222363428, "val/loss_avg_len_1024": 4.339928405715012, "val/perplexity_len_1024": 76.70204771345033, "val/loss_avg_len_512": 4.382818944332655, "val/perplexity_len_512": 80.06340988951868}
|
| 33 |
+
{"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 7465.047058130032, "val/train_update_time": 4558.262438459904, "val/loss": 4.2996819401991555, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.42316870903596, "val/val_tokens_per_second": 452981.25010196504, "val/loss_avg_len_2048": 4.2996819401991555, "val/perplexity_len_2048": 73.67635648530485, "val/loss_avg_len_1024": 4.324086923091021, "val/perplexity_len_1024": 75.49654722507492, "val/loss_avg_len_512": 4.368286226595659, "val/perplexity_len_512": 78.90828483582929}
|
| 34 |
+
{"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 7693.75585325103, "val/train_update_time": 4696.292382065032, "val/loss": 4.283236857734924, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.48378002701793, "val/val_tokens_per_second": 452677.8168172194, "val/loss_avg_len_2048": 4.283236857734924, "val/perplexity_len_2048": 72.47465088349, "val/loss_avg_len_1024": 4.308458720062673, "val/perplexity_len_1024": 74.32584368113129, "val/loss_avg_len_512": 4.3540999811033725, "val/perplexity_len_512": 77.7967752505338}
|
| 35 |
+
{"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 7922.504303377005, "val/train_update_time": 4834.2961148990435, "val/loss": 4.267898958559893, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.49312097602524, "val/val_tokens_per_second": 452631.0901670827, "val/loss_avg_len_2048": 4.267898958559893, "val/perplexity_len_2048": 71.371523450084, "val/loss_avg_len_1024": 4.29405301307696, "val/perplexity_len_1024": 73.26280266811683, "val/loss_avg_len_512": 4.3412228272167965, "val/perplexity_len_512": 76.80139677915308}
|
| 36 |
+
{"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 8151.724266513018, "val/train_update_time": 4972.329616118164, "val/loss": 4.25385695036254, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.50966985698324, "val/val_tokens_per_second": 452548.33063386485, "val/loss_avg_len_2048": 4.25385695036254, "val/perplexity_len_2048": 70.3763275596729, "val/loss_avg_len_1024": 4.280959435730102, "val/perplexity_len_1024": 72.30978332653642, "val/loss_avg_len_512": 4.3297005797375, "val/perplexity_len_512": 75.92155071492496}
|
| 37 |
+
{"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 8380.818926771986, "val/train_update_time": 5110.374592272972, "val/loss": 4.241492192271679, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.75462737603812, "val/val_tokens_per_second": 451326.8489361309, "val/loss_avg_len_2048": 4.241492192271679, "val/perplexity_len_2048": 69.51149901038498, "val/loss_avg_len_1024": 4.26916799461185, "val/perplexity_len_1024": 71.46215398096777, "val/loss_avg_len_512": 4.318984106020722, "val/perplexity_len_512": 75.11228340295357}
|
| 38 |
+
{"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 8610.333553528006, "val/train_update_time": 5248.407137244998, "val/loss": 4.230574601543066, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.34580848604674, "val/val_tokens_per_second": 453369.1234422456, "val/loss_avg_len_2048": 4.230574601543066, "val/perplexity_len_2048": 68.75672854774076, "val/loss_avg_len_1024": 4.259278581639892, "val/perplexity_len_1024": 70.75891825403492, "val/loss_avg_len_512": 4.310739070640505, "val/perplexity_len_512": 74.49552605583897}
|
| 39 |
+
{"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 8839.131734684983, "val/train_update_time": 5386.445536848798, "val/loss": 4.220437906114012, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.38471865397878, "val/val_tokens_per_second": 453173.9503090982, "val/loss_avg_len_2048": 4.220437906114012, "val/perplexity_len_2048": 68.06328309221031, "val/loss_avg_len_1024": 4.2494930887183635, "val/perplexity_len_1024": 70.0698841278538, "val/loss_avg_len_512": 4.301655115112848, "val/perplexity_len_512": 73.82187634450761}
|
| 40 |
+
{"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 9067.837796505017, "val/train_update_time": 5524.4823192786425, "val/loss": 4.212082446529414, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.44228035700507, "val/val_tokens_per_second": 452885.5291830056, "val/loss_avg_len_2048": 4.212082446529414, "val/perplexity_len_2048": 67.49695235274098, "val/loss_avg_len_1024": 4.241754451114219, "val/perplexity_len_1024": 69.52973140635773, "val/loss_avg_len_512": 4.295026430321672, "val/perplexity_len_512": 73.33415266465448}
|
| 41 |
+
{"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 9296.993160478014, "val/train_update_time": 5662.513914024632, "val/loss": 4.20452369724242, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.35833089798689, "val/val_tokens_per_second": 453306.29276721796, "val/loss_avg_len_2048": 4.20452369724242, "val/perplexity_len_2048": 66.9886831719058, "val/loss_avg_len_1024": 4.23449458040651, "val/perplexity_len_1024": 69.02678242730825, "val/loss_avg_len_512": 4.288312720157765, "val/perplexity_len_512": 72.84345745438628}
|
| 42 |
+
{"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 9525.670015413023, "val/train_update_time": 5800.553074025724, "val/loss": 4.198343608177501, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.72886659303913, "val/val_tokens_per_second": 451454.99484441394, "val/loss_avg_len_2048": 4.198343608177501, "val/perplexity_len_2048": 66.57596377846592, "val/loss_avg_len_1024": 4.228689314186201, "val/perplexity_len_1024": 68.62722447126744, "val/loss_avg_len_512": 4.283119718784839, "val/perplexity_len_512": 72.46616177619002}
|
| 43 |
+
{"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 9754.735574804014, "val/train_update_time": 5938.594816011784, "val/loss": 4.193367760937754, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.7492567790323, "val/val_tokens_per_second": 451353.5587375064, "val/loss_avg_len_2048": 4.193367760937754, "val/perplexity_len_2048": 66.24551476656741, "val/loss_avg_len_1024": 4.224034787101439, "val/perplexity_len_1024": 68.30853943562786, "val/loss_avg_len_512": 4.2790641182546505, "val/perplexity_len_512": 72.17286312516411}
|
| 44 |
+
{"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 9983.788254795014, "val/train_update_time": 6076.624669670709, "val/loss": 4.189421963141696, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.6789614429581, "val/val_tokens_per_second": 451703.4530194308, "val/loss_avg_len_2048": 4.189421963141696, "val/perplexity_len_2048": 65.98463838160738, "val/loss_avg_len_1024": 4.22020412462051, "val/perplexity_len_1024": 68.04737301604953, "val/loss_avg_len_512": 4.2755115066579545, "val/perplexity_len_512": 71.9169158844202}
|
| 45 |
+
{"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 10212.780399380019, "val/train_update_time": 6214.659892122727, "val/loss": 4.186491362652555, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.99573383899406, "val/val_tokens_per_second": 450130.99265152105, "val/loss_avg_len_2048": 4.186491362652555, "val/perplexity_len_2048": 65.79154684336461, "val/loss_avg_len_1024": 4.217514740810637, "val/perplexity_len_1024": 67.86461337831629, "val/loss_avg_len_512": 4.273214724269416, "val/perplexity_len_512": 71.75192792183111}
|
| 46 |
+
{"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 10442.503988512035, "val/train_update_time": 6352.692734938697, "val/loss": 4.184350719433324, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.35145655297674, "val/val_tokens_per_second": 453340.7823479136, "val/loss_avg_len_2048": 4.184350719433324, "val/perplexity_len_2048": 65.65086124728785, "val/loss_avg_len_1024": 4.215543735674023, "val/perplexity_len_1024": 67.73098361249181, "val/loss_avg_len_512": 4.271550920667592, "val/perplexity_len_512": 71.63264606402603}
|
| 47 |
+
{"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 10671.172117165988, "val/train_update_time": 6490.743322437804, "val/loss": 4.183023557277815, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.35044766304782, "val/val_tokens_per_second": 453345.84453588846, "val/loss_avg_len_2048": 4.183023557277815, "val/perplexity_len_2048": 65.56378970057509, "val/loss_avg_len_1024": 4.214286808201578, "val/perplexity_len_1024": 67.64590415900594, "val/loss_avg_len_512": 4.270415190260951, "val/perplexity_len_512": 71.5513368711842}
|
| 48 |
+
{"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 10899.877774502034, "val/train_update_time": 6628.827098570764, "val/loss": 4.18224084139755, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.3038979700068, "val/val_tokens_per_second": 453579.5344471653, "val/loss_avg_len_2048": 4.18224084139755, "val/perplexity_len_2048": 65.51249195960324, "val/loss_avg_len_1024": 4.213500716115302, "val/perplexity_len_1024": 67.59274914418302, "val/loss_avg_len_512": 4.269666781060863, "val/perplexity_len_512": 71.4978072259293}
|
| 49 |
+
{"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 11128.507790005999, "val/train_update_time": 6766.880580478697, "val/loss": 4.1819093990348515, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.11226414598059, "val/val_tokens_per_second": 454544.1221368645, "val/loss_avg_len_2048": 4.1819093990348515, "val/perplexity_len_2048": 65.49078194249032, "val/loss_avg_len_1024": 4.213224817466876, "val/perplexity_len_1024": 67.57410296839636, "val/loss_avg_len_512": 4.269427753666788, "val/perplexity_len_512": 71.48071933370453}
|
metrics/npz/train_eval/step-000000104857600.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9689eac5628c24835fc067a08c12aec111f39bde034cf3fd21c7668771f4d7db
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000209715200.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9ec2247a03177429e55dd2b9ceda3d231cc2e0960b9395ad6b32fc0d519d6741
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000314572800.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:91485872829a42b446fc33636ca81aaf924457b2225ad8734e1a014600c82608
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000419430400.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:084a127f64a422e079a1e1cb823e16982b71cfde6efe327926ff8443a7d1103f
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000524288000.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:093682b0a72ed9596ee5ad8f1d9c072dc8bfd1109b841178fffc0079f6da747d
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000629145600.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:55cc048b994dd7b6f840628704ea5078a41b109fc5b316dd01e86db1354d68c1
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000734003200.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eb0bd6cd6832f911c8cb316614d97a5a78e3ccf57f81bb8661eedae16a00d3a5
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000838860800.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3beb5c9d3934a6b0b781e5f1379dde43a64d949bb7c2190f5e290f31560510ee
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000943718400.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:86053e422f63b1997d274b16a3f83ac7569fa79aa471be938b2586c36625db40
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001048576000.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:af2d29d0306bec224d18de5a45f0464691fd99709d9f57efcb60b83827a2a070
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001153433600.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:32694215f459813a770d1887234917ad61c859451ab2759652a5239be3709c55
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001258291200.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:43801118158369cc7393c60089d61f2bfcebf85ed25ad1584c6f61643f972761
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001363148800.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:09815e89a3add650dfbb04ce74b556c9a72f6fa9f26a0a8ee7c66e5f57f16ab3
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001468006400.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b329cd54e4297337c8d58c891196e499b4dd37f393f24917c51974a20ddeda4c
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001572864000.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6d8780b119443b7964d4fbea7e1aeb86d8b6e3042dbd5d9897f0aeed4a3904db
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001677721600.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:93492038097cdd944462fc5be8ff53d759259b491d011c3186149fb4733afcc5
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001782579200.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:85b31700063c6174b8eba53f2a2f4092996bed090528645ca7f551df1d469b5d
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001887436800.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c088500cbce0377fa6b3e8ae84045fad4d537c103ad6b9f68f22caec46d05b50
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001992294400.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e567eae3a2690aad2c2995b320aa57745942585bb3dfffd8b8b12597b5d42990
|
| 3 |
size 20540
|
metrics/npz/val/step-000000041943040.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:265ebd6b9bde03fad19b9aae2da0ee92c37b0e2b3dbc1950a6c290ebb70dfbe6
|
| 3 |
size 21142
|
metrics/npz/val/step-000000083886080.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9039a3eed82c460d492611be8fcc27793c9fbe43961bbb8927c70bd1cba92fcd
|
| 3 |
size 21142
|
metrics/npz/val/step-000000125829120.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1f70311d41a03c97e86ff8121f0cc6cd7718d0f5d2bf971e20c9c948788c426a
|
| 3 |
size 21142
|
metrics/npz/val/step-000000167772160.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:45e91b34732c55786979df2ad3607a299a35b28500234a086cb9872a03256017
|
| 3 |
size 21142
|
metrics/npz/val/step-000000209715200.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:97d504910bf1a7e842b7815c694e883f77922067729b8db97ce8c12fa8eb1df3
|
| 3 |
size 21142
|
metrics/npz/val/step-000000251658240.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:099974346431907422f6336139eae6e7c28fe7d2a6d239fac706344d308f83da
|
| 3 |
size 21142
|
metrics/npz/val/step-000000293601280.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c443f47445e603996baef79dc33f7a8ba7ddc97ef5768399e25434f444d38842
|
| 3 |
size 21142
|
metrics/npz/val/step-000000335544320.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:efb50ab44c317467b2371c4780ae8dd68fbebecee528f2962ef91d0a00261816
|
| 3 |
size 21142
|
metrics/npz/val/step-000000377487360.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eb0bb42606b7b156ace7a718b7e9f7e33c9dd0325c36727a0cc691668c841e34
|
| 3 |
size 21142
|
metrics/npz/val/step-000000419430400.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dc6d937e8cb1de87260bdd38940718e7a857f78649c680ec4c5c5f6ec33a611e
|
| 3 |
size 21142
|
metrics/npz/val/step-000000461373440.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:61a56135666378ed61b59377e19614bf2f2be90cd92ab879fc59ef1293db99b6
|
| 3 |
size 21142
|
metrics/npz/val/step-000000503316480.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:015ceb1f82664e7252f6aba9eea58abae89d0e18a3e89d6f3f397eb093f65078
|
| 3 |
size 21142
|
metrics/npz/val/step-000000545259520.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:29fe2be6d0a8e489c3dce27cfdd8321de5f7ec273b58ebc6a01e99c06b5b593c
|
| 3 |
size 21142
|
metrics/npz/val/step-000000587202560.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f03ae34480069799b507808a22e85ca3fb0927d25e49edcf2f3ea7547f200c0e
|
| 3 |
size 21142
|
metrics/npz/val/step-000000629145600.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:91b74904fd335c0978272b1f7c08464c1ce99cd005f7fac9f67794f1406de39c
|
| 3 |
size 21142
|
metrics/npz/val/step-000000671088640.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:25e23b258f6f1656e3498984ef6a47393958cc5cc5f2b28bc1930464dd1fea41
|
| 3 |
size 21142
|