add remote code + model files
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- __pycache__/__init__.cpython-310.pyc +0 -0
- __pycache__/configuration_transformer.cpython-310.pyc +0 -0
- __pycache__/modeling_transformer.cpython-310.pyc +0 -0
- checkpoints/step-000000209715200.pt +1 -1
- checkpoints/step-000000419430400.pt +1 -1
- checkpoints/step-000000629145600.pt +1 -1
- checkpoints/step-000000838860800.pt +1 -1
- checkpoints/step-000001048576000.pt +1 -1
- checkpoints/step-000001258291200.pt +1 -1
- checkpoints/step-000001468006400.pt +1 -1
- checkpoints/step-000001677721600.pt +1 -1
- checkpoints/step-000001887436800.pt +1 -1
- logs/2025-10-11_18-22-13.log +338 -0
- metrics/jsonlines/checkpoint.jsonl +9 -10
- metrics/jsonlines/norm.jsonl +0 -0
- metrics/jsonlines/throughput.jsonl +0 -0
- metrics/jsonlines/train.jsonl +98 -100
- metrics/jsonlines/train_data_info.jsonl +1 -1
- metrics/jsonlines/train_eval.jsonl +19 -20
- metrics/jsonlines/val.jsonl +49 -50
- metrics/npz/train_eval/step-000000104857600.npz +1 -1
- metrics/npz/train_eval/step-000000209715200.npz +1 -1
- metrics/npz/train_eval/step-000000314572800.npz +1 -1
- metrics/npz/train_eval/step-000000419430400.npz +1 -1
- metrics/npz/train_eval/step-000000524288000.npz +1 -1
- metrics/npz/train_eval/step-000000629145600.npz +1 -1
- metrics/npz/train_eval/step-000000734003200.npz +1 -1
- metrics/npz/train_eval/step-000000838860800.npz +1 -1
- metrics/npz/train_eval/step-000000943718400.npz +1 -1
- metrics/npz/train_eval/step-000001048576000.npz +1 -1
- metrics/npz/train_eval/step-000001153433600.npz +1 -1
- metrics/npz/train_eval/step-000001258291200.npz +1 -1
- metrics/npz/train_eval/step-000001363148800.npz +1 -1
- metrics/npz/train_eval/step-000001468006400.npz +1 -1
- metrics/npz/train_eval/step-000001572864000.npz +1 -1
- metrics/npz/train_eval/step-000001677721600.npz +1 -1
- metrics/npz/train_eval/step-000001782579200.npz +1 -1
- metrics/npz/train_eval/step-000001887436800.npz +1 -1
- metrics/npz/train_eval/step-000001992294400.npz +1 -1
- metrics/npz/val/step-000000041943040.npz +1 -1
- metrics/npz/val/step-000000083886080.npz +1 -1
- metrics/npz/val/step-000000125829120.npz +1 -1
- metrics/npz/val/step-000000167772160.npz +1 -1
- metrics/npz/val/step-000000209715200.npz +1 -1
- metrics/npz/val/step-000000251658240.npz +1 -1
- metrics/npz/val/step-000000293601280.npz +1 -1
- metrics/npz/val/step-000000335544320.npz +1 -1
- metrics/npz/val/step-000000377487360.npz +1 -1
- metrics/npz/val/step-000000419430400.npz +1 -1
- metrics/npz/val/step-000000461373440.npz +1 -1
__pycache__/__init__.cpython-310.pyc
CHANGED
|
Binary files a/__pycache__/__init__.cpython-310.pyc and b/__pycache__/__init__.cpython-310.pyc differ
|
|
|
__pycache__/configuration_transformer.cpython-310.pyc
CHANGED
|
Binary files a/__pycache__/configuration_transformer.cpython-310.pyc and b/__pycache__/configuration_transformer.cpython-310.pyc differ
|
|
|
__pycache__/modeling_transformer.cpython-310.pyc
CHANGED
|
Binary files a/__pycache__/modeling_transformer.cpython-310.pyc and b/__pycache__/modeling_transformer.cpython-310.pyc differ
|
|
|
checkpoints/step-000000209715200.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 329410498
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ea89f778ededec7472baf7a219f883e92333236c66d9d677d49fa81988b0f7e5
|
| 3 |
size 329410498
|
checkpoints/step-000000419430400.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 329410498
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ef8110a29b497f69da13e35deda4a0d785a211d9d09c3a5bbfc331b305c2ec3f
|
| 3 |
size 329410498
|
checkpoints/step-000000629145600.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 329410498
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:22af72d4331025e64e17433768f06c9ef0b9ba2cdcd5d62a8aab9153821f5cef
|
| 3 |
size 329410498
|
checkpoints/step-000000838860800.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 329410498
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a5e9805446e6f290e9b32a69d915b404f3d389d50462e72da672d27fa6cfe2b4
|
| 3 |
size 329410498
|
checkpoints/step-000001048576000.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 329410498
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:106cebd91c20db5c04370b9101006b7e42d23ed67530eac965dc38d2d0069a2f
|
| 3 |
size 329410498
|
checkpoints/step-000001258291200.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 329410498
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:451c2d44639298ad3d2c9c793c1a90836aadb59256aed9533586b850631444ed
|
| 3 |
size 329410498
|
checkpoints/step-000001468006400.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 329410498
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:58e4060950655caa8e692850b5b49c660edf35ba56ce1740eb931be72b419547
|
| 3 |
size 329410498
|
checkpoints/step-000001677721600.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 329410498
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4bce41c761242437b5479b99e0b37de14a3e5e1d8e7e3fe68b68c9920e2214c1
|
| 3 |
size 329410498
|
checkpoints/step-000001887436800.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 329410498
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1b02ab5cbc5d45f4144e1f6fbe4a8b86e8616b9d02d30cfb1c7652aa28bca9aa
|
| 3 |
size 329410498
|
logs/2025-10-11_18-22-13.log
ADDED
|
@@ -0,0 +1,338 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2025-10-11 18:22:13][train:372][INFO] All outputs will be saved to `/workspace/forgetting-transformer/transformer_2_4_256`
|
| 2 |
+
[2025-10-11 18:22:13][train:375][INFO] Configuration:
|
| 3 |
+
[2025-10-11 18:22:13][train:380][INFO] Configuration saved to /workspace/forgetting-transformer/transformer_2_4_256/config.yaml.
|
| 4 |
+
[2025-10-11 18:22:13][train:387][INFO] creating datamodule
|
| 5 |
+
[2025-10-11 18:22:13][train:419][INFO] creating model
|
| 6 |
+
[2025-10-11 18:22:14][train:440][INFO] creating optimizer
|
| 7 |
+
[2025-10-11 18:22:14][checkpoint:39][INFO] Not resuming. Deleting existing checkpoints...
|
| 8 |
+
[2025-10-11 18:22:14][logger:256][INFO] Setting up wandb logger...
|
| 9 |
+
[2025-10-11 18:22:14][logger:272][INFO] Not resuming. Creating a new wandb run.
|
| 10 |
+
[2025-10-11 18:22:15][logger:288][INFO] wandb initialized. Run id: 7pqyfnns
|
| 11 |
+
[2025-10-11 18:22:15][logger:186][INFO] Setting up jsonlines logger...
|
| 12 |
+
[2025-10-11 18:22:15][logger:199][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/jsonlines/resume.jsonl since we are not resuming
|
| 13 |
+
[2025-10-11 18:22:15][logger:199][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/jsonlines/train_data_info.jsonl since we are not resuming
|
| 14 |
+
[2025-10-11 18:22:15][logger:199][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/jsonlines/val_data_info.jsonl since we are not resuming
|
| 15 |
+
[2025-10-11 18:22:15][logger:199][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/jsonlines/model_info.jsonl since we are not resuming
|
| 16 |
+
[2025-10-11 18:22:15][logger:199][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/jsonlines/train.jsonl since we are not resuming
|
| 17 |
+
[2025-10-11 18:22:15][logger:199][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/jsonlines/throughput.jsonl since we are not resuming
|
| 18 |
+
[2025-10-11 18:22:15][logger:199][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/jsonlines/norm.jsonl since we are not resuming
|
| 19 |
+
[2025-10-11 18:22:15][logger:199][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/jsonlines/val.jsonl since we are not resuming
|
| 20 |
+
[2025-10-11 18:22:15][logger:199][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/jsonlines/train_eval.jsonl since we are not resuming
|
| 21 |
+
[2025-10-11 18:22:15][logger:199][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/jsonlines/checkpoint.jsonl since we are not resuming
|
| 22 |
+
[2025-10-11 18:22:15][logger:113][INFO] Setting up npz logger...
|
| 23 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000041943040.npz since we are not resuming
|
| 24 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000083886080.npz since we are not resuming
|
| 25 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000125829120.npz since we are not resuming
|
| 26 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000167772160.npz since we are not resuming
|
| 27 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000209715200.npz since we are not resuming
|
| 28 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000251658240.npz since we are not resuming
|
| 29 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000293601280.npz since we are not resuming
|
| 30 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000335544320.npz since we are not resuming
|
| 31 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000377487360.npz since we are not resuming
|
| 32 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000419430400.npz since we are not resuming
|
| 33 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000461373440.npz since we are not resuming
|
| 34 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000503316480.npz since we are not resuming
|
| 35 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000545259520.npz since we are not resuming
|
| 36 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000587202560.npz since we are not resuming
|
| 37 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000629145600.npz since we are not resuming
|
| 38 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000671088640.npz since we are not resuming
|
| 39 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000713031680.npz since we are not resuming
|
| 40 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000754974720.npz since we are not resuming
|
| 41 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000796917760.npz since we are not resuming
|
| 42 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000838860800.npz since we are not resuming
|
| 43 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000880803840.npz since we are not resuming
|
| 44 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000922746880.npz since we are not resuming
|
| 45 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000964689920.npz since we are not resuming
|
| 46 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001006632960.npz since we are not resuming
|
| 47 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001048576000.npz since we are not resuming
|
| 48 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001090519040.npz since we are not resuming
|
| 49 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001132462080.npz since we are not resuming
|
| 50 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001174405120.npz since we are not resuming
|
| 51 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001216348160.npz since we are not resuming
|
| 52 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001258291200.npz since we are not resuming
|
| 53 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001300234240.npz since we are not resuming
|
| 54 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001342177280.npz since we are not resuming
|
| 55 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001384120320.npz since we are not resuming
|
| 56 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001426063360.npz since we are not resuming
|
| 57 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001468006400.npz since we are not resuming
|
| 58 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001509949440.npz since we are not resuming
|
| 59 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001551892480.npz since we are not resuming
|
| 60 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001593835520.npz since we are not resuming
|
| 61 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001635778560.npz since we are not resuming
|
| 62 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001677721600.npz since we are not resuming
|
| 63 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001719664640.npz since we are not resuming
|
| 64 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001761607680.npz since we are not resuming
|
| 65 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001803550720.npz since we are not resuming
|
| 66 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001845493760.npz since we are not resuming
|
| 67 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001887436800.npz since we are not resuming
|
| 68 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001929379840.npz since we are not resuming
|
| 69 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001971322880.npz since we are not resuming
|
| 70 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000002013265920.npz since we are not resuming
|
| 71 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000002055208960.npz since we are not resuming
|
| 72 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000002097152000.npz since we are not resuming
|
| 73 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000000104857600.npz since we are not resuming
|
| 74 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000000209715200.npz since we are not resuming
|
| 75 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000000314572800.npz since we are not resuming
|
| 76 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000000419430400.npz since we are not resuming
|
| 77 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000000524288000.npz since we are not resuming
|
| 78 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000000629145600.npz since we are not resuming
|
| 79 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000000734003200.npz since we are not resuming
|
| 80 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000000838860800.npz since we are not resuming
|
| 81 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000000943718400.npz since we are not resuming
|
| 82 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000001048576000.npz since we are not resuming
|
| 83 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000001153433600.npz since we are not resuming
|
| 84 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000001258291200.npz since we are not resuming
|
| 85 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000001363148800.npz since we are not resuming
|
| 86 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000001468006400.npz since we are not resuming
|
| 87 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000001572864000.npz since we are not resuming
|
| 88 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000001677721600.npz since we are not resuming
|
| 89 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000001782579200.npz since we are not resuming
|
| 90 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000001887436800.npz since we are not resuming
|
| 91 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000001992294400.npz since we are not resuming
|
| 92 |
+
[2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000002097152000.npz since we are not resuming
|
| 93 |
+
[2025-10-11 18:22:15][logger:171][INFO] [step: 0] [train_data_info/vocab_size: 50277] [train_data_info/global_tokens_per_batch: 2097152] [train_data_info/local_tokens_per_batch: 2097152] [train_data_info/batch_len: 2048] [train_data_info/seq_len: 2048] [train_data_info/total_tokens: 2055208960] [train_data_info/global_batch_size: 1024] [train_data_info/local_batch_size: 1024]
|
| 94 |
+
[2025-10-11 18:22:15][logger:171][INFO] [step: 0] [val_data_info/vocab_size: 50277] [val_data_info/global_tokens_per_batch: 2048] [val_data_info/local_tokens_per_batch: 2048] [val_data_info/batch_len: 2048] [val_data_info/seq_len: 2048] [val_data_info/total_tokens: 2147483648] [val_data_info/global_batch_size: 1] [val_data_info/local_batch_size: 1]
|
| 95 |
+
[2025-10-11 18:22:15][logger:171][INFO] [step: 0] [model_info/total_params: 27447040] [model_info/trainable_params: 27447040] [model_info/embedding_params: 12870912] [model_info/flops_per_token: 0] [model_info/non_embedding_params: 14576128]
|
| 96 |
+
[2025-10-11 18:22:54][utils:57][INFO] [P: 1.00%] [S: 20971520/2097152000] [T: 0:00:38] [ETA: 1:03:37] [loss: 9.773] [tokens/s: 603077.682] [batches/s: 0.288] [MFU: 0.000] [TFLOPS: 0.000]
|
| 97 |
+
[2025-10-11 18:23:29][utils:57][INFO] [P: 2.00%] [S: 41943040/2097152000] [T: 0:01:13] [ETA: 0:59:51] [loss: 8.168] [tokens/s: 603404.871] [batches/s: 0.288] [MFU: 0.000] [TFLOPS: 0.000]
|
| 98 |
+
[2025-10-11 18:23:29][train:194][INFO] Running validation...
|
| 99 |
+
[2025-10-11 18:24:52][logger:171][INFO] [step: 41943040] [val/train_token_count: 41943040] [val/train_batch_count: 20] [val/train_flop_count: 0] [val/train_total_time: 73.294] [val/train_update_time: 72.966] [val/loss: 8.067] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.724] [val/val_tokens_per_second: 489228.498] [val/loss_avg_len_2048: 8.067] [val/perplexity_len_2048: 3186.330] [val/loss_avg_len_1024: 8.063] [val/perplexity_len_1024: 3176.025] [val/loss_avg_len_512: 8.063] [val/perplexity_len_512: 3175.293]
|
| 100 |
+
[2025-10-11 18:25:27][utils:57][INFO] [P: 3.00%] [S: 62914560/2097152000] [T: 0:03:11] [ETA: 1:43:21] [loss: 7.603] [tokens/s: 329572.484] [batches/s: 0.157] [MFU: 0.000] [TFLOPS: 0.000]
|
| 101 |
+
[2025-10-11 18:26:02][utils:57][INFO] [P: 4.00%] [S: 83886080/2097152000] [T: 0:03:46] [ETA: 1:30:36] [loss: 7.274] [tokens/s: 372993.504] [batches/s: 0.178] [MFU: 0.000] [TFLOPS: 0.000]
|
| 102 |
+
[2025-10-11 18:26:02][train:194][INFO] Running validation...
|
| 103 |
+
[2025-10-11 18:27:26][logger:171][INFO] [step: 83886080] [val/train_token_count: 83886080] [val/train_batch_count: 40] [val/train_flop_count: 0] [val/train_total_time: 226.536] [val/train_update_time: 142.216] [val/loss: 7.245] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.879] [val/val_tokens_per_second: 488322.377] [val/loss_avg_len_2048: 7.245] [val/perplexity_len_2048: 1401.492] [val/loss_avg_len_1024: 7.242] [val/perplexity_len_1024: 1397.308] [val/loss_avg_len_512: 7.246] [val/perplexity_len_512: 1402.335]
|
| 104 |
+
[2025-10-11 18:28:01][utils:57][INFO] [P: 5.00%] [S: 104857600/2097152000] [T: 0:05:45] [ETA: 1:49:18] [loss: 7.020] [tokens/s: 304108.020] [batches/s: 0.145] [MFU: 0.000] [TFLOPS: 0.000]
|
| 105 |
+
[2025-10-11 18:28:01][logger:171][INFO] [step: 104857600] [train_eval/train_token_count: 104857600] [train_eval/train_batch_count: 50] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 345.167] [train_eval/train_update_time: 176.836] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 8.311] [train_eval/perplexity_len_2048: 4068.811] [train_eval/loss_avg_len_1024: 8.310] [train_eval/perplexity_len_1024: 4065.728] [train_eval/loss_avg_len_512: 8.312] [train_eval/perplexity_len_512: 4070.792]
|
| 106 |
+
[2025-10-11 18:28:35][utils:57][INFO] [P: 6.00%] [S: 125829120/2097152000] [T: 0:06:19] [ETA: 1:39:12] [loss: 6.767] [tokens/s: 332023.735] [batches/s: 0.158] [MFU: 0.000] [TFLOPS: 0.000]
|
| 107 |
+
[2025-10-11 18:28:35][train:194][INFO] Running validation...
|
| 108 |
+
[2025-10-11 18:29:59][logger:171][INFO] [step: 125829120] [val/train_token_count: 125829120] [val/train_batch_count: 60] [val/train_flop_count: 0] [val/train_total_time: 379.919] [val/train_update_time: 211.447] [val/loss: 6.769] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.947] [val/val_tokens_per_second: 487928.793] [val/loss_avg_len_2048: 6.769] [val/perplexity_len_2048: 870.420] [val/loss_avg_len_1024: 6.765] [val/perplexity_len_1024: 867.369] [val/loss_avg_len_512: 6.771] [val/perplexity_len_512: 872.359]
|
| 109 |
+
[2025-10-11 18:30:34][utils:57][INFO] [P: 7.00%] [S: 146800640/2097152000] [T: 0:08:18] [ETA: 1:50:24] [loss: 6.605] [tokens/s: 294498.470] [batches/s: 0.140] [MFU: 0.000] [TFLOPS: 0.000]
|
| 110 |
+
[2025-10-11 18:31:09][utils:57][INFO] [P: 8.00%] [S: 167772160/2097152000] [T: 0:08:53] [ETA: 1:42:13] [loss: 6.441] [tokens/s: 314916.038] [batches/s: 0.150] [MFU: 0.000] [TFLOPS: 0.000]
|
| 111 |
+
[2025-10-11 18:31:09][train:194][INFO] Running validation...
|
| 112 |
+
[2025-10-11 18:32:33][logger:171][INFO] [step: 167772160] [val/train_token_count: 167772160] [val/train_batch_count: 80] [val/train_flop_count: 0] [val/train_total_time: 533.352] [val/train_update_time: 280.672] [val/loss: 6.397] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.748] [val/val_tokens_per_second: 489088.636] [val/loss_avg_len_2048: 6.397] [val/perplexity_len_2048: 600.142] [val/loss_avg_len_1024: 6.396] [val/perplexity_len_1024: 599.163] [val/loss_avg_len_512: 6.406] [val/perplexity_len_512: 605.769]
|
| 113 |
+
[2025-10-11 18:33:07][utils:57][INFO] [P: 9.00%] [S: 188743680/2097152000] [T: 0:10:51] [ETA: 1:49:51] [loss: 6.236] [tokens/s: 289551.070] [batches/s: 0.138] [MFU: 0.000] [TFLOPS: 0.000]
|
| 114 |
+
[2025-10-11 18:33:42][utils:57][INFO] [P: 10.00%] [S: 209715200/2097152000] [T: 0:11:26] [ETA: 1:42:59] [loss: 6.122] [tokens/s: 305610.676] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 115 |
+
[2025-10-11 18:33:42][logger:171][INFO] [step: 209715200] [train_eval/train_token_count: 209715200] [train_eval/train_batch_count: 100] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 686.614] [train_eval/train_update_time: 349.902] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.520] [train_eval/perplexity_len_2048: 678.573] [train_eval/loss_avg_len_1024: 6.520] [train_eval/perplexity_len_1024: 678.732] [train_eval/loss_avg_len_512: 6.529] [train_eval/perplexity_len_512: 684.423]
|
| 116 |
+
[2025-10-11 18:33:42][train:194][INFO] Running validation...
|
| 117 |
+
[2025-10-11 18:35:06][logger:171][INFO] [step: 209715200] [val/train_token_count: 209715200] [val/train_batch_count: 100] [val/train_flop_count: 0] [val/train_total_time: 686.614] [val/train_update_time: 349.902] [val/loss: 6.105] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.813] [val/val_tokens_per_second: 488705.727] [val/loss_avg_len_2048: 6.105] [val/perplexity_len_2048: 448.202] [val/loss_avg_len_1024: 6.107] [val/perplexity_len_1024: 448.853] [val/loss_avg_len_512: 6.123] [val/perplexity_len_512: 456.028]
|
| 118 |
+
[2025-10-11 18:35:06][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000000209715200.pt...
|
| 119 |
+
[2025-10-11 18:35:06][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000000209715200.pt.
|
| 120 |
+
[2025-10-11 18:35:06][logger:171][INFO] [step: 209715200] [checkpoint/checkpoint_time: 0.447]
|
| 121 |
+
[2025-10-11 18:35:41][utils:57][INFO] [P: 11.00%] [S: 230686720/2097152000] [T: 0:13:25] [ETA: 1:48:38] [loss: 5.998] [tokens/s: 271889.561] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 122 |
+
[2025-10-11 18:36:16][utils:57][INFO] [P: 12.00%] [S: 251658240/2097152000] [T: 0:14:00] [ETA: 1:42:42] [loss: 5.855] [tokens/s: 305368.658] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 123 |
+
[2025-10-11 18:36:16][train:194][INFO] Running validation...
|
| 124 |
+
[2025-10-11 18:37:40][logger:171][INFO] [step: 251658240] [val/train_token_count: 251658240] [val/train_batch_count: 120] [val/train_flop_count: 0] [val/train_total_time: 840.405] [val/train_update_time: 419.145] [val/loss: 5.873] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.901] [val/val_tokens_per_second: 488192.483] [val/loss_avg_len_2048: 5.873] [val/perplexity_len_2048: 355.404] [val/loss_avg_len_1024: 5.877] [val/perplexity_len_1024: 356.582] [val/loss_avg_len_512: 5.895] [val/perplexity_len_512: 363.294]
|
| 125 |
+
[2025-10-11 18:38:15][utils:57][INFO] [P: 13.00%] [S: 272629760/2097152000] [T: 0:15:59] [ETA: 1:46:58] [loss: 5.793] [tokens/s: 271811.835] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 126 |
+
[2025-10-11 18:38:49][utils:57][INFO] [P: 14.00%] [S: 293601280/2097152000] [T: 0:16:33] [ETA: 1:41:44] [loss: 5.713] [tokens/s: 305351.869] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 127 |
+
[2025-10-11 18:38:49][train:194][INFO] Running validation...
|
| 128 |
+
[2025-10-11 18:40:12][logger:171][INFO] [step: 293601280] [val/train_token_count: 293601280] [val/train_batch_count: 140] [val/train_flop_count: 0] [val/train_total_time: 993.830] [val/train_update_time: 488.409] [val/loss: 5.695] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.151] [val/val_tokens_per_second: 492597.127] [val/loss_avg_len_2048: 5.695] [val/perplexity_len_2048: 297.478] [val/loss_avg_len_1024: 5.701] [val/perplexity_len_1024: 299.110] [val/loss_avg_len_512: 5.721] [val/perplexity_len_512: 305.264]
|
| 129 |
+
[2025-10-11 18:40:47][utils:57][INFO] [P: 15.00%] [S: 314572800/2097152000] [T: 0:18:31] [ETA: 1:44:59] [loss: 5.581] [tokens/s: 272077.586] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 130 |
+
[2025-10-11 18:40:47][logger:171][INFO] [step: 314572800] [train_eval/train_token_count: 314572800] [train_eval/train_batch_count: 150] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1111.739] [train_eval/train_update_time: 523.030] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.841] [train_eval/perplexity_len_2048: 344.221] [train_eval/loss_avg_len_1024: 5.846] [train_eval/perplexity_len_1024: 345.882] [train_eval/loss_avg_len_512: 5.864] [train_eval/perplexity_len_512: 352.119]
|
| 131 |
+
[2025-10-11 18:41:22][utils:57][INFO] [P: 16.00%] [S: 335544320/2097152000] [T: 0:19:06] [ETA: 1:40:19] [loss: 5.538] [tokens/s: 305715.175] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 132 |
+
[2025-10-11 18:41:22][train:194][INFO] Running validation...
|
| 133 |
+
[2025-10-11 18:42:45][logger:171][INFO] [step: 335544320] [val/train_token_count: 335544320] [val/train_batch_count: 160] [val/train_flop_count: 0] [val/train_total_time: 1146.476] [val/train_update_time: 557.623] [val/loss: 5.523] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.126] [val/val_tokens_per_second: 492748.069] [val/loss_avg_len_2048: 5.523] [val/perplexity_len_2048: 250.341] [val/loss_avg_len_1024: 5.531] [val/perplexity_len_1024: 252.272] [val/loss_avg_len_512: 5.553] [val/perplexity_len_512: 258.087]
|
| 134 |
+
[2025-10-11 18:43:20][utils:57][INFO] [P: 17.00%] [S: 356515840/2097152000] [T: 0:21:04] [ETA: 1:42:53] [loss: 5.430] [tokens/s: 272369.407] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 135 |
+
[2025-10-11 18:43:55][utils:57][INFO] [P: 18.00%] [S: 377487360/2097152000] [T: 0:21:39] [ETA: 1:38:38] [loss: 5.393] [tokens/s: 305993.804] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 136 |
+
[2025-10-11 18:43:55][train:194][INFO] Running validation...
|
| 137 |
+
[2025-10-11 18:45:18][logger:171][INFO] [step: 377487360] [val/train_token_count: 377487360] [val/train_batch_count: 180] [val/train_flop_count: 0] [val/train_total_time: 1299.095] [val/train_update_time: 626.843] [val/loss: 5.388] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.263] [val/val_tokens_per_second: 491935.403] [val/loss_avg_len_2048: 5.388] [val/perplexity_len_2048: 218.701] [val/loss_avg_len_1024: 5.396] [val/perplexity_len_1024: 220.525] [val/loss_avg_len_512: 5.420] [val/perplexity_len_512: 225.771]
|
| 138 |
+
[2025-10-11 18:45:53][utils:57][INFO] [P: 19.00%] [S: 398458880/2097152000] [T: 0:23:37] [ETA: 1:40:42] [loss: 5.366] [tokens/s: 272483.112] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 139 |
+
[2025-10-11 18:46:27][utils:57][INFO] [P: 20.00%] [S: 419430400/2097152000] [T: 0:24:12] [ETA: 1:36:48] [loss: 5.265] [tokens/s: 306369.458] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 140 |
+
[2025-10-11 18:46:27][logger:171][INFO] [step: 419430400] [train_eval/train_token_count: 419430400] [train_eval/train_batch_count: 200] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1452.039] [train_eval/train_update_time: 696.240] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.426] [train_eval/perplexity_len_2048: 227.299] [train_eval/loss_avg_len_1024: 5.435] [train_eval/perplexity_len_1024: 229.249] [train_eval/loss_avg_len_512: 5.457] [train_eval/perplexity_len_512: 234.370]
|
| 141 |
+
[2025-10-11 18:46:27][train:194][INFO] Running validation...
|
| 142 |
+
[2025-10-11 18:47:51][logger:171][INFO] [step: 419430400] [val/train_token_count: 419430400] [val/train_batch_count: 200] [val/train_flop_count: 0] [val/train_total_time: 1452.039] [val/train_update_time: 696.240] [val/loss: 5.268] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.219] [val/val_tokens_per_second: 492196.721] [val/loss_avg_len_2048: 5.268] [val/perplexity_len_2048: 193.983] [val/loss_avg_len_1024: 5.278] [val/perplexity_len_1024: 195.923] [val/loss_avg_len_512: 5.303] [val/perplexity_len_512: 200.900]
|
| 143 |
+
[2025-10-11 18:47:51][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000000419430400.pt...
|
| 144 |
+
[2025-10-11 18:47:51][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000000419430400.pt.
|
| 145 |
+
[2025-10-11 18:47:51][logger:171][INFO] [step: 419430400] [checkpoint/checkpoint_time: 0.436]
|
| 146 |
+
[2025-10-11 18:48:26][utils:57][INFO] [P: 21.00%] [S: 440401920/2097152000] [T: 0:26:10] [ETA: 1:38:27] [loss: 5.229] [tokens/s: 272702.157] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 147 |
+
[2025-10-11 18:49:01][utils:57][INFO] [P: 22.00%] [S: 461373440/2097152000] [T: 0:26:45] [ETA: 1:34:51] [loss: 5.191] [tokens/s: 306487.143] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 148 |
+
[2025-10-11 18:49:01][train:194][INFO] Running validation...
|
| 149 |
+
[2025-10-11 18:50:24][logger:171][INFO] [step: 461373440] [val/train_token_count: 461373440] [val/train_batch_count: 220] [val/train_flop_count: 0] [val/train_total_time: 1605.222] [val/train_update_time: 765.490] [val/loss: 5.169] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.114] [val/val_tokens_per_second: 492816.400] [val/loss_avg_len_2048: 5.169] [val/perplexity_len_2048: 175.706] [val/loss_avg_len_1024: 5.179] [val/perplexity_len_1024: 177.538] [val/loss_avg_len_512: 5.205] [val/perplexity_len_512: 182.153]
|
| 150 |
+
[2025-10-11 18:50:59][utils:57][INFO] [P: 23.00%] [S: 482344960/2097152000] [T: 0:28:43] [ETA: 1:36:08] [loss: 5.120] [tokens/s: 272999.729] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 151 |
+
[2025-10-11 18:51:33][utils:57][INFO] [P: 24.00%] [S: 503316480/2097152000] [T: 0:29:17] [ETA: 1:32:46] [loss: 5.060] [tokens/s: 306496.656] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 152 |
+
[2025-10-11 18:51:33][train:194][INFO] Running validation...
|
| 153 |
+
[2025-10-11 18:52:56][logger:171][INFO] [step: 503316480] [val/train_token_count: 503316480] [val/train_batch_count: 240] [val/train_flop_count: 0] [val/train_total_time: 1757.860] [val/train_update_time: 834.736] [val/loss: 5.079] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.139] [val/val_tokens_per_second: 492668.686] [val/loss_avg_len_2048: 5.079] [val/perplexity_len_2048: 160.642] [val/loss_avg_len_1024: 5.090] [val/perplexity_len_1024: 162.465] [val/loss_avg_len_512: 5.117] [val/perplexity_len_512: 166.831]
|
| 154 |
+
[2025-10-11 18:53:31][utils:57][INFO] [P: 25.00%] [S: 524288000/2097152000] [T: 0:31:15] [ETA: 1:33:47] [loss: 5.056] [tokens/s: 272986.995] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 155 |
+
[2025-10-11 18:53:31][logger:171][INFO] [step: 524288000] [train_eval/train_token_count: 524288000] [train_eval/train_batch_count: 250] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1875.768] [train_eval/train_update_time: 869.364] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.151] [train_eval/perplexity_len_2048: 172.569] [train_eval/loss_avg_len_1024: 5.160] [train_eval/perplexity_len_1024: 174.145] [train_eval/loss_avg_len_512: 5.184] [train_eval/perplexity_len_512: 178.345]
|
| 156 |
+
[2025-10-11 18:54:06][utils:57][INFO] [P: 26.00%] [S: 545259520/2097152000] [T: 0:31:50] [ETA: 1:30:37] [loss: 5.006] [tokens/s: 306481.407] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 157 |
+
[2025-10-11 18:54:06][train:194][INFO] Running validation...
|
| 158 |
+
[2025-10-11 18:55:29][logger:171][INFO] [step: 545259520] [val/train_token_count: 545259520] [val/train_batch_count: 260] [val/train_flop_count: 0] [val/train_total_time: 1910.516] [val/train_update_time: 903.970] [val/loss: 5.007] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.178] [val/val_tokens_per_second: 492436.321] [val/loss_avg_len_2048: 5.007] [val/perplexity_len_2048: 149.392] [val/loss_avg_len_1024: 5.019] [val/perplexity_len_1024: 151.218] [val/loss_avg_len_512: 5.046] [val/perplexity_len_512: 155.350]
|
| 159 |
+
[2025-10-11 18:56:04][utils:57][INFO] [P: 27.00%] [S: 566231040/2097152000] [T: 0:33:48] [ETA: 1:31:24] [loss: 4.980] [tokens/s: 272966.915] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 160 |
+
[2025-10-11 18:56:39][utils:57][INFO] [P: 28.00%] [S: 587202560/2097152000] [T: 0:34:23] [ETA: 1:28:25] [loss: 4.937] [tokens/s: 306516.134] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 161 |
+
[2025-10-11 18:56:39][train:194][INFO] Running validation...
|
| 162 |
+
[2025-10-11 18:58:02][logger:171][INFO] [step: 587202560] [val/train_token_count: 587202560] [val/train_batch_count: 280] [val/train_flop_count: 0] [val/train_total_time: 2063.196] [val/train_update_time: 973.209] [val/loss: 4.939] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.110] [val/val_tokens_per_second: 492840.705] [val/loss_avg_len_2048: 4.939] [val/perplexity_len_2048: 139.689] [val/loss_avg_len_1024: 4.952] [val/perplexity_len_1024: 141.455] [val/loss_avg_len_512: 4.979] [val/perplexity_len_512: 145.340]
|
| 163 |
+
[2025-10-11 18:58:37][utils:57][INFO] [P: 29.00%] [S: 608174080/2097152000] [T: 0:36:21] [ETA: 1:28:59] [loss: 4.895] [tokens/s: 273079.385] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 164 |
+
[2025-10-11 18:59:11][utils:57][INFO] [P: 30.00%] [S: 629145600/2097152000] [T: 0:36:55] [ETA: 1:26:10] [loss: 4.873] [tokens/s: 306834.003] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 165 |
+
[2025-10-11 18:59:11][logger:171][INFO] [step: 629145600] [train_eval/train_token_count: 629145600] [train_eval/train_batch_count: 300] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2215.832] [train_eval/train_update_time: 1042.442] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.964] [train_eval/perplexity_len_2048: 143.168] [train_eval/loss_avg_len_1024: 4.973] [train_eval/perplexity_len_1024: 144.463] [train_eval/loss_avg_len_512: 4.997] [train_eval/perplexity_len_512: 148.005]
|
| 166 |
+
[2025-10-11 18:59:11][train:194][INFO] Running validation...
|
| 167 |
+
[2025-10-11 19:00:34][logger:171][INFO] [step: 629145600] [val/train_token_count: 629145600] [val/train_batch_count: 300] [val/train_flop_count: 0] [val/train_total_time: 2215.832] [val/train_update_time: 1042.442] [val/loss: 4.881] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.080] [val/val_tokens_per_second: 493016.170] [val/loss_avg_len_2048: 4.881] [val/perplexity_len_2048: 131.756] [val/loss_avg_len_1024: 4.894] [val/perplexity_len_1024: 133.429] [val/loss_avg_len_512: 4.921] [val/perplexity_len_512: 137.133]
|
| 168 |
+
[2025-10-11 19:00:34][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000000629145600.pt...
|
| 169 |
+
[2025-10-11 19:00:35][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000000629145600.pt.
|
| 170 |
+
[2025-10-11 19:00:35][logger:171][INFO] [step: 629145600] [checkpoint/checkpoint_time: 0.440]
|
| 171 |
+
[2025-10-11 19:01:10][utils:57][INFO] [P: 31.00%] [S: 650117120/2097152000] [T: 0:38:54] [ETA: 1:26:35] [loss: 4.887] [tokens/s: 273119.395] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 172 |
+
[2025-10-11 19:01:44][utils:57][INFO] [P: 32.00%] [S: 671088640/2097152000] [T: 0:39:28] [ETA: 1:23:53] [loss: 4.818] [tokens/s: 306647.732] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 173 |
+
[2025-10-11 19:01:44][train:194][INFO] Running validation...
|
| 174 |
+
[2025-10-11 19:03:07][logger:171][INFO] [step: 671088640] [val/train_token_count: 671088640] [val/train_batch_count: 320] [val/train_flop_count: 0] [val/train_total_time: 2368.879] [val/train_update_time: 1111.683] [val/loss: 4.833] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.095] [val/val_tokens_per_second: 492927.703] [val/loss_avg_len_2048: 4.833] [val/perplexity_len_2048: 125.533] [val/loss_avg_len_1024: 4.846] [val/perplexity_len_1024: 127.187] [val/loss_avg_len_512: 4.873] [val/perplexity_len_512: 130.752]
|
| 175 |
+
[2025-10-11 19:03:42][utils:57][INFO] [P: 33.00%] [S: 692060160/2097152000] [T: 0:41:26] [ETA: 1:24:08] [loss: 4.830] [tokens/s: 273124.311] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 176 |
+
[2025-10-11 19:04:17][utils:57][INFO] [P: 34.00%] [S: 713031680/2097152000] [T: 0:42:01] [ETA: 1:21:34] [loss: 4.790] [tokens/s: 306668.831] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 177 |
+
[2025-10-11 19:04:17][train:194][INFO] Running validation...
|
| 178 |
+
[2025-10-11 19:05:40][logger:171][INFO] [step: 713031680] [val/train_token_count: 713031680] [val/train_batch_count: 340] [val/train_flop_count: 0] [val/train_total_time: 2521.492] [val/train_update_time: 1180.932] [val/loss: 4.790] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.172] [val/val_tokens_per_second: 492473.163] [val/loss_avg_len_2048: 4.790] [val/perplexity_len_2048: 120.287] [val/loss_avg_len_1024: 4.803] [val/perplexity_len_1024: 121.891] [val/loss_avg_len_512: 4.831] [val/perplexity_len_512: 125.291]
|
| 179 |
+
[2025-10-11 19:06:15][utils:57][INFO] [P: 35.00%] [S: 734003200/2097152000] [T: 0:43:59] [ETA: 1:21:41] [loss: 4.776] [tokens/s: 273125.433] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 180 |
+
[2025-10-11 19:06:15][logger:171][INFO] [step: 734003200] [train_eval/train_token_count: 734003200] [train_eval/train_batch_count: 350] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2639.418] [train_eval/train_update_time: 1215.554] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.824] [train_eval/perplexity_len_2048: 124.432] [train_eval/loss_avg_len_1024: 4.836] [train_eval/perplexity_len_1024: 125.926] [train_eval/loss_avg_len_512: 4.861] [train_eval/perplexity_len_512: 129.172]
|
| 181 |
+
[2025-10-11 19:06:50][utils:57][INFO] [P: 36.00%] [S: 754974720/2097152000] [T: 0:44:34] [ETA: 1:19:14] [loss: 4.703] [tokens/s: 306680.372] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 182 |
+
[2025-10-11 19:06:50][train:194][INFO] Running validation...
|
| 183 |
+
[2025-10-11 19:08:13][logger:171][INFO] [step: 754974720] [val/train_token_count: 754974720] [val/train_batch_count: 360] [val/train_flop_count: 0] [val/train_total_time: 2674.171] [val/train_update_time: 1250.169] [val/loss: 4.743] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.356] [val/val_tokens_per_second: 491384.324] [val/loss_avg_len_2048: 4.743] [val/perplexity_len_2048: 114.805] [val/loss_avg_len_1024: 4.757] [val/perplexity_len_1024: 116.411] [val/loss_avg_len_512: 4.785] [val/perplexity_len_512: 119.713]
|
| 184 |
+
[2025-10-11 19:08:48][utils:57][INFO] [P: 37.00%] [S: 775946240/2097152000] [T: 0:46:32] [ETA: 1:19:14] [loss: 4.724] [tokens/s: 273051.613] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 185 |
+
[2025-10-11 19:09:22][utils:57][INFO] [P: 38.00%] [S: 796917760/2097152000] [T: 0:47:07] [ETA: 1:16:52] [loss: 4.700] [tokens/s: 306572.237] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 186 |
+
[2025-10-11 19:09:22][train:194][INFO] Running validation...
|
| 187 |
+
[2025-10-11 19:10:46][logger:171][INFO] [step: 796917760] [val/train_token_count: 796917760] [val/train_batch_count: 380] [val/train_flop_count: 0] [val/train_total_time: 2827.018] [val/train_update_time: 1319.377] [val/loss: 4.708] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.290] [val/val_tokens_per_second: 491776.322] [val/loss_avg_len_2048: 4.708] [val/perplexity_len_2048: 110.851] [val/loss_avg_len_1024: 4.722] [val/perplexity_len_1024: 112.405] [val/loss_avg_len_512: 4.750] [val/perplexity_len_512: 115.608]
|
| 188 |
+
[2025-10-11 19:11:21][utils:57][INFO] [P: 39.00%] [S: 817889280/2097152000] [T: 0:49:05] [ETA: 1:16:46] [loss: 4.711] [tokens/s: 272980.584] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 189 |
+
[2025-10-11 19:11:55][utils:57][INFO] [P: 40.00%] [S: 838860800/2097152000] [T: 0:49:39] [ETA: 1:14:29] [loss: 4.619] [tokens/s: 306684.637] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 190 |
+
[2025-10-11 19:11:55][logger:171][INFO] [step: 838860800] [train_eval/train_token_count: 838860800] [train_eval/train_batch_count: 400] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2979.820] [train_eval/train_update_time: 1388.590] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.722] [train_eval/perplexity_len_2048: 112.354] [train_eval/loss_avg_len_1024: 4.733] [train_eval/perplexity_len_1024: 113.626] [train_eval/loss_avg_len_512: 4.758] [train_eval/perplexity_len_512: 116.555]
|
| 191 |
+
[2025-10-11 19:11:55][train:194][INFO] Running validation...
|
| 192 |
+
[2025-10-11 19:13:19][logger:171][INFO] [step: 838860800] [val/train_token_count: 838860800] [val/train_batch_count: 400] [val/train_flop_count: 0] [val/train_total_time: 2979.820] [val/train_update_time: 1388.590] [val/loss: 4.676] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.660] [val/val_tokens_per_second: 489599.103] [val/loss_avg_len_2048: 4.676] [val/perplexity_len_2048: 107.362] [val/loss_avg_len_1024: 4.691] [val/perplexity_len_1024: 108.916] [val/loss_avg_len_512: 4.719] [val/perplexity_len_512: 112.042]
|
| 193 |
+
[2025-10-11 19:13:19][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000000838860800.pt...
|
| 194 |
+
[2025-10-11 19:13:19][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000000838860800.pt.
|
| 195 |
+
[2025-10-11 19:13:19][logger:171][INFO] [step: 838860800] [checkpoint/checkpoint_time: 0.445]
|
| 196 |
+
[2025-10-11 19:13:54][utils:57][INFO] [P: 41.00%] [S: 859832320/2097152000] [T: 0:51:38] [ETA: 1:14:19] [loss: 4.639] [tokens/s: 272787.535] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 197 |
+
[2025-10-11 19:14:29][utils:57][INFO] [P: 42.00%] [S: 880803840/2097152000] [T: 0:52:13] [ETA: 1:12:07] [loss: 4.639] [tokens/s: 306227.933] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 198 |
+
[2025-10-11 19:14:29][train:194][INFO] Running validation...
|
| 199 |
+
[2025-10-11 19:15:53][logger:171][INFO] [step: 880803840] [val/train_token_count: 880803840] [val/train_batch_count: 420] [val/train_flop_count: 0] [val/train_total_time: 3133.448] [val/train_update_time: 1457.837] [val/loss: 4.645] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.890] [val/val_tokens_per_second: 488258.606] [val/loss_avg_len_2048: 4.645] [val/perplexity_len_2048: 104.074] [val/loss_avg_len_1024: 4.660] [val/perplexity_len_1024: 105.603] [val/loss_avg_len_512: 4.688] [val/perplexity_len_512: 108.651]
|
| 200 |
+
[2025-10-11 19:16:28][utils:57][INFO] [P: 43.00%] [S: 901775360/2097152000] [T: 0:54:12] [ETA: 1:11:50] [loss: 4.631] [tokens/s: 272508.815] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 201 |
+
[2025-10-11 19:17:02][utils:57][INFO] [P: 44.00%] [S: 922746880/2097152000] [T: 0:54:46] [ETA: 1:09:43] [loss: 4.657] [tokens/s: 305908.080] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 202 |
+
[2025-10-11 19:17:02][train:194][INFO] Running validation...
|
| 203 |
+
[2025-10-11 19:18:26][logger:171][INFO] [step: 922746880] [val/train_token_count: 922746880] [val/train_batch_count: 440] [val/train_flop_count: 0] [val/train_total_time: 3286.849] [val/train_update_time: 1527.068] [val/loss: 4.622] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.674] [val/val_tokens_per_second: 489518.085] [val/loss_avg_len_2048: 4.622] [val/perplexity_len_2048: 101.723] [val/loss_avg_len_1024: 4.637] [val/perplexity_len_1024: 103.258] [val/loss_avg_len_512: 4.666] [val/perplexity_len_512: 106.264]
|
| 204 |
+
[2025-10-11 19:19:01][utils:57][INFO] [P: 45.00%] [S: 943718400/2097152000] [T: 0:56:45] [ETA: 1:09:22] [loss: 4.604] [tokens/s: 272329.416] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 205 |
+
[2025-10-11 19:19:01][logger:171][INFO] [step: 943718400] [train_eval/train_token_count: 943718400] [train_eval/train_batch_count: 450] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3405.286] [train_eval/train_update_time: 1561.695] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.642] [train_eval/perplexity_len_2048: 103.737] [train_eval/loss_avg_len_1024: 4.656] [train_eval/perplexity_len_1024: 105.218] [train_eval/loss_avg_len_512: 4.684] [train_eval/perplexity_len_512: 108.214]
|
| 206 |
+
[2025-10-11 19:19:36][utils:57][INFO] [P: 46.00%] [S: 964689920/2097152000] [T: 0:57:20] [ETA: 1:07:18] [loss: 4.574] [tokens/s: 305721.981] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 207 |
+
[2025-10-11 19:19:36][train:194][INFO] Running validation...
|
| 208 |
+
[2025-10-11 19:20:59][logger:171][INFO] [step: 964689920] [val/train_token_count: 964689920] [val/train_batch_count: 460] [val/train_flop_count: 0] [val/train_total_time: 3440.122] [val/train_update_time: 1596.400] [val/loss: 4.595] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.088] [val/val_tokens_per_second: 492973.221] [val/loss_avg_len_2048: 4.595] [val/perplexity_len_2048: 98.979] [val/loss_avg_len_1024: 4.610] [val/perplexity_len_1024: 100.481] [val/loss_avg_len_512: 4.639] [val/perplexity_len_512: 103.400]
|
| 209 |
+
[2025-10-11 19:21:33][utils:57][INFO] [P: 47.00%] [S: 985661440/2097152000] [T: 0:59:17] [ETA: 1:06:52] [loss: 4.603] [tokens/s: 272392.626] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 210 |
+
[2025-10-11 19:22:08][utils:57][INFO] [P: 48.00%] [S: 1006632960/2097152000] [T: 0:59:52] [ETA: 1:04:52] [loss: 4.579] [tokens/s: 305786.877] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 211 |
+
[2025-10-11 19:22:08][train:194][INFO] Running validation...
|
| 212 |
+
[2025-10-11 19:23:31][logger:171][INFO] [step: 1006632960] [val/train_token_count: 1006632960] [val/train_batch_count: 480] [val/train_flop_count: 0] [val/train_total_time: 3592.762] [val/train_update_time: 1665.690] [val/loss: 4.576] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.213] [val/val_tokens_per_second: 492228.525] [val/loss_avg_len_2048: 4.576] [val/perplexity_len_2048: 97.115] [val/loss_avg_len_1024: 4.591] [val/perplexity_len_1024: 98.559] [val/loss_avg_len_512: 4.619] [val/perplexity_len_512: 101.419]
|
| 213 |
+
[2025-10-11 19:24:06][utils:57][INFO] [P: 49.00%] [S: 1027604480/2097152000] [T: 1:01:50] [ETA: 1:04:22] [loss: 4.574] [tokens/s: 272415.311] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 214 |
+
[2025-10-11 19:24:41][utils:57][INFO] [P: 50.00%] [S: 1048576000/2097152000] [T: 1:02:25] [ETA: 1:02:25] [loss: 4.552] [tokens/s: 306176.351] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 215 |
+
[2025-10-11 19:24:41][logger:171][INFO] [step: 1048576000] [train_eval/train_token_count: 1048576000] [train_eval/train_batch_count: 500] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3745.514] [train_eval/train_update_time: 1734.943] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.581] [train_eval/perplexity_len_2048: 97.609] [train_eval/loss_avg_len_1024: 4.594] [train_eval/perplexity_len_1024: 98.888] [train_eval/loss_avg_len_512: 4.622] [train_eval/perplexity_len_512: 101.721]
|
| 216 |
+
[2025-10-11 19:24:41][train:194][INFO] Running validation...
|
| 217 |
+
[2025-10-11 19:26:04][logger:171][INFO] [step: 1048576000] [val/train_token_count: 1048576000] [val/train_batch_count: 500] [val/train_flop_count: 0] [val/train_total_time: 3745.514] [val/train_update_time: 1734.943] [val/loss: 4.557] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.290] [val/val_tokens_per_second: 491774.305] [val/loss_avg_len_2048: 4.557] [val/perplexity_len_2048: 95.273] [val/loss_avg_len_1024: 4.572] [val/perplexity_len_1024: 96.744] [val/loss_avg_len_512: 4.601] [val/perplexity_len_512: 99.596]
|
| 218 |
+
[2025-10-11 19:26:04][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000001048576000.pt...
|
| 219 |
+
[2025-10-11 19:26:05][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000001048576000.pt.
|
| 220 |
+
[2025-10-11 19:26:05][logger:171][INFO] [step: 1048576000] [checkpoint/checkpoint_time: 0.448]
|
| 221 |
+
[2025-10-11 19:26:39][utils:57][INFO] [P: 51.00%] [S: 1069547520/2097152000] [T: 1:04:24] [ETA: 1:01:52] [loss: 4.556] [tokens/s: 272531.454] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 222 |
+
[2025-10-11 19:27:14][utils:57][INFO] [P: 52.00%] [S: 1090519040/2097152000] [T: 1:04:58] [ETA: 0:59:58] [loss: 4.551] [tokens/s: 306267.878] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 223 |
+
[2025-10-11 19:27:14][train:194][INFO] Running validation...
|
| 224 |
+
[2025-10-11 19:28:37][logger:171][INFO] [step: 1090519040] [val/train_token_count: 1090519040] [val/train_batch_count: 520] [val/train_flop_count: 0] [val/train_total_time: 3898.721] [val/train_update_time: 1804.150] [val/loss: 4.540] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.143] [val/val_tokens_per_second: 492646.918] [val/loss_avg_len_2048: 4.540] [val/perplexity_len_2048: 93.677] [val/loss_avg_len_1024: 4.555] [val/perplexity_len_1024: 95.132] [val/loss_avg_len_512: 4.584] [val/perplexity_len_512: 97.952]
|
| 225 |
+
[2025-10-11 19:29:12][utils:57][INFO] [P: 53.00%] [S: 1111490560/2097152000] [T: 1:06:56] [ETA: 0:59:21] [loss: 4.509] [tokens/s: 272811.607] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 226 |
+
[2025-10-11 19:29:47][utils:57][INFO] [P: 54.00%] [S: 1132462080/2097152000] [T: 1:07:31] [ETA: 0:57:31] [loss: 4.518] [tokens/s: 306522.025] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 227 |
+
[2025-10-11 19:29:47][train:194][INFO] Running validation...
|
| 228 |
+
[2025-10-11 19:31:10][logger:171][INFO] [step: 1132462080] [val/train_token_count: 1132462080] [val/train_batch_count: 540] [val/train_flop_count: 0] [val/train_total_time: 4051.347] [val/train_update_time: 1873.369] [val/loss: 4.519] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.450] [val/val_tokens_per_second: 490830.500] [val/loss_avg_len_2048: 4.519] [val/perplexity_len_2048: 91.751] [val/loss_avg_len_1024: 4.535] [val/perplexity_len_1024: 93.194] [val/loss_avg_len_512: 4.564] [val/perplexity_len_512: 95.948]
|
| 229 |
+
[2025-10-11 19:31:45][utils:57][INFO] [P: 55.00%] [S: 1153433600/2097152000] [T: 1:09:29] [ETA: 0:56:51] [loss: 4.477] [tokens/s: 272931.742] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 230 |
+
[2025-10-11 19:31:45][logger:171][INFO] [step: 1153433600] [train_eval/train_token_count: 1153433600] [train_eval/train_batch_count: 550] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4169.555] [train_eval/train_update_time: 1907.991] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.528] [train_eval/perplexity_len_2048: 92.594] [train_eval/loss_avg_len_1024: 4.539] [train_eval/perplexity_len_1024: 93.629] [train_eval/loss_avg_len_512: 4.565] [train_eval/perplexity_len_512: 96.098]
|
| 231 |
+
[2025-10-11 19:32:20][utils:57][INFO] [P: 56.00%] [S: 1174405120/2097152000] [T: 1:10:04] [ETA: 0:55:03] [loss: 4.517] [tokens/s: 306408.239] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 232 |
+
[2025-10-11 19:32:20][train:194][INFO] Running validation...
|
| 233 |
+
[2025-10-11 19:33:44][logger:171][INFO] [step: 1174405120] [val/train_token_count: 1174405120] [val/train_batch_count: 560] [val/train_flop_count: 0] [val/train_total_time: 4204.282] [val/train_update_time: 1942.595] [val/loss: 4.504] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.983] [val/val_tokens_per_second: 487718.620] [val/loss_avg_len_2048: 4.504] [val/perplexity_len_2048: 90.391] [val/loss_avg_len_1024: 4.520] [val/perplexity_len_1024: 91.834] [val/loss_avg_len_512: 4.549] [val/perplexity_len_512: 94.584]
|
| 234 |
+
[2025-10-11 19:34:18][utils:57][INFO] [P: 57.00%] [S: 1195376640/2097152000] [T: 1:12:03] [ETA: 0:54:21] [loss: 4.482] [tokens/s: 272619.612] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 235 |
+
[2025-10-11 19:34:53][utils:57][INFO] [P: 58.00%] [S: 1216348160/2097152000] [T: 1:12:37] [ETA: 0:52:35] [loss: 4.521] [tokens/s: 306087.029] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 236 |
+
[2025-10-11 19:34:53][train:194][INFO] Running validation...
|
| 237 |
+
[2025-10-11 19:36:17][logger:171][INFO] [step: 1216348160] [val/train_token_count: 1216348160] [val/train_batch_count: 580] [val/train_flop_count: 0] [val/train_total_time: 4357.763] [val/train_update_time: 2011.835] [val/loss: 4.488] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.390] [val/val_tokens_per_second: 491188.527] [val/loss_avg_len_2048: 4.488] [val/perplexity_len_2048: 88.920] [val/loss_avg_len_1024: 4.503] [val/perplexity_len_1024: 90.322] [val/loss_avg_len_512: 4.533] [val/perplexity_len_512: 93.026]
|
| 238 |
+
[2025-10-11 19:36:51][utils:57][INFO] [P: 59.00%] [S: 1237319680/2097152000] [T: 1:14:35] [ETA: 0:51:50] [loss: 4.499] [tokens/s: 272585.017] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 239 |
+
[2025-10-11 19:37:26][utils:57][INFO] [P: 60.00%] [S: 1258291200/2097152000] [T: 1:15:10] [ETA: 0:50:07] [loss: 4.498] [tokens/s: 306265.451] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 240 |
+
[2025-10-11 19:37:26][logger:171][INFO] [step: 1258291200] [train_eval/train_token_count: 1258291200] [train_eval/train_batch_count: 600] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4510.644] [train_eval/train_update_time: 2081.064] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.490] [train_eval/perplexity_len_2048: 89.091] [train_eval/loss_avg_len_1024: 4.501] [train_eval/perplexity_len_1024: 90.065] [train_eval/loss_avg_len_512: 4.527] [train_eval/perplexity_len_512: 92.497]
|
| 241 |
+
[2025-10-11 19:37:26][train:194][INFO] Running validation...
|
| 242 |
+
[2025-10-11 19:38:50][logger:171][INFO] [step: 1258291200] [val/train_token_count: 1258291200] [val/train_batch_count: 600] [val/train_flop_count: 0] [val/train_total_time: 4510.644] [val/train_update_time: 2081.064] [val/loss: 4.478] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.903] [val/val_tokens_per_second: 488180.556] [val/loss_avg_len_2048: 4.478] [val/perplexity_len_2048: 88.043] [val/loss_avg_len_1024: 4.494] [val/perplexity_len_1024: 89.441] [val/loss_avg_len_512: 4.523] [val/perplexity_len_512: 92.125]
|
| 243 |
+
[2025-10-11 19:38:50][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000001258291200.pt...
|
| 244 |
+
[2025-10-11 19:38:50][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000001258291200.pt.
|
| 245 |
+
[2025-10-11 19:38:50][logger:171][INFO] [step: 1258291200] [checkpoint/checkpoint_time: 0.445]
|
| 246 |
+
[2025-10-11 19:39:25][utils:57][INFO] [P: 61.00%] [S: 1279262720/2097152000] [T: 1:17:09] [ETA: 0:49:20] [loss: 4.480] [tokens/s: 272370.966] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 247 |
+
[2025-10-11 19:40:00][utils:57][INFO] [P: 62.00%] [S: 1300234240/2097152000] [T: 1:17:44] [ETA: 0:47:38] [loss: 4.459] [tokens/s: 305691.748] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 248 |
+
[2025-10-11 19:40:00][train:194][INFO] Running validation...
|
| 249 |
+
[2025-10-11 19:41:24][logger:171][INFO] [step: 1300234240] [val/train_token_count: 1300234240] [val/train_batch_count: 620] [val/train_flop_count: 0] [val/train_total_time: 4664.526] [val/train_update_time: 2150.357] [val/loss: 4.465] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.582] [val/val_tokens_per_second: 490057.066] [val/loss_avg_len_2048: 4.465] [val/perplexity_len_2048: 86.891] [val/loss_avg_len_1024: 4.481] [val/perplexity_len_1024: 88.302] [val/loss_avg_len_512: 4.511] [val/perplexity_len_512: 90.981]
|
| 250 |
+
[2025-10-11 19:41:58][utils:57][INFO] [P: 63.00%] [S: 1321205760/2097152000] [T: 1:19:42] [ETA: 0:46:48] [loss: 4.471] [tokens/s: 272186.782] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 251 |
+
[2025-10-11 19:42:33][utils:57][INFO] [P: 64.00%] [S: 1342177280/2097152000] [T: 1:20:17] [ETA: 0:45:09] [loss: 4.470] [tokens/s: 305621.049] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 252 |
+
[2025-10-11 19:42:33][train:194][INFO] Running validation...
|
| 253 |
+
[2025-10-11 19:43:57][logger:171][INFO] [step: 1342177280] [val/train_token_count: 1342177280] [val/train_batch_count: 640] [val/train_flop_count: 0] [val/train_total_time: 4817.616] [val/train_update_time: 2219.606] [val/loss: 4.453] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.616] [val/val_tokens_per_second: 489856.014] [val/loss_avg_len_2048: 4.453] [val/perplexity_len_2048: 85.883] [val/loss_avg_len_1024: 4.469] [val/perplexity_len_1024: 87.290] [val/loss_avg_len_512: 4.499] [val/perplexity_len_512: 89.953]
|
| 254 |
+
[2025-10-11 19:44:31][utils:57][INFO] [P: 65.00%] [S: 1363148800/2097152000] [T: 1:22:15] [ETA: 0:44:17] [loss: 4.449] [tokens/s: 272126.727] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 255 |
+
[2025-10-11 19:44:31][logger:171][INFO] [step: 1363148800] [train_eval/train_token_count: 1363148800] [train_eval/train_batch_count: 650] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4935.988] [train_eval/train_update_time: 2254.240] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.466] [train_eval/perplexity_len_2048: 86.967] [train_eval/loss_avg_len_1024: 4.481] [train_eval/perplexity_len_1024: 88.316] [train_eval/loss_avg_len_512: 4.508] [train_eval/perplexity_len_512: 90.750]
|
| 256 |
+
[2025-10-11 19:45:06][utils:57][INFO] [P: 66.00%] [S: 1384120320/2097152000] [T: 1:22:50] [ETA: 0:42:40] [loss: 4.462] [tokens/s: 305780.593] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 257 |
+
[2025-10-11 19:45:06][train:194][INFO] Running validation...
|
| 258 |
+
[2025-10-11 19:46:30][logger:171][INFO] [step: 1384120320] [val/train_token_count: 1384120320] [val/train_batch_count: 660] [val/train_flop_count: 0] [val/train_total_time: 4970.728] [val/train_update_time: 2288.853] [val/loss: 4.444] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.694] [val/val_tokens_per_second: 489402.924] [val/loss_avg_len_2048: 4.444] [val/perplexity_len_2048: 85.101] [val/loss_avg_len_1024: 4.460] [val/perplexity_len_1024: 86.496] [val/loss_avg_len_512: 4.490] [val/perplexity_len_512: 89.153]
|
| 259 |
+
[2025-10-11 19:47:05][utils:57][INFO] [P: 67.00%] [S: 1405091840/2097152000] [T: 1:24:49] [ETA: 0:41:46] [loss: 4.417] [tokens/s: 272222.213] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 260 |
+
[2025-10-11 19:47:39][utils:57][INFO] [P: 68.00%] [S: 1426063360/2097152000] [T: 1:25:23] [ETA: 0:40:11] [loss: 4.435] [tokens/s: 305648.253] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 261 |
+
[2025-10-11 19:47:39][train:194][INFO] Running validation...
|
| 262 |
+
[2025-10-11 19:49:03][logger:171][INFO] [step: 1426063360] [val/train_token_count: 1426063360] [val/train_batch_count: 680] [val/train_flop_count: 0] [val/train_total_time: 5123.910] [val/train_update_time: 2358.096] [val/loss: 4.435] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.679] [val/val_tokens_per_second: 489488.811] [val/loss_avg_len_2048: 4.435] [val/perplexity_len_2048: 84.344] [val/loss_avg_len_1024: 4.451] [val/perplexity_len_1024: 85.720] [val/loss_avg_len_512: 4.481] [val/perplexity_len_512: 88.349]
|
| 263 |
+
[2025-10-11 19:49:38][utils:57][INFO] [P: 69.00%] [S: 1447034880/2097152000] [T: 1:27:22] [ETA: 0:39:15] [loss: 4.441] [tokens/s: 272114.326] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 264 |
+
[2025-10-11 19:50:13][utils:57][INFO] [P: 70.00%] [S: 1468006400/2097152000] [T: 1:27:57] [ETA: 0:37:41] [loss: 4.434] [tokens/s: 305948.726] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 265 |
+
[2025-10-11 19:50:13][logger:171][INFO] [step: 1468006400] [train_eval/train_token_count: 1468006400] [train_eval/train_batch_count: 700] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5277.087] [train_eval/train_update_time: 2427.330] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.437] [train_eval/perplexity_len_2048: 84.548] [train_eval/loss_avg_len_1024: 4.452] [train_eval/perplexity_len_1024: 85.836] [train_eval/loss_avg_len_512: 4.482] [train_eval/perplexity_len_512: 88.394]
|
| 266 |
+
[2025-10-11 19:50:13][train:194][INFO] Running validation...
|
| 267 |
+
[2025-10-11 19:51:36][logger:171][INFO] [step: 1468006400] [val/train_token_count: 1468006400] [val/train_batch_count: 700] [val/train_flop_count: 0] [val/train_total_time: 5277.087] [val/train_update_time: 2427.330] [val/loss: 4.427] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.180] [val/val_tokens_per_second: 492425.017] [val/loss_avg_len_2048: 4.427] [val/perplexity_len_2048: 83.655] [val/loss_avg_len_1024: 4.443] [val/perplexity_len_1024: 85.042] [val/loss_avg_len_512: 4.474] [val/perplexity_len_512: 87.673]
|
| 268 |
+
[2025-10-11 19:51:36][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000001468006400.pt...
|
| 269 |
+
[2025-10-11 19:51:36][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000001468006400.pt.
|
| 270 |
+
[2025-10-11 19:51:36][logger:171][INFO] [step: 1468006400] [checkpoint/checkpoint_time: 0.452]
|
| 271 |
+
[2025-10-11 19:52:11][utils:57][INFO] [P: 71.00%] [S: 1488977920/2097152000] [T: 1:29:55] [ETA: 0:36:43] [loss: 4.427] [tokens/s: 272383.330] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 272 |
+
[2025-10-11 19:52:46][utils:57][INFO] [P: 72.00%] [S: 1509949440/2097152000] [T: 1:30:30] [ETA: 0:35:11] [loss: 4.429] [tokens/s: 305929.221] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 273 |
+
[2025-10-11 19:52:46][train:194][INFO] Running validation...
|
| 274 |
+
[2025-10-11 19:54:09][logger:171][INFO] [step: 1509949440] [val/train_token_count: 1509949440] [val/train_batch_count: 720] [val/train_flop_count: 0] [val/train_total_time: 5430.250] [val/train_update_time: 2496.591] [val/loss: 4.419] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.169] [val/val_tokens_per_second: 492490.783] [val/loss_avg_len_2048: 4.419] [val/perplexity_len_2048: 83.027] [val/loss_avg_len_1024: 4.436] [val/perplexity_len_1024: 84.412] [val/loss_avg_len_512: 4.466] [val/perplexity_len_512: 87.030]
|
| 275 |
+
[2025-10-11 19:54:44][utils:57][INFO] [P: 73.00%] [S: 1530920960/2097152000] [T: 1:32:28] [ETA: 0:34:12] [loss: 4.433] [tokens/s: 272528.625] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 276 |
+
[2025-10-11 19:55:18][utils:57][INFO] [P: 74.00%] [S: 1551892480/2097152000] [T: 1:33:02] [ETA: 0:32:41] [loss: 4.431] [tokens/s: 306129.846] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 277 |
+
[2025-10-11 19:55:18][train:194][INFO] Running validation...
|
| 278 |
+
[2025-10-11 19:56:42][logger:171][INFO] [step: 1551892480] [val/train_token_count: 1551892480] [val/train_batch_count: 740] [val/train_flop_count: 0] [val/train_total_time: 5582.925] [val/train_update_time: 2565.848] [val/loss: 4.412] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.229] [val/val_tokens_per_second: 492134.178] [val/loss_avg_len_2048: 4.412] [val/perplexity_len_2048: 82.464] [val/loss_avg_len_1024: 4.429] [val/perplexity_len_1024: 83.846] [val/loss_avg_len_512: 4.460] [val/perplexity_len_512: 86.447]
|
| 279 |
+
[2025-10-11 19:57:16][utils:57][INFO] [P: 75.00%] [S: 1572864000/2097152000] [T: 1:35:00] [ETA: 0:31:40] [loss: 4.417] [tokens/s: 272656.772] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 280 |
+
[2025-10-11 19:57:16][logger:171][INFO] [step: 1572864000] [train_eval/train_token_count: 1572864000] [train_eval/train_batch_count: 750] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5700.940] [train_eval/train_update_time: 2600.504] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.418] [train_eval/perplexity_len_2048: 82.893] [train_eval/loss_avg_len_1024: 4.433] [train_eval/perplexity_len_1024: 84.167] [train_eval/loss_avg_len_512: 4.462] [train_eval/perplexity_len_512: 86.650]
|
| 281 |
+
[2025-10-11 19:57:51][utils:57][INFO] [P: 76.00%] [S: 1593835520/2097152000] [T: 1:35:35] [ETA: 0:30:11] [loss: 4.382] [tokens/s: 306316.778] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 282 |
+
[2025-10-11 19:57:51][train:194][INFO] Running validation...
|
| 283 |
+
[2025-10-11 19:59:14][logger:171][INFO] [step: 1593835520] [val/train_token_count: 1593835520] [val/train_batch_count: 760] [val/train_flop_count: 0] [val/train_total_time: 5735.700] [val/train_update_time: 2635.129] [val/loss: 4.407] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.274] [val/val_tokens_per_second: 491871.471] [val/loss_avg_len_2048: 4.407] [val/perplexity_len_2048: 82.010] [val/loss_avg_len_1024: 4.423] [val/perplexity_len_1024: 83.385] [val/loss_avg_len_512: 4.454] [val/perplexity_len_512: 85.971]
|
| 284 |
+
[2025-10-11 19:59:49][utils:57][INFO] [P: 77.00%] [S: 1614807040/2097152000] [T: 1:37:33] [ETA: 0:29:08] [loss: 4.442] [tokens/s: 272801.241] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 285 |
+
[2025-10-11 20:00:24][utils:57][INFO] [P: 78.00%] [S: 1635778560/2097152000] [T: 1:38:08] [ETA: 0:27:40] [loss: 4.363] [tokens/s: 306495.358] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 286 |
+
[2025-10-11 20:00:24][train:194][INFO] Running validation...
|
| 287 |
+
[2025-10-11 20:01:48][logger:171][INFO] [step: 1635778560] [val/train_token_count: 1635778560] [val/train_batch_count: 780] [val/train_flop_count: 0] [val/train_total_time: 5888.469] [val/train_update_time: 2704.364] [val/loss: 4.402] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.732] [val/val_tokens_per_second: 489182.490] [val/loss_avg_len_2048: 4.402] [val/perplexity_len_2048: 81.604] [val/loss_avg_len_1024: 4.419] [val/perplexity_len_1024: 82.980] [val/loss_avg_len_512: 4.449] [val/perplexity_len_512: 85.571]
|
| 288 |
+
[2025-10-11 20:02:22][utils:57][INFO] [P: 79.00%] [S: 1656750080/2097152000] [T: 1:40:06] [ETA: 0:26:36] [loss: 4.402] [tokens/s: 272781.991] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 289 |
+
[2025-10-11 20:02:57][utils:57][INFO] [P: 80.00%] [S: 1677721600/2097152000] [T: 1:40:41] [ETA: 0:25:10] [loss: 4.374] [tokens/s: 306462.590] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 290 |
+
[2025-10-11 20:02:57][logger:171][INFO] [step: 1677721600] [train_eval/train_token_count: 1677721600] [train_eval/train_batch_count: 800] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6041.680] [train_eval/train_update_time: 2773.588] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.403] [train_eval/perplexity_len_2048: 81.721] [train_eval/loss_avg_len_1024: 4.419] [train_eval/perplexity_len_1024: 83.053] [train_eval/loss_avg_len_512: 4.448] [train_eval/perplexity_len_512: 85.471]
|
| 291 |
+
[2025-10-11 20:02:57][train:194][INFO] Running validation...
|
| 292 |
+
[2025-10-11 20:04:20][logger:171][INFO] [step: 1677721600] [val/train_token_count: 1677721600] [val/train_batch_count: 800] [val/train_flop_count: 0] [val/train_total_time: 6041.680] [val/train_update_time: 2773.588] [val/loss: 4.398] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.158] [val/val_tokens_per_second: 492554.852] [val/loss_avg_len_2048: 4.398] [val/perplexity_len_2048: 81.271] [val/loss_avg_len_1024: 4.414] [val/perplexity_len_1024: 82.636] [val/loss_avg_len_512: 4.445] [val/perplexity_len_512: 85.213]
|
| 293 |
+
[2025-10-11 20:04:20][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000001677721600.pt...
|
| 294 |
+
[2025-10-11 20:04:21][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000001677721600.pt.
|
| 295 |
+
[2025-10-11 20:04:21][logger:171][INFO] [step: 1677721600] [checkpoint/checkpoint_time: 0.451]
|
| 296 |
+
[2025-10-11 20:04:55][utils:57][INFO] [P: 81.00%] [S: 1698693120/2097152000] [T: 1:42:40] [ETA: 0:24:04] [loss: 4.353] [tokens/s: 272796.902] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 297 |
+
[2025-10-11 20:05:30][utils:57][INFO] [P: 82.00%] [S: 1719664640/2097152000] [T: 1:43:14] [ETA: 0:22:39] [loss: 4.367] [tokens/s: 306276.764] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 298 |
+
[2025-10-11 20:05:30][train:194][INFO] Running validation...
|
| 299 |
+
[2025-10-11 20:06:53][logger:171][INFO] [step: 1719664640] [val/train_token_count: 1719664640] [val/train_batch_count: 820] [val/train_flop_count: 0] [val/train_total_time: 6194.783] [val/train_update_time: 2842.821] [val/loss: 4.394] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.194] [val/val_tokens_per_second: 492344.641] [val/loss_avg_len_2048: 4.394] [val/perplexity_len_2048: 80.953] [val/loss_avg_len_1024: 4.411] [val/perplexity_len_1024: 82.322] [val/loss_avg_len_512: 4.442] [val/perplexity_len_512: 84.903]
|
| 300 |
+
[2025-10-11 20:07:28][utils:57][INFO] [P: 83.00%] [S: 1740636160/2097152000] [T: 1:45:12] [ETA: 0:21:32] [loss: 4.391] [tokens/s: 272801.266] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 301 |
+
[2025-10-11 20:08:03][utils:57][INFO] [P: 84.00%] [S: 1761607680/2097152000] [T: 1:45:47] [ETA: 0:20:09] [loss: 4.364] [tokens/s: 306302.849] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 302 |
+
[2025-10-11 20:08:03][train:194][INFO] Running validation...
|
| 303 |
+
[2025-10-11 20:09:26][logger:171][INFO] [step: 1761607680] [val/train_token_count: 1761607680] [val/train_batch_count: 840] [val/train_flop_count: 0] [val/train_total_time: 6347.454] [val/train_update_time: 2912.047] [val/loss: 4.391] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.356] [val/val_tokens_per_second: 491386.983] [val/loss_avg_len_2048: 4.391] [val/perplexity_len_2048: 80.689] [val/loss_avg_len_1024: 4.407] [val/perplexity_len_1024: 82.060] [val/loss_avg_len_512: 4.438] [val/perplexity_len_512: 84.631]
|
| 304 |
+
[2025-10-11 20:10:01][utils:57][INFO] [P: 85.00%] [S: 1782579200/2097152000] [T: 1:47:45] [ETA: 0:19:00] [loss: 4.427] [tokens/s: 272772.712] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 305 |
+
[2025-10-11 20:10:01][logger:171][INFO] [step: 1782579200] [train_eval/train_token_count: 1782579200] [train_eval/train_batch_count: 850] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6465.570] [train_eval/train_update_time: 2946.677] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.388] [train_eval/perplexity_len_2048: 80.469] [train_eval/loss_avg_len_1024: 4.399] [train_eval/perplexity_len_1024: 81.375] [train_eval/loss_avg_len_512: 4.428] [train_eval/perplexity_len_512: 83.760]
|
| 306 |
+
[2025-10-11 20:10:36][utils:57][INFO] [P: 86.00%] [S: 1803550720/2097152000] [T: 1:48:20] [ETA: 0:17:38] [loss: 4.397] [tokens/s: 306283.750] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 307 |
+
[2025-10-11 20:10:36][train:194][INFO] Running validation...
|
| 308 |
+
[2025-10-11 20:11:59][logger:171][INFO] [step: 1803550720] [val/train_token_count: 1803550720] [val/train_batch_count: 860] [val/train_flop_count: 0] [val/train_total_time: 6500.322] [val/train_update_time: 2981.297] [val/loss: 4.388] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.424] [val/val_tokens_per_second: 490983.080] [val/loss_avg_len_2048: 4.388] [val/perplexity_len_2048: 80.493] [val/loss_avg_len_1024: 4.405] [val/perplexity_len_1024: 81.864] [val/loss_avg_len_512: 4.436] [val/perplexity_len_512: 84.436]
|
| 309 |
+
[2025-10-11 20:12:34][utils:57][INFO] [P: 87.00%] [S: 1824522240/2097152000] [T: 1:50:18] [ETA: 0:16:28] [loss: 4.349] [tokens/s: 272720.654] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 310 |
+
[2025-10-11 20:13:09][utils:57][INFO] [P: 88.00%] [S: 1845493760/2097152000] [T: 1:50:53] [ETA: 0:15:07] [loss: 4.385] [tokens/s: 306431.128] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 311 |
+
[2025-10-11 20:13:09][train:194][INFO] Running validation...
|
| 312 |
+
[2025-10-11 20:14:33][logger:171][INFO] [step: 1845493760] [val/train_token_count: 1845493760] [val/train_batch_count: 880] [val/train_flop_count: 0] [val/train_total_time: 6653.225] [val/train_update_time: 3050.524] [val/loss: 4.386] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.962] [val/val_tokens_per_second: 487840.726] [val/loss_avg_len_2048: 4.386] [val/perplexity_len_2048: 80.322] [val/loss_avg_len_1024: 4.403] [val/perplexity_len_1024: 81.687] [val/loss_avg_len_512: 4.434] [val/perplexity_len_512: 84.250]
|
| 313 |
+
[2025-10-11 20:15:07][utils:57][INFO] [P: 89.00%] [S: 1866465280/2097152000] [T: 1:52:51] [ETA: 0:13:56] [loss: 4.421] [tokens/s: 272631.733] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 314 |
+
[2025-10-11 20:15:42][utils:57][INFO] [P: 90.00%] [S: 1887436800/2097152000] [T: 1:53:26] [ETA: 0:12:36] [loss: 4.349] [tokens/s: 306244.747] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 315 |
+
[2025-10-11 20:15:42][logger:171][INFO] [step: 1887436800] [train_eval/train_token_count: 1887436800] [train_eval/train_batch_count: 900] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6806.725] [train_eval/train_update_time: 3119.803] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.387] [train_eval/perplexity_len_2048: 80.438] [train_eval/loss_avg_len_1024: 4.402] [train_eval/perplexity_len_1024: 81.618] [train_eval/loss_avg_len_512: 4.431] [train_eval/perplexity_len_512: 84.016]
|
| 316 |
+
[2025-10-11 20:15:42][train:194][INFO] Running validation...
|
| 317 |
+
[2025-10-11 20:17:06][logger:171][INFO] [step: 1887436800] [val/train_token_count: 1887436800] [val/train_batch_count: 900] [val/train_flop_count: 0] [val/train_total_time: 6806.725] [val/train_update_time: 3119.803] [val/loss: 4.384] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.919] [val/val_tokens_per_second: 488087.212] [val/loss_avg_len_2048: 4.384] [val/perplexity_len_2048: 80.191] [val/loss_avg_len_1024: 4.401] [val/perplexity_len_1024: 81.559] [val/loss_avg_len_512: 4.432] [val/perplexity_len_512: 84.126]
|
| 318 |
+
[2025-10-11 20:17:06][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000001887436800.pt...
|
| 319 |
+
[2025-10-11 20:17:07][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000001887436800.pt.
|
| 320 |
+
[2025-10-11 20:17:07][logger:171][INFO] [step: 1887436800] [checkpoint/checkpoint_time: 0.446]
|
| 321 |
+
[2025-10-11 20:17:41][utils:57][INFO] [P: 91.00%] [S: 1908408320/2097152000] [T: 1:55:25] [ETA: 0:11:24] [loss: 4.378] [tokens/s: 272346.234] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 322 |
+
[2025-10-11 20:18:16][utils:57][INFO] [P: 92.00%] [S: 1929379840/2097152000] [T: 1:56:00] [ETA: 0:10:05] [loss: 4.415] [tokens/s: 305712.940] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 323 |
+
[2025-10-11 20:18:16][train:194][INFO] Running validation...
|
| 324 |
+
[2025-10-11 20:19:40][logger:171][INFO] [step: 1929379840] [val/train_token_count: 1929379840] [val/train_batch_count: 920] [val/train_flop_count: 0] [val/train_total_time: 6960.597] [val/train_update_time: 3189.048] [val/loss: 4.383] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.851] [val/val_tokens_per_second: 488483.649] [val/loss_avg_len_2048: 4.383] [val/perplexity_len_2048: 80.099] [val/loss_avg_len_1024: 4.400] [val/perplexity_len_1024: 81.460] [val/loss_avg_len_512: 4.431] [val/perplexity_len_512: 84.019]
|
| 325 |
+
[2025-10-11 20:20:15][utils:57][INFO] [P: 93.00%] [S: 1950351360/2097152000] [T: 1:57:59] [ETA: 0:08:52] [loss: 4.391] [tokens/s: 272110.105] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 326 |
+
[2025-10-11 20:20:49][utils:57][INFO] [P: 94.00%] [S: 1971322880/2097152000] [T: 1:58:33] [ETA: 0:07:34] [loss: 4.346] [tokens/s: 305483.227] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 327 |
+
[2025-10-11 20:20:49][train:194][INFO] Running validation...
|
| 328 |
+
[2025-10-11 20:22:13][logger:171][INFO] [step: 1971322880] [val/train_token_count: 1971322880] [val/train_batch_count: 940] [val/train_flop_count: 0] [val/train_total_time: 7113.937] [val/train_update_time: 3258.286] [val/loss: 4.382] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.208] [val/val_tokens_per_second: 492257.510] [val/loss_avg_len_2048: 4.382] [val/perplexity_len_2048: 80.033] [val/loss_avg_len_1024: 4.399] [val/perplexity_len_1024: 81.396] [val/loss_avg_len_512: 4.430] [val/perplexity_len_512: 83.957]
|
| 329 |
+
[2025-10-11 20:22:47][utils:57][INFO] [P: 95.00%] [S: 1992294400/2097152000] [T: 2:00:31] [ETA: 0:06:20] [loss: 4.382] [tokens/s: 272162.913] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 330 |
+
[2025-10-11 20:22:47][logger:171][INFO] [step: 1992294400] [train_eval/train_token_count: 1992294400] [train_eval/train_batch_count: 950] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7231.903] [train_eval/train_update_time: 3292.912] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.379] [train_eval/perplexity_len_2048: 79.785] [train_eval/loss_avg_len_1024: 4.396] [train_eval/perplexity_len_1024: 81.118] [train_eval/loss_avg_len_512: 4.424] [train_eval/perplexity_len_512: 83.395]
|
| 331 |
+
[2025-10-11 20:23:22][utils:57][INFO] [P: 96.00%] [S: 2013265920/2097152000] [T: 2:01:06] [ETA: 0:05:02] [loss: 4.385] [tokens/s: 305573.019] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 332 |
+
[2025-10-11 20:23:22][train:194][INFO] Running validation...
|
| 333 |
+
[2025-10-11 20:24:46][logger:171][INFO] [step: 2013265920] [val/train_token_count: 2013265920] [val/train_batch_count: 960] [val/train_flop_count: 0] [val/train_total_time: 7266.676] [val/train_update_time: 3327.552] [val/loss: 4.382] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.397] [val/val_tokens_per_second: 491146.265] [val/loss_avg_len_2048: 4.382] [val/perplexity_len_2048: 79.997] [val/loss_avg_len_1024: 4.399] [val/perplexity_len_1024: 81.359] [val/loss_avg_len_512: 4.430] [val/perplexity_len_512: 83.920]
|
| 334 |
+
[2025-10-11 20:25:20][utils:57][INFO] [P: 97.00%] [S: 2034237440/2097152000] [T: 2:03:04] [ETA: 0:03:48] [loss: 4.396] [tokens/s: 272172.238] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
|
| 335 |
+
[2025-10-11 20:25:55][utils:57][INFO] [P: 98.00%] [S: 2055208960/2097152000] [T: 2:03:39] [ETA: 0:02:31] [loss: 4.376] [tokens/s: 305840.910] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
|
| 336 |
+
[2025-10-11 20:25:55][train:194][INFO] Running validation...
|
| 337 |
+
[2025-10-11 20:27:19][logger:171][INFO] [step: 2055208960] [val/train_token_count: 2055208960] [val/train_batch_count: 980] [val/train_flop_count: 0] [val/train_total_time: 7419.519] [val/train_update_time: 3396.798] [val/loss: 4.382] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 84.220] [val/val_tokens_per_second: 486346.384] [val/loss_avg_len_2048: 4.382] [val/perplexity_len_2048: 79.981] [val/loss_avg_len_1024: 4.399] [val/perplexity_len_1024: 81.344] [val/loss_avg_len_512: 4.430] [val/perplexity_len_512: 83.905]
|
| 338 |
+
[2025-10-11 20:27:19][train:854][INFO] Training finished with 2055208960 tokens!
|
metrics/jsonlines/checkpoint.jsonl
CHANGED
|
@@ -1,10 +1,9 @@
|
|
| 1 |
-
{"step": 209715200, "checkpoint/checkpoint_time": 0.
|
| 2 |
-
{"step": 419430400, "checkpoint/checkpoint_time": 0.
|
| 3 |
-
{"step": 629145600, "checkpoint/checkpoint_time": 0.
|
| 4 |
-
{"step": 838860800, "checkpoint/checkpoint_time": 0.
|
| 5 |
-
{"step": 1048576000, "checkpoint/checkpoint_time": 0.
|
| 6 |
-
{"step": 1258291200, "checkpoint/checkpoint_time": 0.
|
| 7 |
-
{"step": 1468006400, "checkpoint/checkpoint_time": 0.
|
| 8 |
-
{"step": 1677721600, "checkpoint/checkpoint_time": 0.
|
| 9 |
-
{"step": 1887436800, "checkpoint/checkpoint_time": 0.
|
| 10 |
-
{"step": 2097152000, "checkpoint/checkpoint_time": 0.5034480569884181}
|
|
|
|
| 1 |
+
{"step": 209715200, "checkpoint/checkpoint_time": 0.44675078699947335}
|
| 2 |
+
{"step": 419430400, "checkpoint/checkpoint_time": 0.4361007340194192}
|
| 3 |
+
{"step": 629145600, "checkpoint/checkpoint_time": 0.43971711499034427}
|
| 4 |
+
{"step": 838860800, "checkpoint/checkpoint_time": 0.44477320901933126}
|
| 5 |
+
{"step": 1048576000, "checkpoint/checkpoint_time": 0.4476856429828331}
|
| 6 |
+
{"step": 1258291200, "checkpoint/checkpoint_time": 0.4449735890084412}
|
| 7 |
+
{"step": 1468006400, "checkpoint/checkpoint_time": 0.45186209797975607}
|
| 8 |
+
{"step": 1677721600, "checkpoint/checkpoint_time": 0.45097879599779844}
|
| 9 |
+
{"step": 1887436800, "checkpoint/checkpoint_time": 0.44572298499406315}
|
|
|
metrics/jsonlines/norm.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
metrics/jsonlines/throughput.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
metrics/jsonlines/train.jsonl
CHANGED
|
@@ -1,100 +1,98 @@
|
|
| 1 |
-
{"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time":
|
| 2 |
-
{"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time":
|
| 3 |
-
{"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time":
|
| 4 |
-
{"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time":
|
| 5 |
-
{"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time":
|
| 6 |
-
{"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time":
|
| 7 |
-
{"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time":
|
| 8 |
-
{"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time":
|
| 9 |
-
{"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time":
|
| 10 |
-
{"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time":
|
| 11 |
-
{"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time":
|
| 12 |
-
{"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time":
|
| 13 |
-
{"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time":
|
| 14 |
-
{"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time":
|
| 15 |
-
{"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time":
|
| 16 |
-
{"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time":
|
| 17 |
-
{"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time":
|
| 18 |
-
{"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time":
|
| 19 |
-
{"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time":
|
| 20 |
-
{"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time":
|
| 21 |
-
{"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time":
|
| 22 |
-
{"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time":
|
| 23 |
-
{"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time":
|
| 24 |
-
{"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time":
|
| 25 |
-
{"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time":
|
| 26 |
-
{"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time":
|
| 27 |
-
{"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time":
|
| 28 |
-
{"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time":
|
| 29 |
-
{"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time":
|
| 30 |
-
{"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time":
|
| 31 |
-
{"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time":
|
| 32 |
-
{"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time":
|
| 33 |
-
{"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time":
|
| 34 |
-
{"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time":
|
| 35 |
-
{"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time":
|
| 36 |
-
{"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time":
|
| 37 |
-
{"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time":
|
| 38 |
-
{"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time":
|
| 39 |
-
{"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time":
|
| 40 |
-
{"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time":
|
| 41 |
-
{"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time":
|
| 42 |
-
{"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time":
|
| 43 |
-
{"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time":
|
| 44 |
-
{"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time":
|
| 45 |
-
{"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time":
|
| 46 |
-
{"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time":
|
| 47 |
-
{"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time":
|
| 48 |
-
{"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time":
|
| 49 |
-
{"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time":
|
| 50 |
-
{"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time":
|
| 51 |
-
{"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time":
|
| 52 |
-
{"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time":
|
| 53 |
-
{"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time":
|
| 54 |
-
{"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time":
|
| 55 |
-
{"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time":
|
| 56 |
-
{"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time":
|
| 57 |
-
{"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time":
|
| 58 |
-
{"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time":
|
| 59 |
-
{"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time":
|
| 60 |
-
{"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time":
|
| 61 |
-
{"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time":
|
| 62 |
-
{"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time":
|
| 63 |
-
{"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time":
|
| 64 |
-
{"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time":
|
| 65 |
-
{"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time":
|
| 66 |
-
{"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time":
|
| 67 |
-
{"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time":
|
| 68 |
-
{"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time":
|
| 69 |
-
{"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time":
|
| 70 |
-
{"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time":
|
| 71 |
-
{"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time":
|
| 72 |
-
{"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time":
|
| 73 |
-
{"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time":
|
| 74 |
-
{"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time":
|
| 75 |
-
{"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time":
|
| 76 |
-
{"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time":
|
| 77 |
-
{"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time":
|
| 78 |
-
{"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time":
|
| 79 |
-
{"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time":
|
| 80 |
-
{"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time":
|
| 81 |
-
{"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time":
|
| 82 |
-
{"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time":
|
| 83 |
-
{"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time":
|
| 84 |
-
{"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time":
|
| 85 |
-
{"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time":
|
| 86 |
-
{"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time":
|
| 87 |
-
{"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time":
|
| 88 |
-
{"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time":
|
| 89 |
-
{"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time":
|
| 90 |
-
{"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time":
|
| 91 |
-
{"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time":
|
| 92 |
-
{"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time":
|
| 93 |
-
{"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time":
|
| 94 |
-
{"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time":
|
| 95 |
-
{"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time":
|
| 96 |
-
{"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time":
|
| 97 |
-
{"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time":
|
| 98 |
-
{"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time":
|
| 99 |
-
{"step": 2076180480, "train/token_count": 2076180480, "train/batch_count": 990, "train/flop_count": 0, "train/total_time": 3777.7033449816518, "train/update_time": 3442.088609050028, "train/lr": 3.0458649045211895e-07, "train/loss": 4.342052936553955, "train/global_grad_norm": 0.22195476293563843}
|
| 100 |
-
{"step": 2097152000, "train/token_count": 2097152000, "train/batch_count": 1000, "train/flop_count": 0, "train/total_time": 3812.7206143867224, "train/update_time": 3476.8136685648933, "train/lr": 2.517497224463483e-09, "train/loss": 4.377318382263184, "train/global_grad_norm": 0.22657278180122375}
|
|
|
|
| 1 |
+
{"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 38.55579723298433, "train/update_time": 38.35399063504883, "train/lr": 0.0009000000000000001, "train/loss": 9.772618293762207, "train/global_grad_norm": 1.2446180582046509}
|
| 2 |
+
{"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 73.29413136799121, "train/update_time": 72.96587482298492, "train/lr": 0.0009997960964140947, "train/loss": 8.16772174835205, "train/global_grad_norm": 0.9763324856758118}
|
| 3 |
+
{"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 191.7932936270081, "train/update_time": 107.60015929004294, "train/lr": 0.0009990914580222257, "train/loss": 7.603027820587158, "train/global_grad_norm": 0.5199674367904663}
|
| 4 |
+
{"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 226.53613719300483, "train/update_time": 142.21563291802886, "train/lr": 0.0009978842768382998, "train/loss": 7.273867607116699, "train/global_grad_norm": 0.6734169721603394}
|
| 5 |
+
{"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 345.1667736689851, "train/update_time": 176.83618369704345, "train/lr": 0.0009961757683914405, "train/loss": 7.0204386711120605, "train/global_grad_norm": 0.7655511498451233}
|
| 6 |
+
{"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 379.9190621979942, "train/update_time": 211.44736236400786, "train/lr": 0.00099396765300483, "train/loss": 6.766915321350098, "train/global_grad_norm": 0.30172044038772583}
|
| 7 |
+
{"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 498.6147063600074, "train/update_time": 246.0684660130646, "train/lr": 0.0009912621540634887, "train/loss": 6.6051344871521, "train/global_grad_norm": 0.827389121055603}
|
| 8 |
+
{"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 533.3516867249855, "train/update_time": 280.6721391470637, "train/lr": 0.000988061995775515, "train/loss": 6.440863132476807, "train/global_grad_norm": 1.4310520887374878}
|
| 9 |
+
{"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 651.8656425169902, "train/update_time": 315.29572922710213, "train/lr": 0.0009843704004290394, "train/loss": 6.236248970031738, "train/global_grad_norm": 0.6141589283943176}
|
| 10 |
+
{"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 686.6137485890067, "train/update_time": 349.9022945231409, "train/lr": 0.0009801910851476522, "train/loss": 6.1219682693481445, "train/global_grad_norm": 1.1012554168701172}
|
| 11 |
+
{"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 805.6500484560092, "train/update_time": 384.52516269215266, "train/lr": 0.0009755282581475768, "train/loss": 5.997668266296387, "train/global_grad_norm": 0.7333498597145081}
|
| 12 |
+
{"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 840.405497522006, "train/update_time": 419.1452622152283, "train/lr": 0.0009703866145003512, "train/loss": 5.855091571807861, "train/global_grad_norm": 0.6099743247032166}
|
| 13 |
+
{"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 959.1070550300064, "train/update_time": 453.81502685521264, "train/lr": 0.0009647713314052896, "train/loss": 5.792905330657959, "train/global_grad_norm": 0.7265439629554749}
|
| 14 |
+
{"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 993.8301178629918, "train/update_time": 488.4094896201277, "train/lr": 0.0009586880629764817, "train/loss": 5.712881088256836, "train/global_grad_norm": 0.9564601182937622}
|
| 15 |
+
{"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 1111.7393740309926, "train/update_time": 523.0301243920985, "train/lr": 0.0009521429345495787, "train/loss": 5.5812788009643555, "train/global_grad_norm": 0.6320960521697998}
|
| 16 |
+
{"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 1146.4763450330065, "train/update_time": 557.6233584931178, "train/lr": 0.0009451425365140996, "train/loss": 5.537780284881592, "train/global_grad_norm": 1.0332707166671753}
|
| 17 |
+
{"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 1264.3617091320048, "train/update_time": 592.2433790900977, "train/lr": 0.000937693917677468, "train/loss": 5.429712772369385, "train/global_grad_norm": 1.0364717245101929}
|
| 18 |
+
{"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 1299.0948956199863, "train/update_time": 626.843196902104, "train/lr": 0.0009298045781674596, "train/loss": 5.393295764923096, "train/global_grad_norm": 0.912891685962677}
|
| 19 |
+
{"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 1417.3040110229922, "train/update_time": 661.6429489921429, "train/lr": 0.0009214824618802108, "train/loss": 5.366145133972168, "train/global_grad_norm": 1.039188027381897}
|
| 20 |
+
{"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 1452.0386730069877, "train/update_time": 696.2396770050982, "train/lr": 0.000912735948481387, "train/loss": 5.265153884887695, "train/global_grad_norm": 0.851448655128479}
|
| 21 |
+
{"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 1570.467384151998, "train/update_time": 730.8712040171376, "train/lr": 0.0009035738449685707, "train/loss": 5.228988170623779, "train/global_grad_norm": 1.3591398000717163}
|
| 22 |
+
{"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 1605.2216201199917, "train/update_time": 765.4900175441289, "train/lr": 0.0008940053768033609, "train/loss": 5.1907196044921875, "train/global_grad_norm": 1.3637722730636597}
|
| 23 |
+
{"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 1723.0966956730117, "train/update_time": 800.1077135011437, "train/lr": 0.0008840401786221159, "train/loss": 5.119633197784424, "train/global_grad_norm": 1.0941485166549683}
|
| 24 |
+
{"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 1757.860138426011, "train/update_time": 834.7357104731782, "train/lr": 0.0008736882845346905, "train/loss": 5.059532642364502, "train/global_grad_norm": 0.9115816950798035}
|
| 25 |
+
{"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 1875.7684185520047, "train/update_time": 869.3637212922331, "train/lr": 0.0008629601180209381, "train/loss": 5.0564866065979, "train/global_grad_norm": 0.9306532740592957}
|
| 26 |
+
{"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 1910.5157937250042, "train/update_time": 903.9701039092615, "train/lr": 0.0008518664814351503, "train/loss": 5.0055952072143555, "train/global_grad_norm": 1.2287901639938354}
|
| 27 |
+
{"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 2028.4397037760064, "train/update_time": 938.5902288242651, "train/lr": 0.0008404185451290017, "train/loss": 4.980170249938965, "train/global_grad_norm": 0.8189780712127686}
|
| 28 |
+
{"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 2063.1963952730002, "train/update_time": 973.2086714253237, "train/lr": 0.0008286278362039527, "train/loss": 4.936854362487793, "train/global_grad_norm": 1.3344489336013794}
|
| 29 |
+
{"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 2181.072865936003, "train/update_time": 1007.8210269463598, "train/lr": 0.0008165062269044352, "train/loss": 4.8954973220825195, "train/global_grad_norm": 0.9711341261863708}
|
| 30 |
+
{"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 2215.8320906269946, "train/update_time": 1042.4422394274152, "train/lr": 0.0008040659226635089, "train/loss": 4.872994422912598, "train/global_grad_norm": 1.2242389917373657}
|
| 31 |
+
{"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 2334.1208405600046, "train/update_time": 1077.064158729394, "train/lr": 0.0007913194498130252, "train/loss": 4.887465476989746, "train/global_grad_norm": 1.0431791543960571}
|
| 32 |
+
{"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 2368.8792440719844, "train/update_time": 1111.6830330224184, "train/lr": 0.000778279642970672, "train/loss": 4.8178582191467285, "train/global_grad_norm": 0.9318476319313049}
|
| 33 |
+
{"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 2486.7425695779966, "train/update_time": 1146.306156033359, "train/lr": 0.0007649596321166025, "train/loss": 4.83030891418457, "train/global_grad_norm": 0.7958810329437256}
|
| 34 |
+
{"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 2521.492483378999, "train/update_time": 1180.931988580327, "train/lr": 0.0007513728293726579, "train/loss": 4.7904372215271, "train/global_grad_norm": 0.9025549292564392}
|
| 35 |
+
{"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 2639.4180885159876, "train/update_time": 1215.5536740703392, "train/lr": 0.0007375329154974975, "train/loss": 4.776428699493408, "train/global_grad_norm": 0.9148038625717163}
|
| 36 |
+
{"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 2674.1708125350124, "train/update_time": 1250.16888089836, "train/lr": 0.0007234538261112341, "train/loss": 4.702690601348877, "train/global_grad_norm": 0.863443911075592}
|
| 37 |
+
{"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 2792.2849394810037, "train/update_time": 1284.7876818493824, "train/lr": 0.0007091497376634464, "train/loss": 4.724478721618652, "train/global_grad_norm": 0.7980408668518066}
|
| 38 |
+
{"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 2827.0179851859866, "train/update_time": 1319.3769752274675, "train/lr": 0.0006946350531586958, "train/loss": 4.699609756469727, "train/global_grad_norm": 0.7278746962547302}
|
| 39 |
+
{"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 2945.1225905850006, "train/update_time": 1354.0406499354867, "train/lr": 0.0006799243876539214, "train/loss": 4.7111992835998535, "train/global_grad_norm": 1.1111260652542114}
|
| 40 |
+
{"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 2979.819751534, "train/update_time": 1388.5897308025742, "train/lr": 0.0006650325535423166, "train/loss": 4.619029521942139, "train/global_grad_norm": 0.6627530455589294}
|
| 41 |
+
{"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 3098.7063870650018, "train/update_time": 1423.2231051865965, "train/lr": 0.0006499745456385053, "train/loss": 4.639301300048828, "train/global_grad_norm": 0.7888188362121582}
|
| 42 |
+
{"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 3133.447895410005, "train/update_time": 1457.8368608065357, "train/lr": 0.0006347655260800339, "train/loss": 4.638882160186768, "train/global_grad_norm": 0.7990217804908752}
|
| 43 |
+
{"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 3252.0970968430047, "train/update_time": 1492.454597274569, "train/lr": 0.0006194208090603844, "train/loss": 4.6314873695373535, "train/global_grad_norm": 0.6108959913253784}
|
| 44 |
+
{"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 3286.8489470180066, "train/update_time": 1527.068307258567, "train/lr": 0.0006039558454088796, "train/loss": 4.656696319580078, "train/global_grad_norm": 0.9010829329490662}
|
| 45 |
+
{"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 3405.285965647985, "train/update_time": 1561.695067133638, "train/lr": 0.0005883862070330078, "train/loss": 4.604307651519775, "train/global_grad_norm": 0.7386929392814636}
|
| 46 |
+
{"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 3440.121589771996, "train/update_time": 1596.3996783556067, "train/lr": 0.0005727275712388317, "train/loss": 4.573764324188232, "train/global_grad_norm": 0.7288416624069214}
|
| 47 |
+
{"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 3557.970528264006, "train/update_time": 1631.0234597446106, "train/lr": 0.0005569957049452703, "train/loss": 4.603250026702881, "train/global_grad_norm": 0.7999058961868286}
|
| 48 |
+
{"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 3592.7624590400083, "train/update_time": 1665.6897427196673, "train/lr": 0.0005412064488081482, "train/loss": 4.578767776489258, "train/global_grad_norm": 0.8067259192466736}
|
| 49 |
+
{"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 3710.7506003380113, "train/update_time": 1700.3189989306557, "train/lr": 0.0005253757012699972, "train/loss": 4.573814868927002, "train/global_grad_norm": 0.7487918138504028}
|
| 50 |
+
{"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 3745.514067212993, "train/update_time": 1734.9429978527187, "train/lr": 0.0005095194025516734, "train/loss": 4.552258491516113, "train/global_grad_norm": 0.9750821590423584}
|
| 51 |
+
{"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 3864.004582177993, "train/update_time": 1769.5580677387188, "train/lr": 0.0004936535186019053, "train/loss": 4.5562920570373535, "train/global_grad_norm": 0.6866552233695984}
|
| 52 |
+
{"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 3898.7206719240057, "train/update_time": 1804.14956625973, "train/lr": 0.00047779402502093696, "train/loss": 4.551402568817139, "train/global_grad_norm": 0.787746250629425}
|
| 53 |
+
{"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 4016.6126302230114, "train/update_time": 1838.7692771856673, "train/lr": 0.0004619568909744525, "train/loss": 4.5086894035339355, "train/global_grad_norm": 0.8107492923736572}
|
| 54 |
+
{"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 4051.3472030170087, "train/update_time": 1873.3693814776198, "train/lr": 0.00044615806311398067, "train/loss": 4.518401622772217, "train/global_grad_norm": 0.5787344574928284}
|
| 55 |
+
{"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 4169.554661403992, "train/update_time": 1907.990586347587, "train/lr": 0.0004304134495199673, "train/loss": 4.477138042449951, "train/global_grad_norm": 0.7278950214385986}
|
| 56 |
+
{"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 4204.282005440997, "train/update_time": 1942.5947593615856, "train/lr": 0.0004147389036836882, "train/loss": 4.5172038078308105, "train/global_grad_norm": 1.0125855207443237}
|
| 57 |
+
{"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 4323.021760789008, "train/update_time": 1977.2172263615066, "train/lr": 0.0003991502085441259, "train/loss": 4.482260704040527, "train/global_grad_norm": 0.6572685241699219}
|
| 58 |
+
{"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 4357.762582869007, "train/update_time": 2011.834883902513, "train/lr": 0.0003836630605958888, "train/loss": 4.521332740783691, "train/global_grad_norm": 0.5961792469024658}
|
| 59 |
+
{"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 4475.897989705001, "train/update_time": 2046.4480303055025, "train/lr": 0.00036829305408417155, "train/loss": 4.499192714691162, "train/global_grad_norm": 0.6105663180351257}
|
| 60 |
+
{"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 4510.643669799989, "train/update_time": 2081.064080615528, "train/lr": 0.000353055665302672, "train/loss": 4.497852802276611, "train/global_grad_norm": 0.5793102979660034}
|
| 61 |
+
{"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 4629.746722721, "train/update_time": 2115.698471894517, "train/lr": 0.0003379662370102746, "train/loss": 4.479738235473633, "train/global_grad_norm": 0.5910903811454773}
|
| 62 |
+
{"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 4664.52624743199, "train/update_time": 2150.3571494565113, "train/lr": 0.00032303996298219405, "train/loss": 4.459190845489502, "train/global_grad_norm": 0.8015880584716797}
|
| 63 |
+
{"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 4782.869420437986, "train/update_time": 2184.9856218354835, "train/lr": 0.00030829187271113034, "train/loss": 4.47064733505249, "train/global_grad_norm": 0.5343803763389587}
|
| 64 |
+
{"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 4817.616289013007, "train/update_time": 2219.6062358425115, "train/lr": 0.0002937368162738445, "train/loss": 4.469577312469482, "train/global_grad_norm": 0.5645431280136108}
|
| 65 |
+
{"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 4935.988276930002, "train/update_time": 2254.239767934516, "train/lr": 0.0002793894493783894, "train/loss": 4.448817729949951, "train/global_grad_norm": 0.5171424150466919}
|
| 66 |
+
{"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 4970.727602478, "train/update_time": 2288.853339902591, "train/lr": 0.00026526421860705474, "train/loss": 4.462049961090088, "train/global_grad_norm": 0.5886797308921814}
|
| 67 |
+
{"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 5089.18058301101, "train/update_time": 2323.4854529426375, "train/lr": 0.0002513753468698824, "train/loss": 4.417477607727051, "train/global_grad_norm": 0.5379060506820679}
|
| 68 |
+
{"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 5123.909539049986, "train/update_time": 2358.095918665669, "train/lr": 0.00023773681908340283, "train/loss": 4.435017108917236, "train/global_grad_norm": 0.6538751125335693}
|
| 69 |
+
{"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 5242.36110436701, "train/update_time": 2392.718186916667, "train/lr": 0.00022436236808900823, "train/loss": 4.440553665161133, "train/global_grad_norm": 0.4718034267425537}
|
| 70 |
+
{"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 5277.0870255730115, "train/update_time": 2427.330438608711, "train/lr": 0.00021126546082514682, "train/loss": 4.433934688568115, "train/global_grad_norm": 0.49807751178741455}
|
| 71 |
+
{"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 5395.478063126997, "train/update_time": 2461.9552631227416, "train/lr": 0.00019845928476725522, "train/loss": 4.4274396896362305, "train/global_grad_norm": 0.4243120551109314}
|
| 72 |
+
{"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 5430.249512759008, "train/update_time": 2496.5911681566795, "train/lr": 0.0001859567346490913, "train/loss": 4.42873477935791, "train/global_grad_norm": 0.5035505890846252}
|
| 73 |
+
{"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 5548.174287694012, "train/update_time": 2531.217117117718, "train/lr": 0.00017377039947882782, "train/loss": 4.433116912841797, "train/global_grad_norm": 0.4461386501789093}
|
| 74 |
+
{"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 5582.925107155985, "train/update_time": 2565.8480685587565, "train/lr": 0.00016191254986299043, "train/loss": 4.430774211883545, "train/global_grad_norm": 0.4564478397369385}
|
| 75 |
+
{"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 5700.939995903987, "train/update_time": 2600.5040378217527, "train/lr": 0.00015039512565099468, "train/loss": 4.417412757873535, "train/global_grad_norm": 0.41517674922943115}
|
| 76 |
+
{"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 5735.700029758998, "train/update_time": 2635.1292617027066, "train/lr": 0.00013922972391273224, "train/loss": 4.381819248199463, "train/global_grad_norm": 0.42953088879585266}
|
| 77 |
+
{"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 5853.721849002992, "train/update_time": 2669.7465154657257, "train/lr": 0.00012842758726130281, "train/loss": 4.441737651824951, "train/global_grad_norm": 0.42887547612190247}
|
| 78 |
+
{"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 5888.469226193993, "train/update_time": 2704.3640278247476, "train/lr": 0.00011799959253265679, "train/loss": 4.363431453704834, "train/global_grad_norm": 0.3946261703968048}
|
| 79 |
+
{"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 6006.954968397011, "train/update_time": 2738.9892221787595, "train/lr": 0.00010795623983354214, "train/loss": 4.401818752288818, "train/global_grad_norm": 0.3415951728820801}
|
| 80 |
+
{"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 6041.680216562003, "train/update_time": 2773.58810844674, "train/lr": 9.830764196878872e-05, "train/loss": 4.37397575378418, "train/global_grad_norm": 0.397240549325943}
|
| 81 |
+
{"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 6160.038256887987, "train/update_time": 2808.195452109765, "train/lr": 8.906351425856951e-05, "train/loss": 4.353301048278809, "train/global_grad_norm": 0.42882615327835083}
|
| 82 |
+
{"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 6194.783387269999, "train/update_time": 2842.820589903771, "train/lr": 8.02331647558977e-05, "train/loss": 4.367002487182617, "train/global_grad_norm": 0.3661589026451111}
|
| 83 |
+
{"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 6312.718072921998, "train/update_time": 2877.4299484348157, "train/lr": 7.182548487420554e-05, "train/loss": 4.391452789306641, "train/global_grad_norm": 0.31681889295578003}
|
| 84 |
+
{"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 6347.4543838310055, "train/update_time": 2912.0468749527645, "train/lr": 6.384894043444556e-05, "train/loss": 4.364299774169922, "train/global_grad_norm": 0.31542593240737915}
|
| 85 |
+
{"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 6465.570202954987, "train/update_time": 2946.6770034248184, "train/lr": 5.6311563140726166e-05, "train/loss": 4.427130699157715, "train/global_grad_norm": 0.3284689784049988}
|
| 86 |
+
{"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 6500.321636478999, "train/update_time": 2981.296865779761, "train/lr": 4.922094249306547e-05, "train/loss": 4.397003650665283, "train/global_grad_norm": 0.3037455081939697}
|
| 87 |
+
{"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 6618.489081607986, "train/update_time": 3015.9109904506768, "train/lr": 4.2584218145409916e-05, "train/loss": 4.349207878112793, "train/global_grad_norm": 0.31362658739089966}
|
| 88 |
+
{"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 6653.224929338001, "train/update_time": 3050.5236358706316, "train/lr": 3.6408072716606236e-05, "train/loss": 4.384881019592285, "train/global_grad_norm": 0.2942019999027252}
|
| 89 |
+
{"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 6771.968496516987, "train/update_time": 3085.1793869955873, "train/lr": 3.069872506157217e-05, "train/loss": 4.42058801651001, "train/global_grad_norm": 0.2945297360420227}
|
| 90 |
+
{"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 6806.725144091994, "train/update_time": 3119.802880719566, "train/lr": 2.5461924009435368e-05, "train/loss": 4.3490190505981445, "train/global_grad_norm": 0.28907138109207153}
|
| 91 |
+
{"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 6925.848634602007, "train/update_time": 3154.4264403765555, "train/lr": 2.0702942574950812e-05, "train/loss": 4.378040313720703, "train/global_grad_norm": 0.2773731052875519}
|
| 92 |
+
{"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 6960.596640536998, "train/update_time": 3189.047608219611, "train/lr": 1.642657264902142e-05, "train/loss": 4.41549015045166, "train/global_grad_norm": 0.26614734530448914}
|
| 93 |
+
{"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 7079.1912472849945, "train/update_time": 3223.6655410806416, "train/lr": 1.2637120173670358e-05, "train/loss": 4.391122341156006, "train/global_grad_norm": 0.2462288737297058}
|
| 94 |
+
{"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 7113.936619379994, "train/update_time": 3258.2859199085797, "train/lr": 9.338400806321978e-06, "train/loss": 4.345850467681885, "train/global_grad_norm": 0.24034488201141357}
|
| 95 |
+
{"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 7231.90277238001, "train/update_time": 3292.912114331586, "train/lr": 6.533736077758867e-06, "train/loss": 4.3816142082214355, "train/global_grad_norm": 0.25654977560043335}
|
| 96 |
+
{"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 7266.675624558004, "train/update_time": 3327.5515688046, "train/lr": 4.2259500476214406e-06, "train/loss": 4.384517669677734, "train/global_grad_norm": 0.25040480494499207}
|
| 97 |
+
{"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 7384.794404487009, "train/update_time": 3362.1696882025863, "train/lr": 2.417366460819359e-06, "train/loss": 4.396021366119385, "train/global_grad_norm": 0.23735737800598145}
|
| 98 |
+
{"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 7419.518831352005, "train/update_time": 3396.797874707612, "train/lr": 1.1098064077174619e-06, "train/loss": 4.376190185546875, "train/global_grad_norm": 0.22627969086170197}
|
|
|
|
|
|
metrics/jsonlines/train_data_info.jsonl
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 0, "train_data_info/vocab_size": 50277, "train_data_info/global_tokens_per_batch": 2097152, "train_data_info/local_tokens_per_batch": 2097152, "train_data_info/batch_len": 2048, "train_data_info/seq_len": 2048, "train_data_info/total_tokens":
|
|
|
|
| 1 |
+
{"step": 0, "train_data_info/vocab_size": 50277, "train_data_info/global_tokens_per_batch": 2097152, "train_data_info/local_tokens_per_batch": 2097152, "train_data_info/batch_len": 2048, "train_data_info/seq_len": 2048, "train_data_info/total_tokens": 2055208960, "train_data_info/global_batch_size": 1024, "train_data_info/local_batch_size": 1024}
|
metrics/jsonlines/train_eval.jsonl
CHANGED
|
@@ -1,20 +1,19 @@
|
|
| 1 |
-
{"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 2 |
-
{"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 3 |
-
{"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 4 |
-
{"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 5 |
-
{"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 6 |
-
{"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 7 |
-
{"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 8 |
-
{"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 9 |
-
{"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 10 |
-
{"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 11 |
-
{"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 12 |
-
{"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 13 |
-
{"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 14 |
-
{"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 15 |
-
{"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 16 |
-
{"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 17 |
-
{"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 18 |
-
{"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 19 |
-
{"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 20 |
-
{"step": 2097152000, "train_eval/train_token_count": 2097152000, "train_eval/train_batch_count": 1000, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3812.7206143867224, "train_eval/train_update_time": 3476.8136685648933, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.382946988567537, "train_eval/perplexity_len_2048": 80.07366220394087, "train_eval/loss_avg_len_1024": 4.400711219589484, "train_eval/perplexity_len_1024": 81.50881872350878, "train_eval/loss_avg_len_512": 4.429669063854925, "train_eval/perplexity_len_512": 83.90364556623013}
|
|
|
|
| 1 |
+
{"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 345.1667736689851, "train_eval/train_update_time": 176.83618369704345, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.311106187544976, "train_eval/perplexity_len_2048": 4068.8113636742964, "train_eval/loss_avg_len_1024": 8.310348202390452, "train_eval/perplexity_len_1024": 4065.7284336192884, "train_eval/loss_avg_len_512": 8.311592859712693, "train_eval/perplexity_len_512": 4070.7920228474427}
|
| 2 |
+
{"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 686.6137485890067, "train_eval/train_update_time": 349.9022945231409, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.5199917088173605, "train_eval/perplexity_len_2048": 678.5727591454385, "train_eval/loss_avg_len_1024": 6.520226818787997, "train_eval/perplexity_len_1024": 678.7323171230164, "train_eval/loss_avg_len_512": 6.528576857324952, "train_eval/perplexity_len_512": 684.4234857990103}
|
| 3 |
+
{"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1111.7393740309926, "train_eval/train_update_time": 523.0301243920985, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.841284574894999, "train_eval/perplexity_len_2048": 344.22123473765475, "train_eval/loss_avg_len_1024": 5.846098003384759, "train_eval/perplexity_len_1024": 345.8821130885547, "train_eval/loss_avg_len_512": 5.863969622435542, "train_eval/perplexity_len_512": 352.1191534491973}
|
| 4 |
+
{"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1452.0386730069877, "train_eval/train_update_time": 696.2396770050982, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.426266934516625, "train_eval/perplexity_len_2048": 227.29913709313033, "train_eval/loss_avg_len_1024": 5.4348094870870405, "train_eval/perplexity_len_1024": 229.24916918814984, "train_eval/loss_avg_len_512": 5.45689960326592, "train_eval/perplexity_len_512": 234.36965784398754}
|
| 5 |
+
{"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1875.7684185520047, "train_eval/train_update_time": 869.3637212922331, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.1507950319237175, "train_eval/perplexity_len_2048": 172.56863336563757, "train_eval/loss_avg_len_1024": 5.159889481119135, "train_eval/perplexity_len_1024": 174.1452082080109, "train_eval/loss_avg_len_512": 5.183720081899664, "train_eval/perplexity_len_512": 178.3450366054668}
|
| 6 |
+
{"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2215.8320906269946, "train_eval/train_update_time": 1042.4422394274152, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.964018276003353, "train_eval/perplexity_len_2048": 143.16792986712343, "train_eval/loss_avg_len_1024": 4.973020432846388, "train_eval/perplexity_len_1024": 144.46256855394853, "train_eval/loss_avg_len_512": 4.997246175480032, "train_eval/perplexity_len_512": 148.00501753900681}
|
| 7 |
+
{"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2639.4180885159876, "train_eval/train_update_time": 1215.5536740703392, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.823759766676467, "train_eval/perplexity_len_2048": 124.432047871971, "train_eval/loss_avg_len_1024": 4.835695269690004, "train_eval/perplexity_len_1024": 125.92610538184168, "train_eval/loss_avg_len_512": 4.86114800382129, "train_eval/perplexity_len_512": 129.17240743830268}
|
| 8 |
+
{"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2979.819751534, "train_eval/train_update_time": 1388.5897308025742, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.721651509671638, "train_eval/perplexity_len_2048": 112.35365267475348, "train_eval/loss_avg_len_1024": 4.73291551488037, "train_eval/perplexity_len_1024": 113.62635923386715, "train_eval/loss_avg_len_512": 4.758365783066256, "train_eval/perplexity_len_512": 116.55529353968126}
|
| 9 |
+
{"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3405.285965647985, "train_eval/train_update_time": 1561.695067133638, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.641861000917015, "train_eval/perplexity_len_2048": 103.73722312419866, "train_eval/loss_avg_len_1024": 4.656030186131794, "train_eval/perplexity_len_1024": 105.21755785416276, "train_eval/loss_avg_len_512": 4.684107663316537, "train_eval/perplexity_len_512": 108.2136661880101}
|
| 10 |
+
{"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3745.514067212993, "train_eval/train_update_time": 1734.9429978527187, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.580971560182552, "train_eval/perplexity_len_2048": 97.60918134794116, "train_eval/loss_avg_len_1024": 4.593989343002031, "train_eval/perplexity_len_1024": 98.88814303333918, "train_eval/loss_avg_len_512": 4.622232269574961, "train_eval/perplexity_len_512": 101.72084723154869}
|
| 11 |
+
{"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4169.554661403992, "train_eval/train_update_time": 1907.990586347587, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.528221095977534, "train_eval/perplexity_len_2048": 92.59369918490609, "train_eval/loss_avg_len_1024": 4.539339858165622, "train_eval/perplexity_len_1024": 93.628971312955, "train_eval/loss_avg_len_512": 4.565371130157291, "train_eval/perplexity_len_512": 96.09825235868954}
|
| 12 |
+
{"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4510.643669799989, "train_eval/train_update_time": 2081.064080615528, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.489654328562519, "train_eval/perplexity_len_2048": 89.09064446424271, "train_eval/loss_avg_len_1024": 4.500532126115468, "train_eval/perplexity_len_1024": 90.06504451373176, "train_eval/loss_avg_len_512": 4.527173429640897, "train_eval/perplexity_len_512": 92.49674268118629}
|
| 13 |
+
{"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4935.988276930002, "train_eval/train_update_time": 2254.239767934516, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.46553069598358, "train_eval/perplexity_len_2048": 86.96717042332567, "train_eval/loss_avg_len_1024": 4.480923175239441, "train_eval/perplexity_len_1024": 88.3161663514099, "train_eval/loss_avg_len_512": 4.508107499028265, "train_eval/perplexity_len_512": 90.74991159895295}
|
| 14 |
+
{"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5277.0870255730115, "train_eval/train_update_time": 2427.330438608711, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.4373238263875106, "train_eval/perplexity_len_2048": 84.54837251634272, "train_eval/loss_avg_len_1024": 4.452436186897358, "train_eval/perplexity_len_1024": 85.83580154594769, "train_eval/loss_avg_len_512": 4.48180135221679, "train_eval/perplexity_len_512": 88.39375763988248}
|
| 15 |
+
{"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5700.939995903987, "train_eval/train_update_time": 2600.5040378217527, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.417549327801252, "train_eval/perplexity_len_2048": 82.89289292787865, "train_eval/loss_avg_len_1024": 4.432807433758753, "train_eval/perplexity_len_1024": 84.1673798737353, "train_eval/loss_avg_len_512": 4.461872710547686, "train_eval/perplexity_len_512": 86.64962691935622}
|
| 16 |
+
{"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6041.680216562003, "train_eval/train_update_time": 2773.58810844674, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.4033073637809235, "train_eval/perplexity_len_2048": 81.72070229092803, "train_eval/loss_avg_len_1024": 4.419483624122986, "train_eval/perplexity_len_1024": 83.05338751777346, "train_eval/loss_avg_len_512": 4.448178964478576, "train_eval/perplexity_len_512": 85.4711561862065}
|
| 17 |
+
{"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6465.570202954987, "train_eval/train_update_time": 2946.6770034248184, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.387876043058186, "train_eval/perplexity_len_2048": 80.46932396633433, "train_eval/loss_avg_len_1024": 4.399068052637449, "train_eval/perplexity_len_1024": 81.37499610284682, "train_eval/loss_avg_len_512": 4.427956537717109, "train_eval/perplexity_len_512": 83.76008134396787}
|
| 18 |
+
{"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6806.725144091994, "train_eval/train_update_time": 3119.802880719566, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.387491640440776, "train_eval/perplexity_len_2048": 80.43839729210855, "train_eval/loss_avg_len_1024": 4.402047658253414, "train_eval/perplexity_len_1024": 81.61782308289827, "train_eval/loss_avg_len_512": 4.431004746926119, "train_eval/perplexity_len_512": 84.01578912267684}
|
| 19 |
+
{"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7231.90277238001, "train_eval/train_update_time": 3292.912114331586, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.379340490187296, "train_eval/perplexity_len_2048": 79.78539679771428, "train_eval/loss_avg_len_1024": 4.395910125486225, "train_eval/perplexity_len_1024": 81.11842512267636, "train_eval/loss_avg_len_512": 4.423583207665869, "train_eval/perplexity_len_512": 83.39457069504783}
|
|
|
metrics/jsonlines/val.jsonl
CHANGED
|
@@ -1,50 +1,49 @@
|
|
| 1 |
-
{"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time":
|
| 2 |
-
{"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time":
|
| 3 |
-
{"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time":
|
| 4 |
-
{"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time":
|
| 5 |
-
{"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time":
|
| 6 |
-
{"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time":
|
| 7 |
-
{"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time":
|
| 8 |
-
{"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time":
|
| 9 |
-
{"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time":
|
| 10 |
-
{"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time":
|
| 11 |
-
{"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time":
|
| 12 |
-
{"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time":
|
| 13 |
-
{"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time":
|
| 14 |
-
{"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time":
|
| 15 |
-
{"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time":
|
| 16 |
-
{"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time":
|
| 17 |
-
{"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time":
|
| 18 |
-
{"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time":
|
| 19 |
-
{"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time":
|
| 20 |
-
{"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time":
|
| 21 |
-
{"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time":
|
| 22 |
-
{"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time":
|
| 23 |
-
{"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time":
|
| 24 |
-
{"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time":
|
| 25 |
-
{"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time":
|
| 26 |
-
{"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time":
|
| 27 |
-
{"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time":
|
| 28 |
-
{"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time":
|
| 29 |
-
{"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time":
|
| 30 |
-
{"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time":
|
| 31 |
-
{"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time":
|
| 32 |
-
{"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time":
|
| 33 |
-
{"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time":
|
| 34 |
-
{"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time":
|
| 35 |
-
{"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time":
|
| 36 |
-
{"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time":
|
| 37 |
-
{"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time":
|
| 38 |
-
{"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time":
|
| 39 |
-
{"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time":
|
| 40 |
-
{"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time":
|
| 41 |
-
{"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time":
|
| 42 |
-
{"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time":
|
| 43 |
-
{"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time":
|
| 44 |
-
{"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time":
|
| 45 |
-
{"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time":
|
| 46 |
-
{"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time":
|
| 47 |
-
{"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time":
|
| 48 |
-
{"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time":
|
| 49 |
-
{"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time":
|
| 50 |
-
{"step": 2097152000, "val/train_token_count": 2097152000, "val/train_batch_count": 1000, "val/train_flop_count": 0, "val/train_total_time": 3812.7206143867224, "val/train_update_time": 3476.8136685648933, "val/loss": 4.37912868698081, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.211853195913136, "val/val_tokens_per_second": 329692.2730478253, "val/loss_avg_len_2048": 4.37912868698081, "val/perplexity_len_2048": 79.76849978432571, "val/loss_avg_len_1024": 4.394532114933245, "val/perplexity_len_1024": 81.00672005987042, "val/loss_avg_len_512": 4.422331890467555, "val/perplexity_len_512": 83.29028289665668}
|
|
|
|
| 1 |
+
{"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 73.29413136799121, "val/train_update_time": 72.96587482298492, "val/loss": 8.066625093784602, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.72365917899879, "val/val_tokens_per_second": 489228.4976750561, "val/loss_avg_len_2048": 8.066625093784602, "val/perplexity_len_2048": 3186.3300972596303, "val/loss_avg_len_1024": 8.063385806354136, "val/perplexity_len_1024": 3176.0253572442853, "val/loss_avg_len_512": 8.063155076403358, "val/perplexity_len_512": 3175.2926376033747}
|
| 2 |
+
{"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 226.53613719300483, "val/train_update_time": 142.21563291802886, "val/loss": 7.245292661319813, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.87901507099741, "val/val_tokens_per_second": 488322.3767628933, "val/loss_avg_len_2048": 7.245292661319813, "val/perplexity_len_2048": 1401.4919984598723, "val/loss_avg_len_1024": 7.242302788104816, "val/perplexity_len_1024": 1397.3079730422762, "val/loss_avg_len_512": 7.245893973641005, "val/perplexity_len_512": 1402.334986290682}
|
| 3 |
+
{"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 379.9190621979942, "val/train_update_time": 211.44736236400786, "val/loss": 6.768975664408669, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.94667539399234, "val/val_tokens_per_second": 487928.7929838769, "val/loss_avg_len_2048": 6.768975664408669, "val/perplexity_len_2048": 870.4198351684087, "val/loss_avg_len_1024": 6.765464797357005, "val/perplexity_len_1024": 867.369265054543, "val/loss_avg_len_512": 6.77120095622167, "val/perplexity_len_512": 872.3589300272525}
|
| 4 |
+
{"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 533.3516867249855, "val/train_update_time": 280.6721391470637, "val/loss": 6.397165876515117, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.74760113700177, "val/val_tokens_per_second": 489088.6359000778, "val/loss_avg_len_2048": 6.397165876515117, "val/perplexity_len_2048": 600.141749520851, "val/loss_avg_len_1024": 6.395533386002807, "val/perplexity_len_1024": 599.162823070194, "val/loss_avg_len_512": 6.406499122676719, "val/perplexity_len_512": 605.7692407307555}
|
| 5 |
+
{"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 686.6137485890067, "val/train_update_time": 349.9022945231409, "val/loss": 6.105244160742686, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.8132187850133, "val/val_tokens_per_second": 488705.726778794, "val/loss_avg_len_2048": 6.105244160742686, "val/perplexity_len_2048": 448.2020614450007, "val/loss_avg_len_1024": 6.106694530989509, "val/perplexity_len_1024": 448.85259202054465, "val/loss_avg_len_512": 6.122553787205275, "val/perplexity_len_512": 456.0278066748302}
|
| 6 |
+
{"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 840.405497522006, "val/train_update_time": 419.1452622152283, "val/loss": 5.87325499422655, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.90133277099812, "val/val_tokens_per_second": 488192.48332797043, "val/loss_avg_len_2048": 5.87325499422655, "val/perplexity_len_2048": 355.4039373228419, "val/loss_avg_len_1024": 5.876565324747097, "val/perplexity_len_1024": 356.5823912835879, "val/loss_avg_len_512": 5.895211068405025, "val/perplexity_len_512": 363.2935075883324}
|
| 7 |
+
{"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 993.8301178629918, "val/train_update_time": 488.4094896201277, "val/loss": 5.695340925325057, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.15111422500922, "val/val_tokens_per_second": 492597.1273116209, "val/loss_avg_len_2048": 5.695340925325057, "val/perplexity_len_2048": 297.4781941501539, "val/loss_avg_len_1024": 5.700811359035084, "val/perplexity_len_1024": 299.10998813246323, "val/loss_avg_len_512": 5.721176919134427, "val/perplexity_len_512": 305.26398264558094}
|
| 8 |
+
{"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 1146.4763450330065, "val/train_update_time": 557.6233584931178, "val/loss": 5.5228234780823815, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.12564284997643, "val/val_tokens_per_second": 492748.0690155242, "val/loss_avg_len_2048": 5.5228234780823815, "val/perplexity_len_2048": 250.34087223176758, "val/loss_avg_len_1024": 5.5305087140662135, "val/perplexity_len_1024": 252.27221280945514, "val/loss_avg_len_512": 5.553297413158045, "val/perplexity_len_512": 258.08717440915774}
|
| 9 |
+
{"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 1299.0948956199863, "val/train_update_time": 626.843196902104, "val/loss": 5.387703346484853, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.26296444798936, "val/val_tokens_per_second": 491935.4033519414, "val/loss_avg_len_2048": 5.387703346484853, "val/perplexity_len_2048": 218.70052899266585, "val/loss_avg_len_1024": 5.396013065259485, "val/perplexity_len_1024": 220.52544063467434, "val/loss_avg_len_512": 5.419521161240525, "val/perplexity_len_512": 225.77098871461365}
|
| 10 |
+
{"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 1452.0386730069877, "val/train_update_time": 696.2396770050982, "val/loss": 5.267773023286882, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.21875834499951, "val/val_tokens_per_second": 492196.7212030774, "val/loss_avg_len_2048": 5.267773023286882, "val/perplexity_len_2048": 193.9834843624152, "val/loss_avg_len_1024": 5.2777240617804235, "val/perplexity_len_1024": 195.92345784933923, "val/loss_avg_len_512": 5.30280932833273, "val/perplexity_len_512": 200.9004131536537}
|
| 11 |
+
{"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 1605.2216201199917, "val/train_update_time": 765.4900175441289, "val/loss": 5.168812291965983, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.11411714999122, "val/val_tokens_per_second": 492816.3999634607, "val/loss_avg_len_2048": 5.168812291965983, "val/perplexity_len_2048": 175.70602604634044, "val/loss_avg_len_1024": 5.179184607096157, "val/perplexity_len_1024": 177.53798874168763, "val/loss_avg_len_512": 5.204846120559331, "val/perplexity_len_512": 182.15284103478766}
|
| 12 |
+
{"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 1757.860138426011, "val/train_update_time": 834.7357104731782, "val/loss": 5.07917964521309, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.13903681200463, "val/val_tokens_per_second": 492668.6857416863, "val/loss_avg_len_2048": 5.07917964521309, "val/perplexity_len_2048": 160.64221824644585, "val/loss_avg_len_1024": 5.090464194297372, "val/perplexity_len_1024": 162.46525999969987, "val/loss_avg_len_512": 5.1169825646744105, "val/perplexity_len_512": 166.8312069871893}
|
| 13 |
+
{"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 1910.5157937250042, "val/train_update_time": 903.9701039092615, "val/loss": 5.006576894561109, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.17826742198667, "val/val_tokens_per_second": 492436.32104283245, "val/loss_avg_len_2048": 5.006576894561109, "val/perplexity_len_2048": 149.3924736958235, "val/loss_avg_len_1024": 5.018725027570501, "val/perplexity_len_1024": 151.21838156759458, "val/loss_avg_len_512": 5.045682267305069, "val/perplexity_len_512": 155.3502534484518}
|
| 14 |
+
{"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 2063.1963952730002, "val/train_update_time": 973.2086714253237, "val/loss": 4.939421573723574, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.11001823598053, "val/val_tokens_per_second": 492840.70524084335, "val/loss_avg_len_2048": 4.939421573723574, "val/perplexity_len_2048": 139.6894261524396, "val/loss_avg_len_1024": 4.951981827717834, "val/perplexity_len_1024": 141.45502580357572, "val/loss_avg_len_512": 4.979078589006793, "val/perplexity_len_512": 145.3404016939073}
|
| 15 |
+
{"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 2215.8320906269946, "val/train_update_time": 1042.4422394274152, "val/loss": 4.880953582810099, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.08043931302382, "val/val_tokens_per_second": 493016.17009599815, "val/loss_avg_len_2048": 4.880953582810099, "val/perplexity_len_2048": 131.75624449043048, "val/loss_avg_len_1024": 4.8935686632201545, "val/perplexity_len_1024": 133.42888819459714, "val/loss_avg_len_512": 4.920950583540276, "val/perplexity_len_512": 137.13290753355847}
|
| 16 |
+
{"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 2368.8792440719844, "val/train_update_time": 1111.6830330224184, "val/loss": 4.8325701735513285, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.09534997900482, "val/val_tokens_per_second": 492927.70305858395, "val/loss_avg_len_2048": 4.8325701735513285, "val/perplexity_len_2048": 125.53318846749659, "val/loss_avg_len_1024": 4.8456610950137025, "val/perplexity_len_1024": 127.18733712083956, "val/loss_avg_len_512": 4.873299267485551, "val/perplexity_len_512": 130.75159052334683}
|
| 17 |
+
{"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 2521.492483378999, "val/train_update_time": 1180.931988580327, "val/loss": 4.789877079297254, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.1720448200067, "val/val_tokens_per_second": 492473.1631720955, "val/loss_avg_len_2048": 4.789877079297254, "val/perplexity_len_2048": 120.28658204324753, "val/loss_avg_len_1024": 4.803124652416493, "val/perplexity_len_1024": 121.89068913738296, "val/loss_avg_len_512": 4.830635423418787, "val/perplexity_len_512": 125.29054791458057}
|
| 18 |
+
{"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 2674.1708125350124, "val/train_update_time": 1250.16888089836, "val/loss": 4.743234224908077, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.35634245700203, "val/val_tokens_per_second": 491384.3241278074, "val/loss_avg_len_2048": 4.743234224908077, "val/perplexity_len_2048": 114.80490677425529, "val/loss_avg_len_1024": 4.7571246879515705, "val/perplexity_len_1024": 116.41072706320881, "val/loss_avg_len_512": 4.7850962978590275, "val/perplexity_len_512": 119.71289062387633}
|
| 19 |
+
{"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 2827.0179851859866, "val/train_update_time": 1319.3769752274675, "val/loss": 4.708184829720389, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.28989867499331, "val/val_tokens_per_second": 491776.3216381208, "val/loss_avg_len_2048": 4.708184829720389, "val/perplexity_len_2048": 110.85076416463089, "val/loss_avg_len_1024": 4.722111135172844, "val/perplexity_len_1024": 112.40530514816416, "val/loss_avg_len_512": 4.750208974112105, "val/perplexity_len_512": 115.6084411743891}
|
| 20 |
+
{"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 2979.819751534, "val/train_update_time": 1388.5897308025742, "val/loss": 4.67620419087892, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.66028405699763, "val/val_tokens_per_second": 489599.1026290804, "val/loss_avg_len_2048": 4.67620419087892, "val/perplexity_len_2048": 107.36177335014543, "val/loss_avg_len_1024": 4.690579603694846, "val/perplexity_len_1024": 108.91628980025381, "val/loss_avg_len_512": 4.718876949522924, "val/perplexity_len_512": 112.04235276734202}
|
| 21 |
+
{"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 3133.447895410005, "val/train_update_time": 1457.8368608065357, "val/loss": 4.64510413077313, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.88997037999798, "val/val_tokens_per_second": 488258.60605818214, "val/loss_avg_len_2048": 4.64510413077313, "val/perplexity_len_2048": 104.07420254835952, "val/loss_avg_len_1024": 4.659685420518555, "val/perplexity_len_1024": 105.60285643905125, "val/loss_avg_len_512": 4.688136402057856, "val/perplexity_len_512": 108.65051015156381}
|
| 22 |
+
{"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 3286.8489470180066, "val/train_update_time": 1527.068307258567, "val/loss": 4.622256983703119, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.67413021399989, "val/val_tokens_per_second": 489518.0851625608, "val/loss_avg_len_2048": 4.622256983703119, "val/perplexity_len_2048": 101.72336120466873, "val/loss_avg_len_1024": 4.637227634538524, "val/perplexity_len_1024": 103.25768236429546, "val/loss_avg_len_512": 4.66592828974668, "val/perplexity_len_512": 106.26418340294687}
|
| 23 |
+
{"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 3440.121589771996, "val/train_update_time": 1596.3996783556067, "val/loss": 4.594907666088967, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.08767745201476, "val/val_tokens_per_second": 492973.2212536021, "val/loss_avg_len_2048": 4.594907666088967, "val/perplexity_len_2048": 98.97899600791816, "val/loss_avg_len_1024": 4.609972692025034, "val/perplexity_len_1024": 100.48140565521302, "val/loss_avg_len_512": 4.6386030704602605, "val/perplexity_len_512": 103.39980440729035}
|
| 24 |
+
{"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 3592.7624590400083, "val/train_update_time": 1665.6897427196673, "val/loss": 4.575895742433099, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.21338147099596, "val/val_tokens_per_second": 492228.52473885607, "val/loss_avg_len_2048": 4.575895742433099, "val/perplexity_len_2048": 97.1149902058559, "val/loss_avg_len_1024": 4.590654579546535, "val/perplexity_len_1024": 98.55892370715293, "val/loss_avg_len_512": 4.619258114893082, "val/perplexity_len_512": 101.41876314259017}
|
| 25 |
+
{"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 3745.514067212993, "val/train_update_time": 1734.9429978527187, "val/loss": 4.556744415998249, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.29024019200006, "val/val_tokens_per_second": 491774.30519565445, "val/loss_avg_len_2048": 4.556744415998249, "val/perplexity_len_2048": 95.27280576944895, "val/loss_avg_len_1024": 4.572066484152572, "val/perplexity_len_1024": 96.74382292667758, "val/loss_avg_len_512": 4.601123036310449, "val/perplexity_len_512": 99.59610289954561}
|
| 26 |
+
{"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 3898.7206719240057, "val/train_update_time": 1804.14956625973, "val/loss": 4.5398546514194225, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.14271029800875, "val/val_tokens_per_second": 492646.9182107115, "val/loss_avg_len_2048": 4.5398546514194225, "val/perplexity_len_2048": 93.67718328428124, "val/loss_avg_len_1024": 4.555266555744037, "val/perplexity_len_1024": 95.1321098665439, "val/loss_avg_len_512": 4.584480615995732, "val/perplexity_len_512": 97.95229907070892}
|
| 27 |
+
{"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 4051.3472030170087, "val/train_update_time": 1873.3693814776198, "val/loss": 4.519077338180738, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.45039679700858, "val/val_tokens_per_second": 490830.50017885934, "val/loss_avg_len_2048": 4.519077338180738, "val/perplexity_len_2048": 91.75090385627789, "val/loss_avg_len_1024": 4.534679728145711, "val/perplexity_len_1024": 93.19366321696087, "val/loss_avg_len_512": 4.56380435053194, "val/perplexity_len_512": 95.94780546420296}
|
| 28 |
+
{"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 4204.282005440997, "val/train_update_time": 1942.5947593615856, "val/loss": 4.504149057779786, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.98285058300826, "val/val_tokens_per_second": 487718.6201189412, "val/loss_avg_len_2048": 4.504149057779786, "val/perplexity_len_2048": 90.39139346022084, "val/loss_avg_len_1024": 4.519979343756846, "val/perplexity_len_1024": 91.83370101930686, "val/loss_avg_len_512": 4.549490065394714, "val/perplexity_len_512": 94.58416427684044}
|
| 29 |
+
{"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 4357.762582869007, "val/train_update_time": 2011.834883902513, "val/loss": 4.487737312592589, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.38956990098814, "val/val_tokens_per_second": 491188.52691809647, "val/loss_avg_len_2048": 4.487737312592589, "val/perplexity_len_2048": 88.9200198733193, "val/loss_avg_len_1024": 4.503381963831792, "val/perplexity_len_1024": 90.32208135719293, "val/loss_avg_len_512": 4.532881353686472, "val/perplexity_len_512": 93.02621672414543}
|
| 30 |
+
{"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 4510.643669799989, "val/train_update_time": 2081.064080615528, "val/loss": 4.477823902255832, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.90338259798591, "val/val_tokens_per_second": 488180.5563937209, "val/loss_avg_len_2048": 4.477823902255832, "val/perplexity_len_2048": 88.0428741653243, "val/loss_avg_len_1024": 4.493580668732151, "val/perplexity_len_1024": 89.44113225722718, "val/loss_avg_len_512": 4.523143095735833, "val/perplexity_len_512": 92.12470015446381}
|
| 31 |
+
{"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 4664.52624743199, "val/train_update_time": 2150.3571494565113, "val/loss": 4.464658180295001, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.58210265901289, "val/val_tokens_per_second": 490057.0660097311, "val/loss_avg_len_2048": 4.464658180295001, "val/perplexity_len_2048": 86.89132329645581, "val/loss_avg_len_1024": 4.480764351781691, "val/perplexity_len_1024": 88.30214078631853, "val/loss_avg_len_512": 4.510652843934484, "val/perplexity_len_512": 90.98119564814961}
|
| 32 |
+
{"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 4817.616289013007, "val/train_update_time": 2219.6062358425115, "val/loss": 4.4529901164986425, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.61640725701, "val/val_tokens_per_second": 489856.0144314992, "val/loss_avg_len_2048": 4.4529901164986425, "val/perplexity_len_2048": 85.88336170854886, "val/loss_avg_len_1024": 4.469235860132379, "val/perplexity_len_1024": 87.28999575640728, "val/loss_avg_len_512": 4.499290181878302, "val/perplexity_len_512": 89.95325818129203}
|
| 33 |
+
{"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 4970.727602478, "val/train_update_time": 2288.853339902591, "val/loss": 4.443838785698568, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.69381960498868, "val/val_tokens_per_second": 489402.9235769104, "val/loss_avg_len_2048": 4.443838785698568, "val/perplexity_len_2048": 85.10099994102198, "val/loss_avg_len_1024": 4.460096235682024, "val/perplexity_len_1024": 86.49583268126037, "val/loss_avg_len_512": 4.49035777461771, "val/perplexity_len_512": 89.15333697445978}
|
| 34 |
+
{"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 5123.909539049986, "val/train_update_time": 2358.095918665669, "val/loss": 4.434908325502579, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.67913438100368, "val/val_tokens_per_second": 489488.8110756854, "val/loss_avg_len_2048": 4.434908325502579, "val/perplexity_len_2048": 84.3443923040809, "val/loss_avg_len_1024": 4.451088077992527, "val/perplexity_len_1024": 85.72016350138246, "val/loss_avg_len_512": 4.481291017688159, "val/perplexity_len_512": 88.34865876197952}
|
| 35 |
+
{"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 5277.0870255730115, "val/train_update_time": 2427.330438608711, "val/loss": 4.426706585733639, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.18017689298722, "val/val_tokens_per_second": 492425.01675243815, "val/loss_avg_len_2048": 4.426706585733639, "val/perplexity_len_2048": 83.65545066943865, "val/loss_avg_len_1024": 4.443149887683336, "val/perplexity_len_1024": 85.04239422006852, "val/loss_avg_len_512": 4.473612986021303, "val/perplexity_len_512": 87.67291248266899}
|
| 36 |
+
{"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 5430.249512759008, "val/train_update_time": 2496.5911681566795, "val/loss": 4.419161863370403, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.169069185009, "val/val_tokens_per_second": 492490.78294822294, "val/loss_avg_len_2048": 4.419161863370403, "val/perplexity_len_2048": 83.02666849609794, "val/loss_avg_len_1024": 4.4357142435611685, "val/perplexity_len_1024": 84.41239437134591, "val/loss_avg_len_512": 4.466251606992445, "val/perplexity_len_512": 87.02988861829601}
|
| 37 |
+
{"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 5582.925107155985, "val/train_update_time": 2565.8480685587565, "val/loss": 4.412361895313021, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.22933418498724, "val/val_tokens_per_second": 492134.17842513864, "val/loss_avg_len_2048": 4.412361895313021, "val/perplexity_len_2048": 82.46400501736277, "val/loss_avg_len_1024": 4.428987180554634, "val/perplexity_len_1024": 83.8464525731744, "val/loss_avg_len_512": 4.459533594012447, "val/perplexity_len_512": 86.44718020978645}
|
| 38 |
+
{"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 5735.700029758998, "val/train_update_time": 2635.1292617027066, "val/loss": 4.406846508690622, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.27378682699054, "val/val_tokens_per_second": 491871.4707318212, "val/loss_avg_len_2048": 4.406846508690622, "val/perplexity_len_2048": 82.0104361010047, "val/loss_avg_len_1024": 4.423472210842371, "val/perplexity_len_1024": 83.38531467630756, "val/loss_avg_len_512": 4.45401080738604, "val/perplexity_len_512": 85.97106682551633}
|
| 39 |
+
{"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 5888.469226193993, "val/train_update_time": 2704.3640278247476, "val/loss": 4.401876839681971, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.7315334439918, "val/val_tokens_per_second": 489182.48974143335, "val/loss_avg_len_2048": 4.401876839681971, "val/perplexity_len_2048": 81.60388243364973, "val/loss_avg_len_1024": 4.418596094172914, "val/perplexity_len_1024": 82.97970785018886, "val/loss_avg_len_512": 4.449346278101858, "val/perplexity_len_512": 85.5709860862857}
|
| 40 |
+
{"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 6041.680216562003, "val/train_update_time": 2773.58810844674, "val/loss": 4.3977887128185715, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.1582510540029, "val/val_tokens_per_second": 492554.85151317826, "val/loss_avg_len_2048": 4.3977887128185715, "val/perplexity_len_2048": 81.27095639533198, "val/loss_avg_len_1024": 4.414443706510216, "val/perplexity_len_1024": 82.63585832737921, "val/loss_avg_len_512": 4.445159159805254, "val/perplexity_len_512": 85.21343931247905}
|
| 41 |
+
{"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 6194.783387269999, "val/train_update_time": 2842.820589903771, "val/loss": 4.393867793466989, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.19375617799233, "val/val_tokens_per_second": 492344.6407728776, "val/loss_avg_len_2048": 4.393867793466989, "val/perplexity_len_2048": 80.95292342793084, "val/loss_avg_len_1024": 4.410636982592754, "val/perplexity_len_1024": 82.32188441420018, "val/loss_avg_len_512": 4.441507733502612, "val/perplexity_len_512": 84.90285609977245}
|
| 42 |
+
{"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 6347.4543838310055, "val/train_update_time": 2912.0468749527645, "val/loss": 4.390602651420748, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.35589144201367, "val/val_tokens_per_second": 491386.9828684362, "val/loss_avg_len_2048": 4.390602651420748, "val/perplexity_len_2048": 80.68903169033332, "val/loss_avg_len_1024": 4.407449438422453, "val/perplexity_len_1024": 82.05989754063832, "val/loss_avg_len_512": 4.438302557166107, "val/perplexity_len_512": 84.6311631190514}
|
| 43 |
+
{"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 6500.321636478999, "val/train_update_time": 2981.296865779761, "val/loss": 4.388172285466153, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.42446340899915, "val/val_tokens_per_second": 490983.0800971214, "val/loss_avg_len_2048": 4.388172285466153, "val/perplexity_len_2048": 80.49316592395871, "val/loss_avg_len_1024": 4.405054118391453, "val/perplexity_len_1024": 81.8635730481807, "val/loss_avg_len_512": 4.43599369634632, "val/perplexity_len_512": 84.43598694640616}
|
| 44 |
+
{"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 6653.224929338001, "val/train_update_time": 3050.5236358706316, "val/loss": 4.386037354692467, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.96182978598517, "val/val_tokens_per_second": 487840.72601091646, "val/loss_avg_len_2048": 4.386037354692467, "val/perplexity_len_2048": 80.32150189756531, "val/loss_avg_len_1024": 4.402890470500942, "val/perplexity_len_1024": 81.68664057985016, "val/loss_avg_len_512": 4.433789289280586, "val/perplexity_len_512": 84.25006066407553}
|
| 45 |
+
{"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 6806.725144091994, "val/train_update_time": 3119.802880719566, "val/loss": 4.384412398562184, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.91942880197894, "val/val_tokens_per_second": 488087.2115639818, "val/loss_avg_len_2048": 4.384412398562184, "val/perplexity_len_2048": 80.1910889670049, "val/loss_avg_len_1024": 4.401321436325087, "val/perplexity_len_1024": 81.55857194737422, "val/loss_avg_len_512": 4.432314658900629, "val/perplexity_len_512": 84.12591452243974}
|
| 46 |
+
{"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 6960.596640536998, "val/train_update_time": 3189.047608219611, "val/loss": 4.383261151254853, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.85132250699098, "val/val_tokens_per_second": 488483.6490990946, "val/loss_avg_len_2048": 4.383261151254853, "val/perplexity_len_2048": 80.09882231281891, "val/loss_avg_len_1024": 4.400117161350698, "val/perplexity_len_1024": 81.46041211780742, "val/loss_avg_len_512": 4.431040566734597, "val/perplexity_len_512": 84.0187986060516}
|
| 47 |
+
{"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 7113.936619379994, "val/train_update_time": 3258.2859199085797, "val/loss": 4.382433253489226, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.20848171401303, "val/val_tokens_per_second": 492257.5097666033, "val/loss_avg_len_2048": 4.382433253489226, "val/perplexity_len_2048": 80.03253611967847, "val/loss_avg_len_1024": 4.399324055639003, "val/perplexity_len_1024": 81.39583101288655, "val/loss_avg_len_512": 4.430300414302852, "val/perplexity_len_512": 83.9566348960995}
|
| 48 |
+
{"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 7266.675624558004, "val/train_update_time": 3327.5515688046, "val/loss": 4.381986520714452, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.39674543499132, "val/val_tokens_per_second": 491146.2645976847, "val/loss_avg_len_2048": 4.381986520714452, "val/perplexity_len_2048": 79.99679094761, "val/loss_avg_len_1024": 4.398876096244017, "val/perplexity_len_1024": 81.35937715120615, "val/loss_avg_len_512": 4.429859576323722, "val/perplexity_len_512": 83.91963177962609}
|
| 49 |
+
{"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 7419.518831352005, "val/train_update_time": 3396.797874707612, "val/loss": 4.3817854977080835, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 84.2198099019879, "val/val_tokens_per_second": 486346.38391689357, "val/loss_avg_len_2048": 4.3817854977080835, "val/perplexity_len_2048": 79.98071136843066, "val/loss_avg_len_1024": 4.398686848510337, "val/perplexity_len_1024": 81.34398153030583, "val/loss_avg_len_512": 4.4296826414794666, "val/perplexity_len_512": 83.90478478616119}
|
|
|
metrics/npz/train_eval/step-000000104857600.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3a51d4164dc4390387d6c1f28c29dae3b0409dbb0029030a79fd2f136c3f09a9
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000209715200.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c26aa18e5615d75520cc77b9a6e6759e29f757fb91b9f6477fbcbf5b62f1a7b
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000314572800.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:090974fb978109d91b3ef21c3340b1831246b4958cd4a39650e67232878f33ce
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000419430400.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c22e0ff82e135f1c1073aa676296905782a11112794e04b311fb947e963c80a6
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000524288000.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8b1740d1b697ea2ad755ac9078c7101d8bae35c85f72e1805bb7dcd3521193a0
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000629145600.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:477545ebd381fa88a8679dcbae5f2ec198d76fd796eeb2ba0da1af34cf6745b1
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000734003200.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0e598ddac14dbc0bf035727e4fb9df3bbb920f4094dcb20593c9f3ed4b8e76b8
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000838860800.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c6404280af516e7df0d4be476e4cf7bda1360a9c5041a2d1337be8aec15e1c4d
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000943718400.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a63e4c563cfbe723eb97b8cc7a222f56e7f9cde5da3ded66fc315b34474b3f6c
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001048576000.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:db2d799619ba478ffb8786553e755d9ecb3494bdd381f91242c3e61221490935
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001153433600.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0977e97fa4947a5aaaa298256d5fa703442176c676c051c418ba57ca073f51a1
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001258291200.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:97eb10f5b7dfe6acc615229f78746d04cf64204f7fdd3931988f54d1306027bf
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001363148800.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:570a43ae3248f83ba6e02b684a5818cb77228e3d4ef091614c44d811e21f4eb7
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001468006400.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b53251c79c14aa7ef820f81660df31e266896dc496afd464579aa2b5cc8a8ad7
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001572864000.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4ede060d6102a48fdae1b6cfdd10408c952adcd49abeed642b89355fe2f5dc86
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001677721600.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c0960585fca32d99fe37c5872ed0a9b35642f905b3a5ebc6e66259c9672240f9
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001782579200.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a2edca23008bccebd5889eaa267684fc44029c93c3b03556af684ec08cbdeebb
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001887436800.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dfe593ddd3f47a00cc4ef99beb62b100f2b2210361fe4a870d55ae10580758aa
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001992294400.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b815f2619795c6ebcfafdf3be4ff18328df4ae04100471bacc90374f84c70d24
|
| 3 |
size 20540
|
metrics/npz/val/step-000000041943040.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:335edce24f2126c24a463d4572aeec86b71d9b561adcd09bc3441a009de956c0
|
| 3 |
size 21142
|
metrics/npz/val/step-000000083886080.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0dd23142abe5f7181034e5f085d2ca167c68a5bce942b91ec27f3b1700f8dae1
|
| 3 |
size 21142
|
metrics/npz/val/step-000000125829120.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:17b5b83ca4e4583cfa0729d4e2064353a3950f7dca953ed87883f8e2435e2674
|
| 3 |
size 21142
|
metrics/npz/val/step-000000167772160.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0fe685bbece2ec360cbbad3e38efb200d7be826f8b520629ed5e2d9b7ae53fc9
|
| 3 |
size 21142
|
metrics/npz/val/step-000000209715200.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:44e89cda40363cdecf06af5c26dca9a0c739c8b55501bf57d69399306d3a7ee5
|
| 3 |
size 21142
|
metrics/npz/val/step-000000251658240.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8ee710b87975ced627a329473c3cd1eb5320c76ec9e93720ae98401078caf4b5
|
| 3 |
size 21142
|
metrics/npz/val/step-000000293601280.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3df282db83c775378e697e10a8c6399e8b8b34073def39031df797483681bc21
|
| 3 |
size 21142
|
metrics/npz/val/step-000000335544320.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:36c64e84ddedb00766216c9b3cee5c66be139f2ead3493e6c1e97c88391fde65
|
| 3 |
size 21142
|
metrics/npz/val/step-000000377487360.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:926fa1d5c599f6e300195d68e2d8534512f27ca9bc643addef9693fe87bb4521
|
| 3 |
size 21142
|
metrics/npz/val/step-000000419430400.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:591381c3c1404825232d0b329093400130055d7a8db1bb43c0cdae6dcdde7bdf
|
| 3 |
size 21142
|
metrics/npz/val/step-000000461373440.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9782a956b1ee686d4cbbe9e0818204b1b7ca1247440c498e6c029b9ce6be2bd3
|
| 3 |
size 21142
|