Lanni-ni commited on
Commit
5238c35
·
verified ·
1 Parent(s): 6a486a0

add remote code + model files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. __pycache__/__init__.cpython-310.pyc +0 -0
  2. __pycache__/configuration_transformer.cpython-310.pyc +0 -0
  3. __pycache__/modeling_transformer.cpython-310.pyc +0 -0
  4. checkpoints/step-000000209715200.pt +1 -1
  5. checkpoints/step-000000419430400.pt +1 -1
  6. checkpoints/step-000000629145600.pt +1 -1
  7. checkpoints/step-000000838860800.pt +1 -1
  8. checkpoints/step-000001048576000.pt +1 -1
  9. checkpoints/step-000001258291200.pt +1 -1
  10. checkpoints/step-000001468006400.pt +1 -1
  11. checkpoints/step-000001677721600.pt +1 -1
  12. checkpoints/step-000001887436800.pt +1 -1
  13. logs/2025-10-11_18-22-13.log +338 -0
  14. metrics/jsonlines/checkpoint.jsonl +9 -10
  15. metrics/jsonlines/norm.jsonl +0 -0
  16. metrics/jsonlines/throughput.jsonl +0 -0
  17. metrics/jsonlines/train.jsonl +98 -100
  18. metrics/jsonlines/train_data_info.jsonl +1 -1
  19. metrics/jsonlines/train_eval.jsonl +19 -20
  20. metrics/jsonlines/val.jsonl +49 -50
  21. metrics/npz/train_eval/step-000000104857600.npz +1 -1
  22. metrics/npz/train_eval/step-000000209715200.npz +1 -1
  23. metrics/npz/train_eval/step-000000314572800.npz +1 -1
  24. metrics/npz/train_eval/step-000000419430400.npz +1 -1
  25. metrics/npz/train_eval/step-000000524288000.npz +1 -1
  26. metrics/npz/train_eval/step-000000629145600.npz +1 -1
  27. metrics/npz/train_eval/step-000000734003200.npz +1 -1
  28. metrics/npz/train_eval/step-000000838860800.npz +1 -1
  29. metrics/npz/train_eval/step-000000943718400.npz +1 -1
  30. metrics/npz/train_eval/step-000001048576000.npz +1 -1
  31. metrics/npz/train_eval/step-000001153433600.npz +1 -1
  32. metrics/npz/train_eval/step-000001258291200.npz +1 -1
  33. metrics/npz/train_eval/step-000001363148800.npz +1 -1
  34. metrics/npz/train_eval/step-000001468006400.npz +1 -1
  35. metrics/npz/train_eval/step-000001572864000.npz +1 -1
  36. metrics/npz/train_eval/step-000001677721600.npz +1 -1
  37. metrics/npz/train_eval/step-000001782579200.npz +1 -1
  38. metrics/npz/train_eval/step-000001887436800.npz +1 -1
  39. metrics/npz/train_eval/step-000001992294400.npz +1 -1
  40. metrics/npz/val/step-000000041943040.npz +1 -1
  41. metrics/npz/val/step-000000083886080.npz +1 -1
  42. metrics/npz/val/step-000000125829120.npz +1 -1
  43. metrics/npz/val/step-000000167772160.npz +1 -1
  44. metrics/npz/val/step-000000209715200.npz +1 -1
  45. metrics/npz/val/step-000000251658240.npz +1 -1
  46. metrics/npz/val/step-000000293601280.npz +1 -1
  47. metrics/npz/val/step-000000335544320.npz +1 -1
  48. metrics/npz/val/step-000000377487360.npz +1 -1
  49. metrics/npz/val/step-000000419430400.npz +1 -1
  50. metrics/npz/val/step-000000461373440.npz +1 -1
__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/__pycache__/__init__.cpython-310.pyc and b/__pycache__/__init__.cpython-310.pyc differ
 
__pycache__/configuration_transformer.cpython-310.pyc CHANGED
Binary files a/__pycache__/configuration_transformer.cpython-310.pyc and b/__pycache__/configuration_transformer.cpython-310.pyc differ
 
__pycache__/modeling_transformer.cpython-310.pyc CHANGED
Binary files a/__pycache__/modeling_transformer.cpython-310.pyc and b/__pycache__/modeling_transformer.cpython-310.pyc differ
 
checkpoints/step-000000209715200.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9739812f9e9f23ba59c80b35b6247d42c68788e4d77f455439f4d1d5dd54c157
3
  size 329410498
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea89f778ededec7472baf7a219f883e92333236c66d9d677d49fa81988b0f7e5
3
  size 329410498
checkpoints/step-000000419430400.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:386b9f6fc01049fe38f2a2ac9887abbd35d555e2c1e7126e1d3cdfe78c3b5a1d
3
  size 329410498
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef8110a29b497f69da13e35deda4a0d785a211d9d09c3a5bbfc331b305c2ec3f
3
  size 329410498
checkpoints/step-000000629145600.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c42953cc0f47cb697e60769f612b9b99cbc549b150ecbcb50b84e559c6b980d
3
  size 329410498
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22af72d4331025e64e17433768f06c9ef0b9ba2cdcd5d62a8aab9153821f5cef
3
  size 329410498
checkpoints/step-000000838860800.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce365ae9e342ca83f9da6b6825721f2c2335500680697ac464557cdf1fe99918
3
  size 329410498
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5e9805446e6f290e9b32a69d915b404f3d389d50462e72da672d27fa6cfe2b4
3
  size 329410498
checkpoints/step-000001048576000.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:00b8c3e12f0cca094785472b02a58b0be08fae5ea6d19e896656c8f745f39fe6
3
  size 329410498
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:106cebd91c20db5c04370b9101006b7e42d23ed67530eac965dc38d2d0069a2f
3
  size 329410498
checkpoints/step-000001258291200.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b84259e0905fbb12e99f161c494fda36d5347c5a77852d6e20e30f8dfde28be1
3
  size 329410498
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:451c2d44639298ad3d2c9c793c1a90836aadb59256aed9533586b850631444ed
3
  size 329410498
checkpoints/step-000001468006400.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:49181888701780a51090129aca7a00963c74954e62fc8164858d13415bd8b067
3
  size 329410498
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58e4060950655caa8e692850b5b49c660edf35ba56ce1740eb931be72b419547
3
  size 329410498
checkpoints/step-000001677721600.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:78a3491fec667eae864537db64a778ec6db0f185ae704e9441f563acaee69756
3
  size 329410498
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bce41c761242437b5479b99e0b37de14a3e5e1d8e7e3fe68b68c9920e2214c1
3
  size 329410498
checkpoints/step-000001887436800.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb0d66f9e8d983dab816428bbec79c157b9ea2bf2f3ab1c49e6e0647fb506e30
3
  size 329410498
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b02ab5cbc5d45f4144e1f6fbe4a8b86e8616b9d02d30cfb1c7652aa28bca9aa
3
  size 329410498
logs/2025-10-11_18-22-13.log ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2025-10-11 18:22:13][train:372][INFO] All outputs will be saved to `/workspace/forgetting-transformer/transformer_2_4_256`
2
+ [2025-10-11 18:22:13][train:375][INFO] Configuration:
3
+ [2025-10-11 18:22:13][train:380][INFO] Configuration saved to /workspace/forgetting-transformer/transformer_2_4_256/config.yaml.
4
+ [2025-10-11 18:22:13][train:387][INFO] creating datamodule
5
+ [2025-10-11 18:22:13][train:419][INFO] creating model
6
+ [2025-10-11 18:22:14][train:440][INFO] creating optimizer
7
+ [2025-10-11 18:22:14][checkpoint:39][INFO] Not resuming. Deleting existing checkpoints...
8
+ [2025-10-11 18:22:14][logger:256][INFO] Setting up wandb logger...
9
+ [2025-10-11 18:22:14][logger:272][INFO] Not resuming. Creating a new wandb run.
10
+ [2025-10-11 18:22:15][logger:288][INFO] wandb initialized. Run id: 7pqyfnns
11
+ [2025-10-11 18:22:15][logger:186][INFO] Setting up jsonlines logger...
12
+ [2025-10-11 18:22:15][logger:199][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/jsonlines/resume.jsonl since we are not resuming
13
+ [2025-10-11 18:22:15][logger:199][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/jsonlines/train_data_info.jsonl since we are not resuming
14
+ [2025-10-11 18:22:15][logger:199][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/jsonlines/val_data_info.jsonl since we are not resuming
15
+ [2025-10-11 18:22:15][logger:199][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/jsonlines/model_info.jsonl since we are not resuming
16
+ [2025-10-11 18:22:15][logger:199][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/jsonlines/train.jsonl since we are not resuming
17
+ [2025-10-11 18:22:15][logger:199][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/jsonlines/throughput.jsonl since we are not resuming
18
+ [2025-10-11 18:22:15][logger:199][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/jsonlines/norm.jsonl since we are not resuming
19
+ [2025-10-11 18:22:15][logger:199][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/jsonlines/val.jsonl since we are not resuming
20
+ [2025-10-11 18:22:15][logger:199][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/jsonlines/train_eval.jsonl since we are not resuming
21
+ [2025-10-11 18:22:15][logger:199][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/jsonlines/checkpoint.jsonl since we are not resuming
22
+ [2025-10-11 18:22:15][logger:113][INFO] Setting up npz logger...
23
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000041943040.npz since we are not resuming
24
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000083886080.npz since we are not resuming
25
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000125829120.npz since we are not resuming
26
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000167772160.npz since we are not resuming
27
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000209715200.npz since we are not resuming
28
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000251658240.npz since we are not resuming
29
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000293601280.npz since we are not resuming
30
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000335544320.npz since we are not resuming
31
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000377487360.npz since we are not resuming
32
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000419430400.npz since we are not resuming
33
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000461373440.npz since we are not resuming
34
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000503316480.npz since we are not resuming
35
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000545259520.npz since we are not resuming
36
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000587202560.npz since we are not resuming
37
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000629145600.npz since we are not resuming
38
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000671088640.npz since we are not resuming
39
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000713031680.npz since we are not resuming
40
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000754974720.npz since we are not resuming
41
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000796917760.npz since we are not resuming
42
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000838860800.npz since we are not resuming
43
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000880803840.npz since we are not resuming
44
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000922746880.npz since we are not resuming
45
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000000964689920.npz since we are not resuming
46
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001006632960.npz since we are not resuming
47
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001048576000.npz since we are not resuming
48
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001090519040.npz since we are not resuming
49
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001132462080.npz since we are not resuming
50
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001174405120.npz since we are not resuming
51
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001216348160.npz since we are not resuming
52
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001258291200.npz since we are not resuming
53
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001300234240.npz since we are not resuming
54
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001342177280.npz since we are not resuming
55
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001384120320.npz since we are not resuming
56
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001426063360.npz since we are not resuming
57
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001468006400.npz since we are not resuming
58
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001509949440.npz since we are not resuming
59
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001551892480.npz since we are not resuming
60
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001593835520.npz since we are not resuming
61
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001635778560.npz since we are not resuming
62
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001677721600.npz since we are not resuming
63
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001719664640.npz since we are not resuming
64
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001761607680.npz since we are not resuming
65
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001803550720.npz since we are not resuming
66
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001845493760.npz since we are not resuming
67
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001887436800.npz since we are not resuming
68
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001929379840.npz since we are not resuming
69
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000001971322880.npz since we are not resuming
70
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000002013265920.npz since we are not resuming
71
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000002055208960.npz since we are not resuming
72
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/val/step-000002097152000.npz since we are not resuming
73
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000000104857600.npz since we are not resuming
74
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000000209715200.npz since we are not resuming
75
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000000314572800.npz since we are not resuming
76
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000000419430400.npz since we are not resuming
77
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000000524288000.npz since we are not resuming
78
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000000629145600.npz since we are not resuming
79
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000000734003200.npz since we are not resuming
80
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000000838860800.npz since we are not resuming
81
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000000943718400.npz since we are not resuming
82
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000001048576000.npz since we are not resuming
83
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000001153433600.npz since we are not resuming
84
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000001258291200.npz since we are not resuming
85
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000001363148800.npz since we are not resuming
86
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000001468006400.npz since we are not resuming
87
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000001572864000.npz since we are not resuming
88
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000001677721600.npz since we are not resuming
89
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000001782579200.npz since we are not resuming
90
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000001887436800.npz since we are not resuming
91
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000001992294400.npz since we are not resuming
92
+ [2025-10-11 18:22:15][logger:127][INFO] Deleting /workspace/forgetting-transformer/transformer_2_4_256/metrics/npz/train_eval/step-000002097152000.npz since we are not resuming
93
+ [2025-10-11 18:22:15][logger:171][INFO] [step: 0] [train_data_info/vocab_size: 50277] [train_data_info/global_tokens_per_batch: 2097152] [train_data_info/local_tokens_per_batch: 2097152] [train_data_info/batch_len: 2048] [train_data_info/seq_len: 2048] [train_data_info/total_tokens: 2055208960] [train_data_info/global_batch_size: 1024] [train_data_info/local_batch_size: 1024]
94
+ [2025-10-11 18:22:15][logger:171][INFO] [step: 0] [val_data_info/vocab_size: 50277] [val_data_info/global_tokens_per_batch: 2048] [val_data_info/local_tokens_per_batch: 2048] [val_data_info/batch_len: 2048] [val_data_info/seq_len: 2048] [val_data_info/total_tokens: 2147483648] [val_data_info/global_batch_size: 1] [val_data_info/local_batch_size: 1]
95
+ [2025-10-11 18:22:15][logger:171][INFO] [step: 0] [model_info/total_params: 27447040] [model_info/trainable_params: 27447040] [model_info/embedding_params: 12870912] [model_info/flops_per_token: 0] [model_info/non_embedding_params: 14576128]
96
+ [2025-10-11 18:22:54][utils:57][INFO] [P: 1.00%] [S: 20971520/2097152000] [T: 0:00:38] [ETA: 1:03:37] [loss: 9.773] [tokens/s: 603077.682] [batches/s: 0.288] [MFU: 0.000] [TFLOPS: 0.000]
97
+ [2025-10-11 18:23:29][utils:57][INFO] [P: 2.00%] [S: 41943040/2097152000] [T: 0:01:13] [ETA: 0:59:51] [loss: 8.168] [tokens/s: 603404.871] [batches/s: 0.288] [MFU: 0.000] [TFLOPS: 0.000]
98
+ [2025-10-11 18:23:29][train:194][INFO] Running validation...
99
+ [2025-10-11 18:24:52][logger:171][INFO] [step: 41943040] [val/train_token_count: 41943040] [val/train_batch_count: 20] [val/train_flop_count: 0] [val/train_total_time: 73.294] [val/train_update_time: 72.966] [val/loss: 8.067] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.724] [val/val_tokens_per_second: 489228.498] [val/loss_avg_len_2048: 8.067] [val/perplexity_len_2048: 3186.330] [val/loss_avg_len_1024: 8.063] [val/perplexity_len_1024: 3176.025] [val/loss_avg_len_512: 8.063] [val/perplexity_len_512: 3175.293]
100
+ [2025-10-11 18:25:27][utils:57][INFO] [P: 3.00%] [S: 62914560/2097152000] [T: 0:03:11] [ETA: 1:43:21] [loss: 7.603] [tokens/s: 329572.484] [batches/s: 0.157] [MFU: 0.000] [TFLOPS: 0.000]
101
+ [2025-10-11 18:26:02][utils:57][INFO] [P: 4.00%] [S: 83886080/2097152000] [T: 0:03:46] [ETA: 1:30:36] [loss: 7.274] [tokens/s: 372993.504] [batches/s: 0.178] [MFU: 0.000] [TFLOPS: 0.000]
102
+ [2025-10-11 18:26:02][train:194][INFO] Running validation...
103
+ [2025-10-11 18:27:26][logger:171][INFO] [step: 83886080] [val/train_token_count: 83886080] [val/train_batch_count: 40] [val/train_flop_count: 0] [val/train_total_time: 226.536] [val/train_update_time: 142.216] [val/loss: 7.245] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.879] [val/val_tokens_per_second: 488322.377] [val/loss_avg_len_2048: 7.245] [val/perplexity_len_2048: 1401.492] [val/loss_avg_len_1024: 7.242] [val/perplexity_len_1024: 1397.308] [val/loss_avg_len_512: 7.246] [val/perplexity_len_512: 1402.335]
104
+ [2025-10-11 18:28:01][utils:57][INFO] [P: 5.00%] [S: 104857600/2097152000] [T: 0:05:45] [ETA: 1:49:18] [loss: 7.020] [tokens/s: 304108.020] [batches/s: 0.145] [MFU: 0.000] [TFLOPS: 0.000]
105
+ [2025-10-11 18:28:01][logger:171][INFO] [step: 104857600] [train_eval/train_token_count: 104857600] [train_eval/train_batch_count: 50] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 345.167] [train_eval/train_update_time: 176.836] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 8.311] [train_eval/perplexity_len_2048: 4068.811] [train_eval/loss_avg_len_1024: 8.310] [train_eval/perplexity_len_1024: 4065.728] [train_eval/loss_avg_len_512: 8.312] [train_eval/perplexity_len_512: 4070.792]
106
+ [2025-10-11 18:28:35][utils:57][INFO] [P: 6.00%] [S: 125829120/2097152000] [T: 0:06:19] [ETA: 1:39:12] [loss: 6.767] [tokens/s: 332023.735] [batches/s: 0.158] [MFU: 0.000] [TFLOPS: 0.000]
107
+ [2025-10-11 18:28:35][train:194][INFO] Running validation...
108
+ [2025-10-11 18:29:59][logger:171][INFO] [step: 125829120] [val/train_token_count: 125829120] [val/train_batch_count: 60] [val/train_flop_count: 0] [val/train_total_time: 379.919] [val/train_update_time: 211.447] [val/loss: 6.769] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.947] [val/val_tokens_per_second: 487928.793] [val/loss_avg_len_2048: 6.769] [val/perplexity_len_2048: 870.420] [val/loss_avg_len_1024: 6.765] [val/perplexity_len_1024: 867.369] [val/loss_avg_len_512: 6.771] [val/perplexity_len_512: 872.359]
109
+ [2025-10-11 18:30:34][utils:57][INFO] [P: 7.00%] [S: 146800640/2097152000] [T: 0:08:18] [ETA: 1:50:24] [loss: 6.605] [tokens/s: 294498.470] [batches/s: 0.140] [MFU: 0.000] [TFLOPS: 0.000]
110
+ [2025-10-11 18:31:09][utils:57][INFO] [P: 8.00%] [S: 167772160/2097152000] [T: 0:08:53] [ETA: 1:42:13] [loss: 6.441] [tokens/s: 314916.038] [batches/s: 0.150] [MFU: 0.000] [TFLOPS: 0.000]
111
+ [2025-10-11 18:31:09][train:194][INFO] Running validation...
112
+ [2025-10-11 18:32:33][logger:171][INFO] [step: 167772160] [val/train_token_count: 167772160] [val/train_batch_count: 80] [val/train_flop_count: 0] [val/train_total_time: 533.352] [val/train_update_time: 280.672] [val/loss: 6.397] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.748] [val/val_tokens_per_second: 489088.636] [val/loss_avg_len_2048: 6.397] [val/perplexity_len_2048: 600.142] [val/loss_avg_len_1024: 6.396] [val/perplexity_len_1024: 599.163] [val/loss_avg_len_512: 6.406] [val/perplexity_len_512: 605.769]
113
+ [2025-10-11 18:33:07][utils:57][INFO] [P: 9.00%] [S: 188743680/2097152000] [T: 0:10:51] [ETA: 1:49:51] [loss: 6.236] [tokens/s: 289551.070] [batches/s: 0.138] [MFU: 0.000] [TFLOPS: 0.000]
114
+ [2025-10-11 18:33:42][utils:57][INFO] [P: 10.00%] [S: 209715200/2097152000] [T: 0:11:26] [ETA: 1:42:59] [loss: 6.122] [tokens/s: 305610.676] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
115
+ [2025-10-11 18:33:42][logger:171][INFO] [step: 209715200] [train_eval/train_token_count: 209715200] [train_eval/train_batch_count: 100] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 686.614] [train_eval/train_update_time: 349.902] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.520] [train_eval/perplexity_len_2048: 678.573] [train_eval/loss_avg_len_1024: 6.520] [train_eval/perplexity_len_1024: 678.732] [train_eval/loss_avg_len_512: 6.529] [train_eval/perplexity_len_512: 684.423]
116
+ [2025-10-11 18:33:42][train:194][INFO] Running validation...
117
+ [2025-10-11 18:35:06][logger:171][INFO] [step: 209715200] [val/train_token_count: 209715200] [val/train_batch_count: 100] [val/train_flop_count: 0] [val/train_total_time: 686.614] [val/train_update_time: 349.902] [val/loss: 6.105] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.813] [val/val_tokens_per_second: 488705.727] [val/loss_avg_len_2048: 6.105] [val/perplexity_len_2048: 448.202] [val/loss_avg_len_1024: 6.107] [val/perplexity_len_1024: 448.853] [val/loss_avg_len_512: 6.123] [val/perplexity_len_512: 456.028]
118
+ [2025-10-11 18:35:06][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000000209715200.pt...
119
+ [2025-10-11 18:35:06][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000000209715200.pt.
120
+ [2025-10-11 18:35:06][logger:171][INFO] [step: 209715200] [checkpoint/checkpoint_time: 0.447]
121
+ [2025-10-11 18:35:41][utils:57][INFO] [P: 11.00%] [S: 230686720/2097152000] [T: 0:13:25] [ETA: 1:48:38] [loss: 5.998] [tokens/s: 271889.561] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
122
+ [2025-10-11 18:36:16][utils:57][INFO] [P: 12.00%] [S: 251658240/2097152000] [T: 0:14:00] [ETA: 1:42:42] [loss: 5.855] [tokens/s: 305368.658] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
123
+ [2025-10-11 18:36:16][train:194][INFO] Running validation...
124
+ [2025-10-11 18:37:40][logger:171][INFO] [step: 251658240] [val/train_token_count: 251658240] [val/train_batch_count: 120] [val/train_flop_count: 0] [val/train_total_time: 840.405] [val/train_update_time: 419.145] [val/loss: 5.873] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.901] [val/val_tokens_per_second: 488192.483] [val/loss_avg_len_2048: 5.873] [val/perplexity_len_2048: 355.404] [val/loss_avg_len_1024: 5.877] [val/perplexity_len_1024: 356.582] [val/loss_avg_len_512: 5.895] [val/perplexity_len_512: 363.294]
125
+ [2025-10-11 18:38:15][utils:57][INFO] [P: 13.00%] [S: 272629760/2097152000] [T: 0:15:59] [ETA: 1:46:58] [loss: 5.793] [tokens/s: 271811.835] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
126
+ [2025-10-11 18:38:49][utils:57][INFO] [P: 14.00%] [S: 293601280/2097152000] [T: 0:16:33] [ETA: 1:41:44] [loss: 5.713] [tokens/s: 305351.869] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
127
+ [2025-10-11 18:38:49][train:194][INFO] Running validation...
128
+ [2025-10-11 18:40:12][logger:171][INFO] [step: 293601280] [val/train_token_count: 293601280] [val/train_batch_count: 140] [val/train_flop_count: 0] [val/train_total_time: 993.830] [val/train_update_time: 488.409] [val/loss: 5.695] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.151] [val/val_tokens_per_second: 492597.127] [val/loss_avg_len_2048: 5.695] [val/perplexity_len_2048: 297.478] [val/loss_avg_len_1024: 5.701] [val/perplexity_len_1024: 299.110] [val/loss_avg_len_512: 5.721] [val/perplexity_len_512: 305.264]
129
+ [2025-10-11 18:40:47][utils:57][INFO] [P: 15.00%] [S: 314572800/2097152000] [T: 0:18:31] [ETA: 1:44:59] [loss: 5.581] [tokens/s: 272077.586] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
130
+ [2025-10-11 18:40:47][logger:171][INFO] [step: 314572800] [train_eval/train_token_count: 314572800] [train_eval/train_batch_count: 150] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1111.739] [train_eval/train_update_time: 523.030] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.841] [train_eval/perplexity_len_2048: 344.221] [train_eval/loss_avg_len_1024: 5.846] [train_eval/perplexity_len_1024: 345.882] [train_eval/loss_avg_len_512: 5.864] [train_eval/perplexity_len_512: 352.119]
131
+ [2025-10-11 18:41:22][utils:57][INFO] [P: 16.00%] [S: 335544320/2097152000] [T: 0:19:06] [ETA: 1:40:19] [loss: 5.538] [tokens/s: 305715.175] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
132
+ [2025-10-11 18:41:22][train:194][INFO] Running validation...
133
+ [2025-10-11 18:42:45][logger:171][INFO] [step: 335544320] [val/train_token_count: 335544320] [val/train_batch_count: 160] [val/train_flop_count: 0] [val/train_total_time: 1146.476] [val/train_update_time: 557.623] [val/loss: 5.523] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.126] [val/val_tokens_per_second: 492748.069] [val/loss_avg_len_2048: 5.523] [val/perplexity_len_2048: 250.341] [val/loss_avg_len_1024: 5.531] [val/perplexity_len_1024: 252.272] [val/loss_avg_len_512: 5.553] [val/perplexity_len_512: 258.087]
134
+ [2025-10-11 18:43:20][utils:57][INFO] [P: 17.00%] [S: 356515840/2097152000] [T: 0:21:04] [ETA: 1:42:53] [loss: 5.430] [tokens/s: 272369.407] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
135
+ [2025-10-11 18:43:55][utils:57][INFO] [P: 18.00%] [S: 377487360/2097152000] [T: 0:21:39] [ETA: 1:38:38] [loss: 5.393] [tokens/s: 305993.804] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
136
+ [2025-10-11 18:43:55][train:194][INFO] Running validation...
137
+ [2025-10-11 18:45:18][logger:171][INFO] [step: 377487360] [val/train_token_count: 377487360] [val/train_batch_count: 180] [val/train_flop_count: 0] [val/train_total_time: 1299.095] [val/train_update_time: 626.843] [val/loss: 5.388] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.263] [val/val_tokens_per_second: 491935.403] [val/loss_avg_len_2048: 5.388] [val/perplexity_len_2048: 218.701] [val/loss_avg_len_1024: 5.396] [val/perplexity_len_1024: 220.525] [val/loss_avg_len_512: 5.420] [val/perplexity_len_512: 225.771]
138
+ [2025-10-11 18:45:53][utils:57][INFO] [P: 19.00%] [S: 398458880/2097152000] [T: 0:23:37] [ETA: 1:40:42] [loss: 5.366] [tokens/s: 272483.112] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
139
+ [2025-10-11 18:46:27][utils:57][INFO] [P: 20.00%] [S: 419430400/2097152000] [T: 0:24:12] [ETA: 1:36:48] [loss: 5.265] [tokens/s: 306369.458] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
140
+ [2025-10-11 18:46:27][logger:171][INFO] [step: 419430400] [train_eval/train_token_count: 419430400] [train_eval/train_batch_count: 200] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1452.039] [train_eval/train_update_time: 696.240] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.426] [train_eval/perplexity_len_2048: 227.299] [train_eval/loss_avg_len_1024: 5.435] [train_eval/perplexity_len_1024: 229.249] [train_eval/loss_avg_len_512: 5.457] [train_eval/perplexity_len_512: 234.370]
141
+ [2025-10-11 18:46:27][train:194][INFO] Running validation...
142
+ [2025-10-11 18:47:51][logger:171][INFO] [step: 419430400] [val/train_token_count: 419430400] [val/train_batch_count: 200] [val/train_flop_count: 0] [val/train_total_time: 1452.039] [val/train_update_time: 696.240] [val/loss: 5.268] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.219] [val/val_tokens_per_second: 492196.721] [val/loss_avg_len_2048: 5.268] [val/perplexity_len_2048: 193.983] [val/loss_avg_len_1024: 5.278] [val/perplexity_len_1024: 195.923] [val/loss_avg_len_512: 5.303] [val/perplexity_len_512: 200.900]
143
+ [2025-10-11 18:47:51][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000000419430400.pt...
144
+ [2025-10-11 18:47:51][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000000419430400.pt.
145
+ [2025-10-11 18:47:51][logger:171][INFO] [step: 419430400] [checkpoint/checkpoint_time: 0.436]
146
+ [2025-10-11 18:48:26][utils:57][INFO] [P: 21.00%] [S: 440401920/2097152000] [T: 0:26:10] [ETA: 1:38:27] [loss: 5.229] [tokens/s: 272702.157] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
147
+ [2025-10-11 18:49:01][utils:57][INFO] [P: 22.00%] [S: 461373440/2097152000] [T: 0:26:45] [ETA: 1:34:51] [loss: 5.191] [tokens/s: 306487.143] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
148
+ [2025-10-11 18:49:01][train:194][INFO] Running validation...
149
+ [2025-10-11 18:50:24][logger:171][INFO] [step: 461373440] [val/train_token_count: 461373440] [val/train_batch_count: 220] [val/train_flop_count: 0] [val/train_total_time: 1605.222] [val/train_update_time: 765.490] [val/loss: 5.169] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.114] [val/val_tokens_per_second: 492816.400] [val/loss_avg_len_2048: 5.169] [val/perplexity_len_2048: 175.706] [val/loss_avg_len_1024: 5.179] [val/perplexity_len_1024: 177.538] [val/loss_avg_len_512: 5.205] [val/perplexity_len_512: 182.153]
150
+ [2025-10-11 18:50:59][utils:57][INFO] [P: 23.00%] [S: 482344960/2097152000] [T: 0:28:43] [ETA: 1:36:08] [loss: 5.120] [tokens/s: 272999.729] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
151
+ [2025-10-11 18:51:33][utils:57][INFO] [P: 24.00%] [S: 503316480/2097152000] [T: 0:29:17] [ETA: 1:32:46] [loss: 5.060] [tokens/s: 306496.656] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
152
+ [2025-10-11 18:51:33][train:194][INFO] Running validation...
153
+ [2025-10-11 18:52:56][logger:171][INFO] [step: 503316480] [val/train_token_count: 503316480] [val/train_batch_count: 240] [val/train_flop_count: 0] [val/train_total_time: 1757.860] [val/train_update_time: 834.736] [val/loss: 5.079] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.139] [val/val_tokens_per_second: 492668.686] [val/loss_avg_len_2048: 5.079] [val/perplexity_len_2048: 160.642] [val/loss_avg_len_1024: 5.090] [val/perplexity_len_1024: 162.465] [val/loss_avg_len_512: 5.117] [val/perplexity_len_512: 166.831]
154
+ [2025-10-11 18:53:31][utils:57][INFO] [P: 25.00%] [S: 524288000/2097152000] [T: 0:31:15] [ETA: 1:33:47] [loss: 5.056] [tokens/s: 272986.995] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
155
+ [2025-10-11 18:53:31][logger:171][INFO] [step: 524288000] [train_eval/train_token_count: 524288000] [train_eval/train_batch_count: 250] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1875.768] [train_eval/train_update_time: 869.364] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.151] [train_eval/perplexity_len_2048: 172.569] [train_eval/loss_avg_len_1024: 5.160] [train_eval/perplexity_len_1024: 174.145] [train_eval/loss_avg_len_512: 5.184] [train_eval/perplexity_len_512: 178.345]
156
+ [2025-10-11 18:54:06][utils:57][INFO] [P: 26.00%] [S: 545259520/2097152000] [T: 0:31:50] [ETA: 1:30:37] [loss: 5.006] [tokens/s: 306481.407] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
157
+ [2025-10-11 18:54:06][train:194][INFO] Running validation...
158
+ [2025-10-11 18:55:29][logger:171][INFO] [step: 545259520] [val/train_token_count: 545259520] [val/train_batch_count: 260] [val/train_flop_count: 0] [val/train_total_time: 1910.516] [val/train_update_time: 903.970] [val/loss: 5.007] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.178] [val/val_tokens_per_second: 492436.321] [val/loss_avg_len_2048: 5.007] [val/perplexity_len_2048: 149.392] [val/loss_avg_len_1024: 5.019] [val/perplexity_len_1024: 151.218] [val/loss_avg_len_512: 5.046] [val/perplexity_len_512: 155.350]
159
+ [2025-10-11 18:56:04][utils:57][INFO] [P: 27.00%] [S: 566231040/2097152000] [T: 0:33:48] [ETA: 1:31:24] [loss: 4.980] [tokens/s: 272966.915] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
160
+ [2025-10-11 18:56:39][utils:57][INFO] [P: 28.00%] [S: 587202560/2097152000] [T: 0:34:23] [ETA: 1:28:25] [loss: 4.937] [tokens/s: 306516.134] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
161
+ [2025-10-11 18:56:39][train:194][INFO] Running validation...
162
+ [2025-10-11 18:58:02][logger:171][INFO] [step: 587202560] [val/train_token_count: 587202560] [val/train_batch_count: 280] [val/train_flop_count: 0] [val/train_total_time: 2063.196] [val/train_update_time: 973.209] [val/loss: 4.939] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.110] [val/val_tokens_per_second: 492840.705] [val/loss_avg_len_2048: 4.939] [val/perplexity_len_2048: 139.689] [val/loss_avg_len_1024: 4.952] [val/perplexity_len_1024: 141.455] [val/loss_avg_len_512: 4.979] [val/perplexity_len_512: 145.340]
163
+ [2025-10-11 18:58:37][utils:57][INFO] [P: 29.00%] [S: 608174080/2097152000] [T: 0:36:21] [ETA: 1:28:59] [loss: 4.895] [tokens/s: 273079.385] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
164
+ [2025-10-11 18:59:11][utils:57][INFO] [P: 30.00%] [S: 629145600/2097152000] [T: 0:36:55] [ETA: 1:26:10] [loss: 4.873] [tokens/s: 306834.003] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
165
+ [2025-10-11 18:59:11][logger:171][INFO] [step: 629145600] [train_eval/train_token_count: 629145600] [train_eval/train_batch_count: 300] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2215.832] [train_eval/train_update_time: 1042.442] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.964] [train_eval/perplexity_len_2048: 143.168] [train_eval/loss_avg_len_1024: 4.973] [train_eval/perplexity_len_1024: 144.463] [train_eval/loss_avg_len_512: 4.997] [train_eval/perplexity_len_512: 148.005]
166
+ [2025-10-11 18:59:11][train:194][INFO] Running validation...
167
+ [2025-10-11 19:00:34][logger:171][INFO] [step: 629145600] [val/train_token_count: 629145600] [val/train_batch_count: 300] [val/train_flop_count: 0] [val/train_total_time: 2215.832] [val/train_update_time: 1042.442] [val/loss: 4.881] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.080] [val/val_tokens_per_second: 493016.170] [val/loss_avg_len_2048: 4.881] [val/perplexity_len_2048: 131.756] [val/loss_avg_len_1024: 4.894] [val/perplexity_len_1024: 133.429] [val/loss_avg_len_512: 4.921] [val/perplexity_len_512: 137.133]
168
+ [2025-10-11 19:00:34][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000000629145600.pt...
169
+ [2025-10-11 19:00:35][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000000629145600.pt.
170
+ [2025-10-11 19:00:35][logger:171][INFO] [step: 629145600] [checkpoint/checkpoint_time: 0.440]
171
+ [2025-10-11 19:01:10][utils:57][INFO] [P: 31.00%] [S: 650117120/2097152000] [T: 0:38:54] [ETA: 1:26:35] [loss: 4.887] [tokens/s: 273119.395] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
172
+ [2025-10-11 19:01:44][utils:57][INFO] [P: 32.00%] [S: 671088640/2097152000] [T: 0:39:28] [ETA: 1:23:53] [loss: 4.818] [tokens/s: 306647.732] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
173
+ [2025-10-11 19:01:44][train:194][INFO] Running validation...
174
+ [2025-10-11 19:03:07][logger:171][INFO] [step: 671088640] [val/train_token_count: 671088640] [val/train_batch_count: 320] [val/train_flop_count: 0] [val/train_total_time: 2368.879] [val/train_update_time: 1111.683] [val/loss: 4.833] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.095] [val/val_tokens_per_second: 492927.703] [val/loss_avg_len_2048: 4.833] [val/perplexity_len_2048: 125.533] [val/loss_avg_len_1024: 4.846] [val/perplexity_len_1024: 127.187] [val/loss_avg_len_512: 4.873] [val/perplexity_len_512: 130.752]
175
+ [2025-10-11 19:03:42][utils:57][INFO] [P: 33.00%] [S: 692060160/2097152000] [T: 0:41:26] [ETA: 1:24:08] [loss: 4.830] [tokens/s: 273124.311] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
176
+ [2025-10-11 19:04:17][utils:57][INFO] [P: 34.00%] [S: 713031680/2097152000] [T: 0:42:01] [ETA: 1:21:34] [loss: 4.790] [tokens/s: 306668.831] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
177
+ [2025-10-11 19:04:17][train:194][INFO] Running validation...
178
+ [2025-10-11 19:05:40][logger:171][INFO] [step: 713031680] [val/train_token_count: 713031680] [val/train_batch_count: 340] [val/train_flop_count: 0] [val/train_total_time: 2521.492] [val/train_update_time: 1180.932] [val/loss: 4.790] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.172] [val/val_tokens_per_second: 492473.163] [val/loss_avg_len_2048: 4.790] [val/perplexity_len_2048: 120.287] [val/loss_avg_len_1024: 4.803] [val/perplexity_len_1024: 121.891] [val/loss_avg_len_512: 4.831] [val/perplexity_len_512: 125.291]
179
+ [2025-10-11 19:06:15][utils:57][INFO] [P: 35.00%] [S: 734003200/2097152000] [T: 0:43:59] [ETA: 1:21:41] [loss: 4.776] [tokens/s: 273125.433] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
180
+ [2025-10-11 19:06:15][logger:171][INFO] [step: 734003200] [train_eval/train_token_count: 734003200] [train_eval/train_batch_count: 350] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2639.418] [train_eval/train_update_time: 1215.554] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.824] [train_eval/perplexity_len_2048: 124.432] [train_eval/loss_avg_len_1024: 4.836] [train_eval/perplexity_len_1024: 125.926] [train_eval/loss_avg_len_512: 4.861] [train_eval/perplexity_len_512: 129.172]
181
+ [2025-10-11 19:06:50][utils:57][INFO] [P: 36.00%] [S: 754974720/2097152000] [T: 0:44:34] [ETA: 1:19:14] [loss: 4.703] [tokens/s: 306680.372] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
182
+ [2025-10-11 19:06:50][train:194][INFO] Running validation...
183
+ [2025-10-11 19:08:13][logger:171][INFO] [step: 754974720] [val/train_token_count: 754974720] [val/train_batch_count: 360] [val/train_flop_count: 0] [val/train_total_time: 2674.171] [val/train_update_time: 1250.169] [val/loss: 4.743] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.356] [val/val_tokens_per_second: 491384.324] [val/loss_avg_len_2048: 4.743] [val/perplexity_len_2048: 114.805] [val/loss_avg_len_1024: 4.757] [val/perplexity_len_1024: 116.411] [val/loss_avg_len_512: 4.785] [val/perplexity_len_512: 119.713]
184
+ [2025-10-11 19:08:48][utils:57][INFO] [P: 37.00%] [S: 775946240/2097152000] [T: 0:46:32] [ETA: 1:19:14] [loss: 4.724] [tokens/s: 273051.613] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
185
+ [2025-10-11 19:09:22][utils:57][INFO] [P: 38.00%] [S: 796917760/2097152000] [T: 0:47:07] [ETA: 1:16:52] [loss: 4.700] [tokens/s: 306572.237] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
186
+ [2025-10-11 19:09:22][train:194][INFO] Running validation...
187
+ [2025-10-11 19:10:46][logger:171][INFO] [step: 796917760] [val/train_token_count: 796917760] [val/train_batch_count: 380] [val/train_flop_count: 0] [val/train_total_time: 2827.018] [val/train_update_time: 1319.377] [val/loss: 4.708] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.290] [val/val_tokens_per_second: 491776.322] [val/loss_avg_len_2048: 4.708] [val/perplexity_len_2048: 110.851] [val/loss_avg_len_1024: 4.722] [val/perplexity_len_1024: 112.405] [val/loss_avg_len_512: 4.750] [val/perplexity_len_512: 115.608]
188
+ [2025-10-11 19:11:21][utils:57][INFO] [P: 39.00%] [S: 817889280/2097152000] [T: 0:49:05] [ETA: 1:16:46] [loss: 4.711] [tokens/s: 272980.584] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
189
+ [2025-10-11 19:11:55][utils:57][INFO] [P: 40.00%] [S: 838860800/2097152000] [T: 0:49:39] [ETA: 1:14:29] [loss: 4.619] [tokens/s: 306684.637] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
190
+ [2025-10-11 19:11:55][logger:171][INFO] [step: 838860800] [train_eval/train_token_count: 838860800] [train_eval/train_batch_count: 400] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2979.820] [train_eval/train_update_time: 1388.590] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.722] [train_eval/perplexity_len_2048: 112.354] [train_eval/loss_avg_len_1024: 4.733] [train_eval/perplexity_len_1024: 113.626] [train_eval/loss_avg_len_512: 4.758] [train_eval/perplexity_len_512: 116.555]
191
+ [2025-10-11 19:11:55][train:194][INFO] Running validation...
192
+ [2025-10-11 19:13:19][logger:171][INFO] [step: 838860800] [val/train_token_count: 838860800] [val/train_batch_count: 400] [val/train_flop_count: 0] [val/train_total_time: 2979.820] [val/train_update_time: 1388.590] [val/loss: 4.676] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.660] [val/val_tokens_per_second: 489599.103] [val/loss_avg_len_2048: 4.676] [val/perplexity_len_2048: 107.362] [val/loss_avg_len_1024: 4.691] [val/perplexity_len_1024: 108.916] [val/loss_avg_len_512: 4.719] [val/perplexity_len_512: 112.042]
193
+ [2025-10-11 19:13:19][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000000838860800.pt...
194
+ [2025-10-11 19:13:19][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000000838860800.pt.
195
+ [2025-10-11 19:13:19][logger:171][INFO] [step: 838860800] [checkpoint/checkpoint_time: 0.445]
196
+ [2025-10-11 19:13:54][utils:57][INFO] [P: 41.00%] [S: 859832320/2097152000] [T: 0:51:38] [ETA: 1:14:19] [loss: 4.639] [tokens/s: 272787.535] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
197
+ [2025-10-11 19:14:29][utils:57][INFO] [P: 42.00%] [S: 880803840/2097152000] [T: 0:52:13] [ETA: 1:12:07] [loss: 4.639] [tokens/s: 306227.933] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
198
+ [2025-10-11 19:14:29][train:194][INFO] Running validation...
199
+ [2025-10-11 19:15:53][logger:171][INFO] [step: 880803840] [val/train_token_count: 880803840] [val/train_batch_count: 420] [val/train_flop_count: 0] [val/train_total_time: 3133.448] [val/train_update_time: 1457.837] [val/loss: 4.645] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.890] [val/val_tokens_per_second: 488258.606] [val/loss_avg_len_2048: 4.645] [val/perplexity_len_2048: 104.074] [val/loss_avg_len_1024: 4.660] [val/perplexity_len_1024: 105.603] [val/loss_avg_len_512: 4.688] [val/perplexity_len_512: 108.651]
200
+ [2025-10-11 19:16:28][utils:57][INFO] [P: 43.00%] [S: 901775360/2097152000] [T: 0:54:12] [ETA: 1:11:50] [loss: 4.631] [tokens/s: 272508.815] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
201
+ [2025-10-11 19:17:02][utils:57][INFO] [P: 44.00%] [S: 922746880/2097152000] [T: 0:54:46] [ETA: 1:09:43] [loss: 4.657] [tokens/s: 305908.080] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
202
+ [2025-10-11 19:17:02][train:194][INFO] Running validation...
203
+ [2025-10-11 19:18:26][logger:171][INFO] [step: 922746880] [val/train_token_count: 922746880] [val/train_batch_count: 440] [val/train_flop_count: 0] [val/train_total_time: 3286.849] [val/train_update_time: 1527.068] [val/loss: 4.622] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.674] [val/val_tokens_per_second: 489518.085] [val/loss_avg_len_2048: 4.622] [val/perplexity_len_2048: 101.723] [val/loss_avg_len_1024: 4.637] [val/perplexity_len_1024: 103.258] [val/loss_avg_len_512: 4.666] [val/perplexity_len_512: 106.264]
204
+ [2025-10-11 19:19:01][utils:57][INFO] [P: 45.00%] [S: 943718400/2097152000] [T: 0:56:45] [ETA: 1:09:22] [loss: 4.604] [tokens/s: 272329.416] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
205
+ [2025-10-11 19:19:01][logger:171][INFO] [step: 943718400] [train_eval/train_token_count: 943718400] [train_eval/train_batch_count: 450] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3405.286] [train_eval/train_update_time: 1561.695] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.642] [train_eval/perplexity_len_2048: 103.737] [train_eval/loss_avg_len_1024: 4.656] [train_eval/perplexity_len_1024: 105.218] [train_eval/loss_avg_len_512: 4.684] [train_eval/perplexity_len_512: 108.214]
206
+ [2025-10-11 19:19:36][utils:57][INFO] [P: 46.00%] [S: 964689920/2097152000] [T: 0:57:20] [ETA: 1:07:18] [loss: 4.574] [tokens/s: 305721.981] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
207
+ [2025-10-11 19:19:36][train:194][INFO] Running validation...
208
+ [2025-10-11 19:20:59][logger:171][INFO] [step: 964689920] [val/train_token_count: 964689920] [val/train_batch_count: 460] [val/train_flop_count: 0] [val/train_total_time: 3440.122] [val/train_update_time: 1596.400] [val/loss: 4.595] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.088] [val/val_tokens_per_second: 492973.221] [val/loss_avg_len_2048: 4.595] [val/perplexity_len_2048: 98.979] [val/loss_avg_len_1024: 4.610] [val/perplexity_len_1024: 100.481] [val/loss_avg_len_512: 4.639] [val/perplexity_len_512: 103.400]
209
+ [2025-10-11 19:21:33][utils:57][INFO] [P: 47.00%] [S: 985661440/2097152000] [T: 0:59:17] [ETA: 1:06:52] [loss: 4.603] [tokens/s: 272392.626] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
210
+ [2025-10-11 19:22:08][utils:57][INFO] [P: 48.00%] [S: 1006632960/2097152000] [T: 0:59:52] [ETA: 1:04:52] [loss: 4.579] [tokens/s: 305786.877] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
211
+ [2025-10-11 19:22:08][train:194][INFO] Running validation...
212
+ [2025-10-11 19:23:31][logger:171][INFO] [step: 1006632960] [val/train_token_count: 1006632960] [val/train_batch_count: 480] [val/train_flop_count: 0] [val/train_total_time: 3592.762] [val/train_update_time: 1665.690] [val/loss: 4.576] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.213] [val/val_tokens_per_second: 492228.525] [val/loss_avg_len_2048: 4.576] [val/perplexity_len_2048: 97.115] [val/loss_avg_len_1024: 4.591] [val/perplexity_len_1024: 98.559] [val/loss_avg_len_512: 4.619] [val/perplexity_len_512: 101.419]
213
+ [2025-10-11 19:24:06][utils:57][INFO] [P: 49.00%] [S: 1027604480/2097152000] [T: 1:01:50] [ETA: 1:04:22] [loss: 4.574] [tokens/s: 272415.311] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
214
+ [2025-10-11 19:24:41][utils:57][INFO] [P: 50.00%] [S: 1048576000/2097152000] [T: 1:02:25] [ETA: 1:02:25] [loss: 4.552] [tokens/s: 306176.351] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
215
+ [2025-10-11 19:24:41][logger:171][INFO] [step: 1048576000] [train_eval/train_token_count: 1048576000] [train_eval/train_batch_count: 500] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3745.514] [train_eval/train_update_time: 1734.943] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.581] [train_eval/perplexity_len_2048: 97.609] [train_eval/loss_avg_len_1024: 4.594] [train_eval/perplexity_len_1024: 98.888] [train_eval/loss_avg_len_512: 4.622] [train_eval/perplexity_len_512: 101.721]
216
+ [2025-10-11 19:24:41][train:194][INFO] Running validation...
217
+ [2025-10-11 19:26:04][logger:171][INFO] [step: 1048576000] [val/train_token_count: 1048576000] [val/train_batch_count: 500] [val/train_flop_count: 0] [val/train_total_time: 3745.514] [val/train_update_time: 1734.943] [val/loss: 4.557] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.290] [val/val_tokens_per_second: 491774.305] [val/loss_avg_len_2048: 4.557] [val/perplexity_len_2048: 95.273] [val/loss_avg_len_1024: 4.572] [val/perplexity_len_1024: 96.744] [val/loss_avg_len_512: 4.601] [val/perplexity_len_512: 99.596]
218
+ [2025-10-11 19:26:04][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000001048576000.pt...
219
+ [2025-10-11 19:26:05][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000001048576000.pt.
220
+ [2025-10-11 19:26:05][logger:171][INFO] [step: 1048576000] [checkpoint/checkpoint_time: 0.448]
221
+ [2025-10-11 19:26:39][utils:57][INFO] [P: 51.00%] [S: 1069547520/2097152000] [T: 1:04:24] [ETA: 1:01:52] [loss: 4.556] [tokens/s: 272531.454] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
222
+ [2025-10-11 19:27:14][utils:57][INFO] [P: 52.00%] [S: 1090519040/2097152000] [T: 1:04:58] [ETA: 0:59:58] [loss: 4.551] [tokens/s: 306267.878] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
223
+ [2025-10-11 19:27:14][train:194][INFO] Running validation...
224
+ [2025-10-11 19:28:37][logger:171][INFO] [step: 1090519040] [val/train_token_count: 1090519040] [val/train_batch_count: 520] [val/train_flop_count: 0] [val/train_total_time: 3898.721] [val/train_update_time: 1804.150] [val/loss: 4.540] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.143] [val/val_tokens_per_second: 492646.918] [val/loss_avg_len_2048: 4.540] [val/perplexity_len_2048: 93.677] [val/loss_avg_len_1024: 4.555] [val/perplexity_len_1024: 95.132] [val/loss_avg_len_512: 4.584] [val/perplexity_len_512: 97.952]
225
+ [2025-10-11 19:29:12][utils:57][INFO] [P: 53.00%] [S: 1111490560/2097152000] [T: 1:06:56] [ETA: 0:59:21] [loss: 4.509] [tokens/s: 272811.607] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
226
+ [2025-10-11 19:29:47][utils:57][INFO] [P: 54.00%] [S: 1132462080/2097152000] [T: 1:07:31] [ETA: 0:57:31] [loss: 4.518] [tokens/s: 306522.025] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
227
+ [2025-10-11 19:29:47][train:194][INFO] Running validation...
228
+ [2025-10-11 19:31:10][logger:171][INFO] [step: 1132462080] [val/train_token_count: 1132462080] [val/train_batch_count: 540] [val/train_flop_count: 0] [val/train_total_time: 4051.347] [val/train_update_time: 1873.369] [val/loss: 4.519] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.450] [val/val_tokens_per_second: 490830.500] [val/loss_avg_len_2048: 4.519] [val/perplexity_len_2048: 91.751] [val/loss_avg_len_1024: 4.535] [val/perplexity_len_1024: 93.194] [val/loss_avg_len_512: 4.564] [val/perplexity_len_512: 95.948]
229
+ [2025-10-11 19:31:45][utils:57][INFO] [P: 55.00%] [S: 1153433600/2097152000] [T: 1:09:29] [ETA: 0:56:51] [loss: 4.477] [tokens/s: 272931.742] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
230
+ [2025-10-11 19:31:45][logger:171][INFO] [step: 1153433600] [train_eval/train_token_count: 1153433600] [train_eval/train_batch_count: 550] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4169.555] [train_eval/train_update_time: 1907.991] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.528] [train_eval/perplexity_len_2048: 92.594] [train_eval/loss_avg_len_1024: 4.539] [train_eval/perplexity_len_1024: 93.629] [train_eval/loss_avg_len_512: 4.565] [train_eval/perplexity_len_512: 96.098]
231
+ [2025-10-11 19:32:20][utils:57][INFO] [P: 56.00%] [S: 1174405120/2097152000] [T: 1:10:04] [ETA: 0:55:03] [loss: 4.517] [tokens/s: 306408.239] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
232
+ [2025-10-11 19:32:20][train:194][INFO] Running validation...
233
+ [2025-10-11 19:33:44][logger:171][INFO] [step: 1174405120] [val/train_token_count: 1174405120] [val/train_batch_count: 560] [val/train_flop_count: 0] [val/train_total_time: 4204.282] [val/train_update_time: 1942.595] [val/loss: 4.504] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.983] [val/val_tokens_per_second: 487718.620] [val/loss_avg_len_2048: 4.504] [val/perplexity_len_2048: 90.391] [val/loss_avg_len_1024: 4.520] [val/perplexity_len_1024: 91.834] [val/loss_avg_len_512: 4.549] [val/perplexity_len_512: 94.584]
234
+ [2025-10-11 19:34:18][utils:57][INFO] [P: 57.00%] [S: 1195376640/2097152000] [T: 1:12:03] [ETA: 0:54:21] [loss: 4.482] [tokens/s: 272619.612] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
235
+ [2025-10-11 19:34:53][utils:57][INFO] [P: 58.00%] [S: 1216348160/2097152000] [T: 1:12:37] [ETA: 0:52:35] [loss: 4.521] [tokens/s: 306087.029] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
236
+ [2025-10-11 19:34:53][train:194][INFO] Running validation...
237
+ [2025-10-11 19:36:17][logger:171][INFO] [step: 1216348160] [val/train_token_count: 1216348160] [val/train_batch_count: 580] [val/train_flop_count: 0] [val/train_total_time: 4357.763] [val/train_update_time: 2011.835] [val/loss: 4.488] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.390] [val/val_tokens_per_second: 491188.527] [val/loss_avg_len_2048: 4.488] [val/perplexity_len_2048: 88.920] [val/loss_avg_len_1024: 4.503] [val/perplexity_len_1024: 90.322] [val/loss_avg_len_512: 4.533] [val/perplexity_len_512: 93.026]
238
+ [2025-10-11 19:36:51][utils:57][INFO] [P: 59.00%] [S: 1237319680/2097152000] [T: 1:14:35] [ETA: 0:51:50] [loss: 4.499] [tokens/s: 272585.017] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
239
+ [2025-10-11 19:37:26][utils:57][INFO] [P: 60.00%] [S: 1258291200/2097152000] [T: 1:15:10] [ETA: 0:50:07] [loss: 4.498] [tokens/s: 306265.451] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
240
+ [2025-10-11 19:37:26][logger:171][INFO] [step: 1258291200] [train_eval/train_token_count: 1258291200] [train_eval/train_batch_count: 600] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4510.644] [train_eval/train_update_time: 2081.064] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.490] [train_eval/perplexity_len_2048: 89.091] [train_eval/loss_avg_len_1024: 4.501] [train_eval/perplexity_len_1024: 90.065] [train_eval/loss_avg_len_512: 4.527] [train_eval/perplexity_len_512: 92.497]
241
+ [2025-10-11 19:37:26][train:194][INFO] Running validation...
242
+ [2025-10-11 19:38:50][logger:171][INFO] [step: 1258291200] [val/train_token_count: 1258291200] [val/train_batch_count: 600] [val/train_flop_count: 0] [val/train_total_time: 4510.644] [val/train_update_time: 2081.064] [val/loss: 4.478] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.903] [val/val_tokens_per_second: 488180.556] [val/loss_avg_len_2048: 4.478] [val/perplexity_len_2048: 88.043] [val/loss_avg_len_1024: 4.494] [val/perplexity_len_1024: 89.441] [val/loss_avg_len_512: 4.523] [val/perplexity_len_512: 92.125]
243
+ [2025-10-11 19:38:50][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000001258291200.pt...
244
+ [2025-10-11 19:38:50][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000001258291200.pt.
245
+ [2025-10-11 19:38:50][logger:171][INFO] [step: 1258291200] [checkpoint/checkpoint_time: 0.445]
246
+ [2025-10-11 19:39:25][utils:57][INFO] [P: 61.00%] [S: 1279262720/2097152000] [T: 1:17:09] [ETA: 0:49:20] [loss: 4.480] [tokens/s: 272370.966] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
247
+ [2025-10-11 19:40:00][utils:57][INFO] [P: 62.00%] [S: 1300234240/2097152000] [T: 1:17:44] [ETA: 0:47:38] [loss: 4.459] [tokens/s: 305691.748] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
248
+ [2025-10-11 19:40:00][train:194][INFO] Running validation...
249
+ [2025-10-11 19:41:24][logger:171][INFO] [step: 1300234240] [val/train_token_count: 1300234240] [val/train_batch_count: 620] [val/train_flop_count: 0] [val/train_total_time: 4664.526] [val/train_update_time: 2150.357] [val/loss: 4.465] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.582] [val/val_tokens_per_second: 490057.066] [val/loss_avg_len_2048: 4.465] [val/perplexity_len_2048: 86.891] [val/loss_avg_len_1024: 4.481] [val/perplexity_len_1024: 88.302] [val/loss_avg_len_512: 4.511] [val/perplexity_len_512: 90.981]
250
+ [2025-10-11 19:41:58][utils:57][INFO] [P: 63.00%] [S: 1321205760/2097152000] [T: 1:19:42] [ETA: 0:46:48] [loss: 4.471] [tokens/s: 272186.782] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
251
+ [2025-10-11 19:42:33][utils:57][INFO] [P: 64.00%] [S: 1342177280/2097152000] [T: 1:20:17] [ETA: 0:45:09] [loss: 4.470] [tokens/s: 305621.049] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
252
+ [2025-10-11 19:42:33][train:194][INFO] Running validation...
253
+ [2025-10-11 19:43:57][logger:171][INFO] [step: 1342177280] [val/train_token_count: 1342177280] [val/train_batch_count: 640] [val/train_flop_count: 0] [val/train_total_time: 4817.616] [val/train_update_time: 2219.606] [val/loss: 4.453] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.616] [val/val_tokens_per_second: 489856.014] [val/loss_avg_len_2048: 4.453] [val/perplexity_len_2048: 85.883] [val/loss_avg_len_1024: 4.469] [val/perplexity_len_1024: 87.290] [val/loss_avg_len_512: 4.499] [val/perplexity_len_512: 89.953]
254
+ [2025-10-11 19:44:31][utils:57][INFO] [P: 65.00%] [S: 1363148800/2097152000] [T: 1:22:15] [ETA: 0:44:17] [loss: 4.449] [tokens/s: 272126.727] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
255
+ [2025-10-11 19:44:31][logger:171][INFO] [step: 1363148800] [train_eval/train_token_count: 1363148800] [train_eval/train_batch_count: 650] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4935.988] [train_eval/train_update_time: 2254.240] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.466] [train_eval/perplexity_len_2048: 86.967] [train_eval/loss_avg_len_1024: 4.481] [train_eval/perplexity_len_1024: 88.316] [train_eval/loss_avg_len_512: 4.508] [train_eval/perplexity_len_512: 90.750]
256
+ [2025-10-11 19:45:06][utils:57][INFO] [P: 66.00%] [S: 1384120320/2097152000] [T: 1:22:50] [ETA: 0:42:40] [loss: 4.462] [tokens/s: 305780.593] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
257
+ [2025-10-11 19:45:06][train:194][INFO] Running validation...
258
+ [2025-10-11 19:46:30][logger:171][INFO] [step: 1384120320] [val/train_token_count: 1384120320] [val/train_batch_count: 660] [val/train_flop_count: 0] [val/train_total_time: 4970.728] [val/train_update_time: 2288.853] [val/loss: 4.444] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.694] [val/val_tokens_per_second: 489402.924] [val/loss_avg_len_2048: 4.444] [val/perplexity_len_2048: 85.101] [val/loss_avg_len_1024: 4.460] [val/perplexity_len_1024: 86.496] [val/loss_avg_len_512: 4.490] [val/perplexity_len_512: 89.153]
259
+ [2025-10-11 19:47:05][utils:57][INFO] [P: 67.00%] [S: 1405091840/2097152000] [T: 1:24:49] [ETA: 0:41:46] [loss: 4.417] [tokens/s: 272222.213] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
260
+ [2025-10-11 19:47:39][utils:57][INFO] [P: 68.00%] [S: 1426063360/2097152000] [T: 1:25:23] [ETA: 0:40:11] [loss: 4.435] [tokens/s: 305648.253] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
261
+ [2025-10-11 19:47:39][train:194][INFO] Running validation...
262
+ [2025-10-11 19:49:03][logger:171][INFO] [step: 1426063360] [val/train_token_count: 1426063360] [val/train_batch_count: 680] [val/train_flop_count: 0] [val/train_total_time: 5123.910] [val/train_update_time: 2358.096] [val/loss: 4.435] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.679] [val/val_tokens_per_second: 489488.811] [val/loss_avg_len_2048: 4.435] [val/perplexity_len_2048: 84.344] [val/loss_avg_len_1024: 4.451] [val/perplexity_len_1024: 85.720] [val/loss_avg_len_512: 4.481] [val/perplexity_len_512: 88.349]
263
+ [2025-10-11 19:49:38][utils:57][INFO] [P: 69.00%] [S: 1447034880/2097152000] [T: 1:27:22] [ETA: 0:39:15] [loss: 4.441] [tokens/s: 272114.326] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
264
+ [2025-10-11 19:50:13][utils:57][INFO] [P: 70.00%] [S: 1468006400/2097152000] [T: 1:27:57] [ETA: 0:37:41] [loss: 4.434] [tokens/s: 305948.726] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
265
+ [2025-10-11 19:50:13][logger:171][INFO] [step: 1468006400] [train_eval/train_token_count: 1468006400] [train_eval/train_batch_count: 700] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5277.087] [train_eval/train_update_time: 2427.330] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.437] [train_eval/perplexity_len_2048: 84.548] [train_eval/loss_avg_len_1024: 4.452] [train_eval/perplexity_len_1024: 85.836] [train_eval/loss_avg_len_512: 4.482] [train_eval/perplexity_len_512: 88.394]
266
+ [2025-10-11 19:50:13][train:194][INFO] Running validation...
267
+ [2025-10-11 19:51:36][logger:171][INFO] [step: 1468006400] [val/train_token_count: 1468006400] [val/train_batch_count: 700] [val/train_flop_count: 0] [val/train_total_time: 5277.087] [val/train_update_time: 2427.330] [val/loss: 4.427] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.180] [val/val_tokens_per_second: 492425.017] [val/loss_avg_len_2048: 4.427] [val/perplexity_len_2048: 83.655] [val/loss_avg_len_1024: 4.443] [val/perplexity_len_1024: 85.042] [val/loss_avg_len_512: 4.474] [val/perplexity_len_512: 87.673]
268
+ [2025-10-11 19:51:36][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000001468006400.pt...
269
+ [2025-10-11 19:51:36][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000001468006400.pt.
270
+ [2025-10-11 19:51:36][logger:171][INFO] [step: 1468006400] [checkpoint/checkpoint_time: 0.452]
271
+ [2025-10-11 19:52:11][utils:57][INFO] [P: 71.00%] [S: 1488977920/2097152000] [T: 1:29:55] [ETA: 0:36:43] [loss: 4.427] [tokens/s: 272383.330] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
272
+ [2025-10-11 19:52:46][utils:57][INFO] [P: 72.00%] [S: 1509949440/2097152000] [T: 1:30:30] [ETA: 0:35:11] [loss: 4.429] [tokens/s: 305929.221] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
273
+ [2025-10-11 19:52:46][train:194][INFO] Running validation...
274
+ [2025-10-11 19:54:09][logger:171][INFO] [step: 1509949440] [val/train_token_count: 1509949440] [val/train_batch_count: 720] [val/train_flop_count: 0] [val/train_total_time: 5430.250] [val/train_update_time: 2496.591] [val/loss: 4.419] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.169] [val/val_tokens_per_second: 492490.783] [val/loss_avg_len_2048: 4.419] [val/perplexity_len_2048: 83.027] [val/loss_avg_len_1024: 4.436] [val/perplexity_len_1024: 84.412] [val/loss_avg_len_512: 4.466] [val/perplexity_len_512: 87.030]
275
+ [2025-10-11 19:54:44][utils:57][INFO] [P: 73.00%] [S: 1530920960/2097152000] [T: 1:32:28] [ETA: 0:34:12] [loss: 4.433] [tokens/s: 272528.625] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
276
+ [2025-10-11 19:55:18][utils:57][INFO] [P: 74.00%] [S: 1551892480/2097152000] [T: 1:33:02] [ETA: 0:32:41] [loss: 4.431] [tokens/s: 306129.846] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
277
+ [2025-10-11 19:55:18][train:194][INFO] Running validation...
278
+ [2025-10-11 19:56:42][logger:171][INFO] [step: 1551892480] [val/train_token_count: 1551892480] [val/train_batch_count: 740] [val/train_flop_count: 0] [val/train_total_time: 5582.925] [val/train_update_time: 2565.848] [val/loss: 4.412] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.229] [val/val_tokens_per_second: 492134.178] [val/loss_avg_len_2048: 4.412] [val/perplexity_len_2048: 82.464] [val/loss_avg_len_1024: 4.429] [val/perplexity_len_1024: 83.846] [val/loss_avg_len_512: 4.460] [val/perplexity_len_512: 86.447]
279
+ [2025-10-11 19:57:16][utils:57][INFO] [P: 75.00%] [S: 1572864000/2097152000] [T: 1:35:00] [ETA: 0:31:40] [loss: 4.417] [tokens/s: 272656.772] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
280
+ [2025-10-11 19:57:16][logger:171][INFO] [step: 1572864000] [train_eval/train_token_count: 1572864000] [train_eval/train_batch_count: 750] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5700.940] [train_eval/train_update_time: 2600.504] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.418] [train_eval/perplexity_len_2048: 82.893] [train_eval/loss_avg_len_1024: 4.433] [train_eval/perplexity_len_1024: 84.167] [train_eval/loss_avg_len_512: 4.462] [train_eval/perplexity_len_512: 86.650]
281
+ [2025-10-11 19:57:51][utils:57][INFO] [P: 76.00%] [S: 1593835520/2097152000] [T: 1:35:35] [ETA: 0:30:11] [loss: 4.382] [tokens/s: 306316.778] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
282
+ [2025-10-11 19:57:51][train:194][INFO] Running validation...
283
+ [2025-10-11 19:59:14][logger:171][INFO] [step: 1593835520] [val/train_token_count: 1593835520] [val/train_batch_count: 760] [val/train_flop_count: 0] [val/train_total_time: 5735.700] [val/train_update_time: 2635.129] [val/loss: 4.407] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.274] [val/val_tokens_per_second: 491871.471] [val/loss_avg_len_2048: 4.407] [val/perplexity_len_2048: 82.010] [val/loss_avg_len_1024: 4.423] [val/perplexity_len_1024: 83.385] [val/loss_avg_len_512: 4.454] [val/perplexity_len_512: 85.971]
284
+ [2025-10-11 19:59:49][utils:57][INFO] [P: 77.00%] [S: 1614807040/2097152000] [T: 1:37:33] [ETA: 0:29:08] [loss: 4.442] [tokens/s: 272801.241] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
285
+ [2025-10-11 20:00:24][utils:57][INFO] [P: 78.00%] [S: 1635778560/2097152000] [T: 1:38:08] [ETA: 0:27:40] [loss: 4.363] [tokens/s: 306495.358] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
286
+ [2025-10-11 20:00:24][train:194][INFO] Running validation...
287
+ [2025-10-11 20:01:48][logger:171][INFO] [step: 1635778560] [val/train_token_count: 1635778560] [val/train_batch_count: 780] [val/train_flop_count: 0] [val/train_total_time: 5888.469] [val/train_update_time: 2704.364] [val/loss: 4.402] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.732] [val/val_tokens_per_second: 489182.490] [val/loss_avg_len_2048: 4.402] [val/perplexity_len_2048: 81.604] [val/loss_avg_len_1024: 4.419] [val/perplexity_len_1024: 82.980] [val/loss_avg_len_512: 4.449] [val/perplexity_len_512: 85.571]
288
+ [2025-10-11 20:02:22][utils:57][INFO] [P: 79.00%] [S: 1656750080/2097152000] [T: 1:40:06] [ETA: 0:26:36] [loss: 4.402] [tokens/s: 272781.991] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
289
+ [2025-10-11 20:02:57][utils:57][INFO] [P: 80.00%] [S: 1677721600/2097152000] [T: 1:40:41] [ETA: 0:25:10] [loss: 4.374] [tokens/s: 306462.590] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
290
+ [2025-10-11 20:02:57][logger:171][INFO] [step: 1677721600] [train_eval/train_token_count: 1677721600] [train_eval/train_batch_count: 800] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6041.680] [train_eval/train_update_time: 2773.588] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.403] [train_eval/perplexity_len_2048: 81.721] [train_eval/loss_avg_len_1024: 4.419] [train_eval/perplexity_len_1024: 83.053] [train_eval/loss_avg_len_512: 4.448] [train_eval/perplexity_len_512: 85.471]
291
+ [2025-10-11 20:02:57][train:194][INFO] Running validation...
292
+ [2025-10-11 20:04:20][logger:171][INFO] [step: 1677721600] [val/train_token_count: 1677721600] [val/train_batch_count: 800] [val/train_flop_count: 0] [val/train_total_time: 6041.680] [val/train_update_time: 2773.588] [val/loss: 4.398] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.158] [val/val_tokens_per_second: 492554.852] [val/loss_avg_len_2048: 4.398] [val/perplexity_len_2048: 81.271] [val/loss_avg_len_1024: 4.414] [val/perplexity_len_1024: 82.636] [val/loss_avg_len_512: 4.445] [val/perplexity_len_512: 85.213]
293
+ [2025-10-11 20:04:20][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000001677721600.pt...
294
+ [2025-10-11 20:04:21][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000001677721600.pt.
295
+ [2025-10-11 20:04:21][logger:171][INFO] [step: 1677721600] [checkpoint/checkpoint_time: 0.451]
296
+ [2025-10-11 20:04:55][utils:57][INFO] [P: 81.00%] [S: 1698693120/2097152000] [T: 1:42:40] [ETA: 0:24:04] [loss: 4.353] [tokens/s: 272796.902] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
297
+ [2025-10-11 20:05:30][utils:57][INFO] [P: 82.00%] [S: 1719664640/2097152000] [T: 1:43:14] [ETA: 0:22:39] [loss: 4.367] [tokens/s: 306276.764] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
298
+ [2025-10-11 20:05:30][train:194][INFO] Running validation...
299
+ [2025-10-11 20:06:53][logger:171][INFO] [step: 1719664640] [val/train_token_count: 1719664640] [val/train_batch_count: 820] [val/train_flop_count: 0] [val/train_total_time: 6194.783] [val/train_update_time: 2842.821] [val/loss: 4.394] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.194] [val/val_tokens_per_second: 492344.641] [val/loss_avg_len_2048: 4.394] [val/perplexity_len_2048: 80.953] [val/loss_avg_len_1024: 4.411] [val/perplexity_len_1024: 82.322] [val/loss_avg_len_512: 4.442] [val/perplexity_len_512: 84.903]
300
+ [2025-10-11 20:07:28][utils:57][INFO] [P: 83.00%] [S: 1740636160/2097152000] [T: 1:45:12] [ETA: 0:21:32] [loss: 4.391] [tokens/s: 272801.266] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
301
+ [2025-10-11 20:08:03][utils:57][INFO] [P: 84.00%] [S: 1761607680/2097152000] [T: 1:45:47] [ETA: 0:20:09] [loss: 4.364] [tokens/s: 306302.849] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
302
+ [2025-10-11 20:08:03][train:194][INFO] Running validation...
303
+ [2025-10-11 20:09:26][logger:171][INFO] [step: 1761607680] [val/train_token_count: 1761607680] [val/train_batch_count: 840] [val/train_flop_count: 0] [val/train_total_time: 6347.454] [val/train_update_time: 2912.047] [val/loss: 4.391] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.356] [val/val_tokens_per_second: 491386.983] [val/loss_avg_len_2048: 4.391] [val/perplexity_len_2048: 80.689] [val/loss_avg_len_1024: 4.407] [val/perplexity_len_1024: 82.060] [val/loss_avg_len_512: 4.438] [val/perplexity_len_512: 84.631]
304
+ [2025-10-11 20:10:01][utils:57][INFO] [P: 85.00%] [S: 1782579200/2097152000] [T: 1:47:45] [ETA: 0:19:00] [loss: 4.427] [tokens/s: 272772.712] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
305
+ [2025-10-11 20:10:01][logger:171][INFO] [step: 1782579200] [train_eval/train_token_count: 1782579200] [train_eval/train_batch_count: 850] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6465.570] [train_eval/train_update_time: 2946.677] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.388] [train_eval/perplexity_len_2048: 80.469] [train_eval/loss_avg_len_1024: 4.399] [train_eval/perplexity_len_1024: 81.375] [train_eval/loss_avg_len_512: 4.428] [train_eval/perplexity_len_512: 83.760]
306
+ [2025-10-11 20:10:36][utils:57][INFO] [P: 86.00%] [S: 1803550720/2097152000] [T: 1:48:20] [ETA: 0:17:38] [loss: 4.397] [tokens/s: 306283.750] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
307
+ [2025-10-11 20:10:36][train:194][INFO] Running validation...
308
+ [2025-10-11 20:11:59][logger:171][INFO] [step: 1803550720] [val/train_token_count: 1803550720] [val/train_batch_count: 860] [val/train_flop_count: 0] [val/train_total_time: 6500.322] [val/train_update_time: 2981.297] [val/loss: 4.388] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.424] [val/val_tokens_per_second: 490983.080] [val/loss_avg_len_2048: 4.388] [val/perplexity_len_2048: 80.493] [val/loss_avg_len_1024: 4.405] [val/perplexity_len_1024: 81.864] [val/loss_avg_len_512: 4.436] [val/perplexity_len_512: 84.436]
309
+ [2025-10-11 20:12:34][utils:57][INFO] [P: 87.00%] [S: 1824522240/2097152000] [T: 1:50:18] [ETA: 0:16:28] [loss: 4.349] [tokens/s: 272720.654] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
310
+ [2025-10-11 20:13:09][utils:57][INFO] [P: 88.00%] [S: 1845493760/2097152000] [T: 1:50:53] [ETA: 0:15:07] [loss: 4.385] [tokens/s: 306431.128] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
311
+ [2025-10-11 20:13:09][train:194][INFO] Running validation...
312
+ [2025-10-11 20:14:33][logger:171][INFO] [step: 1845493760] [val/train_token_count: 1845493760] [val/train_batch_count: 880] [val/train_flop_count: 0] [val/train_total_time: 6653.225] [val/train_update_time: 3050.524] [val/loss: 4.386] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.962] [val/val_tokens_per_second: 487840.726] [val/loss_avg_len_2048: 4.386] [val/perplexity_len_2048: 80.322] [val/loss_avg_len_1024: 4.403] [val/perplexity_len_1024: 81.687] [val/loss_avg_len_512: 4.434] [val/perplexity_len_512: 84.250]
313
+ [2025-10-11 20:15:07][utils:57][INFO] [P: 89.00%] [S: 1866465280/2097152000] [T: 1:52:51] [ETA: 0:13:56] [loss: 4.421] [tokens/s: 272631.733] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
314
+ [2025-10-11 20:15:42][utils:57][INFO] [P: 90.00%] [S: 1887436800/2097152000] [T: 1:53:26] [ETA: 0:12:36] [loss: 4.349] [tokens/s: 306244.747] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
315
+ [2025-10-11 20:15:42][logger:171][INFO] [step: 1887436800] [train_eval/train_token_count: 1887436800] [train_eval/train_batch_count: 900] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6806.725] [train_eval/train_update_time: 3119.803] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.387] [train_eval/perplexity_len_2048: 80.438] [train_eval/loss_avg_len_1024: 4.402] [train_eval/perplexity_len_1024: 81.618] [train_eval/loss_avg_len_512: 4.431] [train_eval/perplexity_len_512: 84.016]
316
+ [2025-10-11 20:15:42][train:194][INFO] Running validation...
317
+ [2025-10-11 20:17:06][logger:171][INFO] [step: 1887436800] [val/train_token_count: 1887436800] [val/train_batch_count: 900] [val/train_flop_count: 0] [val/train_total_time: 6806.725] [val/train_update_time: 3119.803] [val/loss: 4.384] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.919] [val/val_tokens_per_second: 488087.212] [val/loss_avg_len_2048: 4.384] [val/perplexity_len_2048: 80.191] [val/loss_avg_len_1024: 4.401] [val/perplexity_len_1024: 81.559] [val/loss_avg_len_512: 4.432] [val/perplexity_len_512: 84.126]
318
+ [2025-10-11 20:17:06][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000001887436800.pt...
319
+ [2025-10-11 20:17:07][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/transformer_2_4_256/checkpoints/step-000001887436800.pt.
320
+ [2025-10-11 20:17:07][logger:171][INFO] [step: 1887436800] [checkpoint/checkpoint_time: 0.446]
321
+ [2025-10-11 20:17:41][utils:57][INFO] [P: 91.00%] [S: 1908408320/2097152000] [T: 1:55:25] [ETA: 0:11:24] [loss: 4.378] [tokens/s: 272346.234] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
322
+ [2025-10-11 20:18:16][utils:57][INFO] [P: 92.00%] [S: 1929379840/2097152000] [T: 1:56:00] [ETA: 0:10:05] [loss: 4.415] [tokens/s: 305712.940] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
323
+ [2025-10-11 20:18:16][train:194][INFO] Running validation...
324
+ [2025-10-11 20:19:40][logger:171][INFO] [step: 1929379840] [val/train_token_count: 1929379840] [val/train_batch_count: 920] [val/train_flop_count: 0] [val/train_total_time: 6960.597] [val/train_update_time: 3189.048] [val/loss: 4.383] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.851] [val/val_tokens_per_second: 488483.649] [val/loss_avg_len_2048: 4.383] [val/perplexity_len_2048: 80.099] [val/loss_avg_len_1024: 4.400] [val/perplexity_len_1024: 81.460] [val/loss_avg_len_512: 4.431] [val/perplexity_len_512: 84.019]
325
+ [2025-10-11 20:20:15][utils:57][INFO] [P: 93.00%] [S: 1950351360/2097152000] [T: 1:57:59] [ETA: 0:08:52] [loss: 4.391] [tokens/s: 272110.105] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
326
+ [2025-10-11 20:20:49][utils:57][INFO] [P: 94.00%] [S: 1971322880/2097152000] [T: 1:58:33] [ETA: 0:07:34] [loss: 4.346] [tokens/s: 305483.227] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
327
+ [2025-10-11 20:20:49][train:194][INFO] Running validation...
328
+ [2025-10-11 20:22:13][logger:171][INFO] [step: 1971322880] [val/train_token_count: 1971322880] [val/train_batch_count: 940] [val/train_flop_count: 0] [val/train_total_time: 7113.937] [val/train_update_time: 3258.286] [val/loss: 4.382] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.208] [val/val_tokens_per_second: 492257.510] [val/loss_avg_len_2048: 4.382] [val/perplexity_len_2048: 80.033] [val/loss_avg_len_1024: 4.399] [val/perplexity_len_1024: 81.396] [val/loss_avg_len_512: 4.430] [val/perplexity_len_512: 83.957]
329
+ [2025-10-11 20:22:47][utils:57][INFO] [P: 95.00%] [S: 1992294400/2097152000] [T: 2:00:31] [ETA: 0:06:20] [loss: 4.382] [tokens/s: 272162.913] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
330
+ [2025-10-11 20:22:47][logger:171][INFO] [step: 1992294400] [train_eval/train_token_count: 1992294400] [train_eval/train_batch_count: 950] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7231.903] [train_eval/train_update_time: 3292.912] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.379] [train_eval/perplexity_len_2048: 79.785] [train_eval/loss_avg_len_1024: 4.396] [train_eval/perplexity_len_1024: 81.118] [train_eval/loss_avg_len_512: 4.424] [train_eval/perplexity_len_512: 83.395]
331
+ [2025-10-11 20:23:22][utils:57][INFO] [P: 96.00%] [S: 2013265920/2097152000] [T: 2:01:06] [ETA: 0:05:02] [loss: 4.385] [tokens/s: 305573.019] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
332
+ [2025-10-11 20:23:22][train:194][INFO] Running validation...
333
+ [2025-10-11 20:24:46][logger:171][INFO] [step: 2013265920] [val/train_token_count: 2013265920] [val/train_batch_count: 960] [val/train_flop_count: 0] [val/train_total_time: 7266.676] [val/train_update_time: 3327.552] [val/loss: 4.382] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 83.397] [val/val_tokens_per_second: 491146.265] [val/loss_avg_len_2048: 4.382] [val/perplexity_len_2048: 79.997] [val/loss_avg_len_1024: 4.399] [val/perplexity_len_1024: 81.359] [val/loss_avg_len_512: 4.430] [val/perplexity_len_512: 83.920]
334
+ [2025-10-11 20:25:20][utils:57][INFO] [P: 97.00%] [S: 2034237440/2097152000] [T: 2:03:04] [ETA: 0:03:48] [loss: 4.396] [tokens/s: 272172.238] [batches/s: 0.130] [MFU: 0.000] [TFLOPS: 0.000]
335
+ [2025-10-11 20:25:55][utils:57][INFO] [P: 98.00%] [S: 2055208960/2097152000] [T: 2:03:39] [ETA: 0:02:31] [loss: 4.376] [tokens/s: 305840.910] [batches/s: 0.146] [MFU: 0.000] [TFLOPS: 0.000]
336
+ [2025-10-11 20:25:55][train:194][INFO] Running validation...
337
+ [2025-10-11 20:27:19][logger:171][INFO] [step: 2055208960] [val/train_token_count: 2055208960] [val/train_batch_count: 980] [val/train_flop_count: 0] [val/train_total_time: 7419.519] [val/train_update_time: 3396.798] [val/loss: 4.382] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 84.220] [val/val_tokens_per_second: 486346.384] [val/loss_avg_len_2048: 4.382] [val/perplexity_len_2048: 79.981] [val/loss_avg_len_1024: 4.399] [val/perplexity_len_1024: 81.344] [val/loss_avg_len_512: 4.430] [val/perplexity_len_512: 83.905]
338
+ [2025-10-11 20:27:19][train:854][INFO] Training finished with 2055208960 tokens!
metrics/jsonlines/checkpoint.jsonl CHANGED
@@ -1,10 +1,9 @@
1
- {"step": 209715200, "checkpoint/checkpoint_time": 0.5068966029211879}
2
- {"step": 419430400, "checkpoint/checkpoint_time": 0.4977876110933721}
3
- {"step": 629145600, "checkpoint/checkpoint_time": 0.501962305046618}
4
- {"step": 838860800, "checkpoint/checkpoint_time": 0.5075914999470115}
5
- {"step": 1048576000, "checkpoint/checkpoint_time": 0.49702629493549466}
6
- {"step": 1258291200, "checkpoint/checkpoint_time": 0.5112055451609194}
7
- {"step": 1468006400, "checkpoint/checkpoint_time": 0.49192168982699513}
8
- {"step": 1677721600, "checkpoint/checkpoint_time": 0.5073983632028103}
9
- {"step": 1887436800, "checkpoint/checkpoint_time": 0.509603947866708}
10
- {"step": 2097152000, "checkpoint/checkpoint_time": 0.5034480569884181}
 
1
+ {"step": 209715200, "checkpoint/checkpoint_time": 0.44675078699947335}
2
+ {"step": 419430400, "checkpoint/checkpoint_time": 0.4361007340194192}
3
+ {"step": 629145600, "checkpoint/checkpoint_time": 0.43971711499034427}
4
+ {"step": 838860800, "checkpoint/checkpoint_time": 0.44477320901933126}
5
+ {"step": 1048576000, "checkpoint/checkpoint_time": 0.4476856429828331}
6
+ {"step": 1258291200, "checkpoint/checkpoint_time": 0.4449735890084412}
7
+ {"step": 1468006400, "checkpoint/checkpoint_time": 0.45186209797975607}
8
+ {"step": 1677721600, "checkpoint/checkpoint_time": 0.45097879599779844}
9
+ {"step": 1887436800, "checkpoint/checkpoint_time": 0.44572298499406315}
 
metrics/jsonlines/norm.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
metrics/jsonlines/throughput.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
metrics/jsonlines/train.jsonl CHANGED
@@ -1,100 +1,98 @@
1
- {"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 40.415581199806184, "train/update_time": 40.130518121644855, "train/lr": 0.0009000000000000001, "train/loss": 9.772826194763184, "train/global_grad_norm": 1.2391878366470337}
2
- {"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 75.34514997294173, "train/update_time": 74.81576991919428, "train/lr": 0.0009997960964140947, "train/loss": 8.152568817138672, "train/global_grad_norm": 0.9624242186546326}
3
- {"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 116.51147756492719, "train/update_time": 109.5343208857812, "train/lr": 0.0009990914580222257, "train/loss": 7.613849639892578, "train/global_grad_norm": 0.5937998294830322}
4
- {"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 151.4883820596151, "train/update_time": 144.2230770494789, "train/lr": 0.0009978842768382998, "train/loss": 7.2865376472473145, "train/global_grad_norm": 0.3504825830459595}
5
- {"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 192.62442071689293, "train/update_time": 178.92916662013158, "train/lr": 0.0009961757683914405, "train/loss": 7.059178352355957, "train/global_grad_norm": 0.4386540949344635}
6
- {"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 227.60141262365505, "train/update_time": 213.61536543117836, "train/lr": 0.00099396765300483, "train/loss": 6.845610618591309, "train/global_grad_norm": 0.6638095378875732}
7
- {"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 268.7769126538187, "train/update_time": 248.33218330098316, "train/lr": 0.0009912621540634887, "train/loss": 6.622528076171875, "train/global_grad_norm": 0.8688696622848511}
8
- {"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 303.7567360489629, "train/update_time": 283.02572092972696, "train/lr": 0.000988061995775515, "train/loss": 6.432361125946045, "train/global_grad_norm": 0.6051720976829529}
9
- {"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 344.90547194099054, "train/update_time": 317.73537164507434, "train/lr": 0.0009843704004290394, "train/loss": 6.284864902496338, "train/global_grad_norm": 0.8888136148452759}
10
- {"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 379.894994311966, "train/update_time": 352.43726607831195, "train/lr": 0.0009801910851476522, "train/loss": 6.139675617218018, "train/global_grad_norm": 1.0011965036392212}
11
- {"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 421.5841581178829, "train/update_time": 387.1666129701771, "train/lr": 0.0009755282581475768, "train/loss": 6.016983985900879, "train/global_grad_norm": 0.6383078098297119}
12
- {"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 456.560782736633, "train/update_time": 421.85785795981064, "train/lr": 0.0009703866145003512, "train/loss": 5.922806262969971, "train/global_grad_norm": 0.841997504234314}
13
- {"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 497.7341962759383, "train/update_time": 456.5605332814157, "train/lr": 0.0009647713314052896, "train/loss": 5.831605434417725, "train/global_grad_norm": 0.8896751999855042}
14
- {"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 532.7168368049897, "train/update_time": 491.2538420544006, "train/lr": 0.0009586880629764817, "train/loss": 5.731290340423584, "train/global_grad_norm": 0.8051401972770691}
15
- {"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 573.9008510387503, "train/update_time": 525.9637654465623, "train/lr": 0.0009521429345495787, "train/loss": 5.613372802734375, "train/global_grad_norm": 1.0643665790557861}
16
- {"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 608.8816099567339, "train/update_time": 560.655514428392, "train/lr": 0.0009451425365140996, "train/loss": 5.497401237487793, "train/global_grad_norm": 0.9499395489692688}
17
- {"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 650.1146287927404, "train/update_time": 595.3849068158306, "train/lr": 0.000937693917677468, "train/loss": 5.487479209899902, "train/global_grad_norm": 1.2670619487762451}
18
- {"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 685.1029845466837, "train/update_time": 630.085222561378, "train/lr": 0.0009298045781674596, "train/loss": 5.432429313659668, "train/global_grad_norm": 0.8213033080101013}
19
- {"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 726.2910113739781, "train/update_time": 664.8161856215447, "train/lr": 0.0009214824618802108, "train/loss": 5.315907955169678, "train/global_grad_norm": 1.1873756647109985}
20
- {"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 761.2827736386098, "train/update_time": 699.5182051258162, "train/lr": 0.000912735948481387, "train/loss": 5.304999351501465, "train/global_grad_norm": 0.8800298571586609}
21
- {"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 802.9843285907991, "train/update_time": 734.2493304978125, "train/lr": 0.0009035738449685707, "train/loss": 5.259337425231934, "train/global_grad_norm": 0.9146267175674438}
22
- {"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 837.9679110529833, "train/update_time": 768.9465256030671, "train/lr": 0.0008940053768033609, "train/loss": 5.177842617034912, "train/global_grad_norm": 1.1856276988983154}
23
- {"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 879.212199519854, "train/update_time": 803.6772545911372, "train/lr": 0.0008840401786221159, "train/loss": 5.165072917938232, "train/global_grad_norm": 0.9780120849609375}
24
- {"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 914.1905289897695, "train/update_time": 838.3647430227138, "train/lr": 0.0008736882845346905, "train/loss": 5.109606742858887, "train/global_grad_norm": 1.1500730514526367}
25
- {"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 955.4245519889519, "train/update_time": 873.0867990762927, "train/lr": 0.0008629601180209381, "train/loss": 5.07704496383667, "train/global_grad_norm": 1.0051029920578003}
26
- {"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 990.4083167887293, "train/update_time": 907.7796153570525, "train/lr": 0.0008518664814351503, "train/loss": 5.049190998077393, "train/global_grad_norm": 1.2131925821304321}
27
- {"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 1031.538414828945, "train/update_time": 942.4896042570472, "train/lr": 0.0008404185451290017, "train/loss": 4.9988112449646, "train/global_grad_norm": 0.8757486343383789}
28
- {"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 1066.5353656006046, "train/update_time": 977.1970872837119, "train/lr": 0.0008286278362039527, "train/loss": 4.967507839202881, "train/global_grad_norm": 1.0268068313598633}
29
- {"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 1107.6705617727712, "train/update_time": 1011.9203152623959, "train/lr": 0.0008165062269044352, "train/loss": 4.928158283233643, "train/global_grad_norm": 0.952899158000946}
30
- {"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 1142.651667741593, "train/update_time": 1046.612968060188, "train/lr": 0.0008040659226635089, "train/loss": 4.900204181671143, "train/global_grad_norm": 1.119959831237793}
31
- {"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 1184.3485312736593, "train/update_time": 1081.3357461206615, "train/lr": 0.0007913194498130252, "train/loss": 4.892436504364014, "train/global_grad_norm": 0.9509796500205994}
32
- {"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 1219.3308540079743, "train/update_time": 1116.0336452010088, "train/lr": 0.000778279642970672, "train/loss": 4.856998920440674, "train/global_grad_norm": 0.8461491465568542}
33
- {"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 1260.5213517337106, "train/update_time": 1150.7654260639101, "train/lr": 0.0007649596321166025, "train/loss": 4.824680805206299, "train/global_grad_norm": 0.7761072516441345}
34
- {"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 1295.5154801327735, "train/update_time": 1185.4742206893861, "train/lr": 0.0007513728293726579, "train/loss": 4.811212062835693, "train/global_grad_norm": 1.251476526260376}
35
- {"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 1336.7052892716601, "train/update_time": 1220.2067804792896, "train/lr": 0.0007375329154974975, "train/loss": 4.805399417877197, "train/global_grad_norm": 0.8441694974899292}
36
- {"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 1371.6966382889077, "train/update_time": 1254.908473377116, "train/lr": 0.0007234538261112341, "train/loss": 4.7209577560424805, "train/global_grad_norm": 0.9580700993537903}
37
- {"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 1412.8641826817766, "train/update_time": 1289.631338949781, "train/lr": 0.0007091497376634464, "train/loss": 4.706777095794678, "train/global_grad_norm": 0.8831941485404968}
38
- {"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 1447.851369981654, "train/update_time": 1324.3309777188115, "train/lr": 0.0006946350531586958, "train/loss": 4.72338342666626, "train/global_grad_norm": 0.9644153118133545}
39
- {"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 1489.0743115595542, "train/update_time": 1359.0581260472536, "train/lr": 0.0006799243876539214, "train/loss": 4.691098690032959, "train/global_grad_norm": 0.7392861247062683}
40
- {"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 1524.0658300309442, "train/update_time": 1393.7609866121784, "train/lr": 0.0006650325535423166, "train/loss": 4.717495918273926, "train/global_grad_norm": 1.1011825799942017}
41
- {"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 1565.793813650962, "train/update_time": 1428.5169904893264, "train/lr": 0.0006499745456385053, "train/loss": 4.677796840667725, "train/global_grad_norm": 0.8885034918785095}
42
- {"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 1600.7871080269106, "train/update_time": 1463.2190964017063, "train/lr": 0.0006347655260800339, "train/loss": 4.622861862182617, "train/global_grad_norm": 0.7021073698997498}
43
- {"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 1642.045855066739, "train/update_time": 1497.9514221656136, "train/lr": 0.0006194208090603844, "train/loss": 4.662997245788574, "train/global_grad_norm": 1.020289659500122}
44
- {"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 1677.0360736018047, "train/update_time": 1532.649858857505, "train/lr": 0.0006039558454088796, "train/loss": 4.641533851623535, "train/global_grad_norm": 0.9713541269302368}
45
- {"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 1718.264860745985, "train/update_time": 1567.3857053546235, "train/lr": 0.0005883862070330078, "train/loss": 4.644475936889648, "train/global_grad_norm": 0.7124711275100708}
46
- {"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 1753.2634326359257, "train/update_time": 1602.0879544354975, "train/lr": 0.0005727275712388317, "train/loss": 4.607753753662109, "train/global_grad_norm": 0.929435670375824}
47
- {"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 1794.4538897527382, "train/update_time": 1636.81822361378, "train/lr": 0.0005569957049452703, "train/loss": 4.577797889709473, "train/global_grad_norm": 0.9655690789222717}
48
- {"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 1829.5858452958055, "train/update_time": 1671.6610745475627, "train/lr": 0.0005412064488081482, "train/loss": 4.570688247680664, "train/global_grad_norm": 0.8964301943778992}
49
- {"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 1870.759945500642, "train/update_time": 1706.3805879675783, "train/lr": 0.0005253757012699972, "train/loss": 4.566349983215332, "train/global_grad_norm": 0.8143415451049805}
50
- {"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 1905.7458327147178, "train/update_time": 1741.0785799045116, "train/lr": 0.0005095194025516734, "train/loss": 4.573827743530273, "train/global_grad_norm": 0.830761730670929}
51
- {"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 1947.430106294807, "train/update_time": 1775.8004680471495, "train/lr": 0.0004936535186019053, "train/loss": 4.569893836975098, "train/global_grad_norm": 1.0050759315490723}
52
- {"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 1982.4242434157059, "train/update_time": 1810.507148906123, "train/lr": 0.00047779402502093696, "train/loss": 4.513689994812012, "train/global_grad_norm": 0.7226787805557251}
53
- {"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 2023.6099294535816, "train/update_time": 1845.236276451964, "train/lr": 0.0004619568909744525, "train/loss": 4.529908180236816, "train/global_grad_norm": 0.7668783068656921}
54
- {"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 2058.594140216708, "train/update_time": 1879.9355728020892, "train/lr": 0.00044615806311398067, "train/loss": 4.540687561035156, "train/global_grad_norm": 0.6153253316879272}
55
- {"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 2099.768523535691, "train/update_time": 1914.6644405797124, "train/lr": 0.0004304134495199673, "train/loss": 4.496268272399902, "train/global_grad_norm": 0.6364275217056274}
56
- {"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 2134.75423188461, "train/update_time": 1949.358494934626, "train/lr": 0.0004147389036836882, "train/loss": 4.4821457862854, "train/global_grad_norm": 0.6262201070785522}
57
- {"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 2175.9149562329985, "train/update_time": 1984.07279653335, "train/lr": 0.0003991502085441259, "train/loss": 4.491011619567871, "train/global_grad_norm": 0.5450102090835571}
58
- {"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 2210.907703721896, "train/update_time": 2018.7735858242959, "train/lr": 0.0003836630605958888, "train/loss": 4.503609657287598, "train/global_grad_norm": 0.8031578660011292}
59
- {"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 2252.084350648802, "train/update_time": 2053.4985836916603, "train/lr": 0.00036829305408417155, "train/loss": 4.468899726867676, "train/global_grad_norm": 0.5556333661079407}
60
- {"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 2287.0779132805765, "train/update_time": 2088.2027000347152, "train/lr": 0.000353055665302672, "train/loss": 4.487270355224609, "train/global_grad_norm": 0.5684188604354858}
61
- {"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 2328.7814065408893, "train/update_time": 2122.9348065350205, "train/lr": 0.0003379662370102746, "train/loss": 4.495736598968506, "train/global_grad_norm": 0.5331183671951294}
62
- {"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 2363.7743464875966, "train/update_time": 2157.6409200155176, "train/lr": 0.00032303996298219405, "train/loss": 4.464653491973877, "train/global_grad_norm": 0.8254384994506836}
63
- {"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 2404.9524343889207, "train/update_time": 2192.35531153623, "train/lr": 0.00030829187271113034, "train/loss": 4.429670810699463, "train/global_grad_norm": 0.6143578886985779}
64
- {"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 2439.9411435807124, "train/update_time": 2227.0531149646267, "train/lr": 0.0002937368162738445, "train/loss": 4.5008721351623535, "train/global_grad_norm": 0.5588297843933105}
65
- {"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 2481.136555264704, "train/update_time": 2261.7757655889727, "train/lr": 0.0002793894493783894, "train/loss": 4.481244087219238, "train/global_grad_norm": 0.5616574287414551}
66
- {"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 2516.129153979942, "train/update_time": 2296.475023902487, "train/lr": 0.00026526421860705474, "train/loss": 4.4359331130981445, "train/global_grad_norm": 0.49746274948120117}
67
- {"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 2557.309430932626, "train/update_time": 2331.2063457700424, "train/lr": 0.0002513753468698824, "train/loss": 4.433716773986816, "train/global_grad_norm": 0.49925321340560913}
68
- {"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 2592.2952946918085, "train/update_time": 2365.9068522513844, "train/lr": 0.00023773681908340283, "train/loss": 4.40523099899292, "train/global_grad_norm": 0.5226632952690125}
69
- {"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 2633.4475547745824, "train/update_time": 2400.6202873536386, "train/lr": 0.00022436236808900823, "train/loss": 4.45224666595459, "train/global_grad_norm": 0.5822315812110901}
70
- {"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 2668.4236754849553, "train/update_time": 2435.308184158057, "train/lr": 0.00021126546082514682, "train/loss": 4.45149564743042, "train/global_grad_norm": 0.47449296712875366}
71
- {"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 2710.136317596771, "train/update_time": 2470.054123024922, "train/lr": 0.00019845928476725522, "train/loss": 4.4132819175720215, "train/global_grad_norm": 0.5319316387176514}
72
- {"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 2745.12359233154, "train/update_time": 2504.7536877635866, "train/lr": 0.0001859567346490913, "train/loss": 4.4424920082092285, "train/global_grad_norm": 0.46108055114746094}
73
- {"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 2786.2917791469954, "train/update_time": 2539.464351078961, "train/lr": 0.00017377039947882782, "train/loss": 4.440863609313965, "train/global_grad_norm": 0.4736610949039459}
74
- {"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 2821.2778452136554, "train/update_time": 2574.1644478179514, "train/lr": 0.00016191254986299043, "train/loss": 4.419039249420166, "train/global_grad_norm": 0.4840933382511139}
75
- {"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 2862.4817315279506, "train/update_time": 2608.8764487369917, "train/lr": 0.00015039512565099468, "train/loss": 4.444186687469482, "train/global_grad_norm": 0.4269809126853943}
76
- {"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 2897.4854577975348, "train/update_time": 2643.5932636465877, "train/lr": 0.00013922972391273224, "train/loss": 4.392973899841309, "train/global_grad_norm": 0.428153932094574}
77
- {"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 2938.698933465872, "train/update_time": 2678.3161373827606, "train/lr": 0.00012842758726130281, "train/loss": 4.4127116203308105, "train/global_grad_norm": 0.4216243624687195}
78
- {"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 2973.6870251647197, "train/update_time": 2713.0167279425077, "train/lr": 0.00011799959253265679, "train/loss": 4.384042263031006, "train/global_grad_norm": 0.4444551169872284}
79
- {"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 3014.889837917872, "train/update_time": 2747.7415770972148, "train/lr": 0.00010795623983354214, "train/loss": 4.410556793212891, "train/global_grad_norm": 0.40097400546073914}
80
- {"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 3049.8850160585716, "train/update_time": 2782.4521241658367, "train/lr": 9.830764196878872e-05, "train/loss": 4.3876471519470215, "train/global_grad_norm": 0.3687848448753357}
81
- {"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 3091.569239983801, "train/update_time": 2817.1740981261246, "train/lr": 8.906351425856951e-05, "train/loss": 4.433506488800049, "train/global_grad_norm": 0.36067357659339905}
82
- {"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 3126.5600167936645, "train/update_time": 2851.8765539969318, "train/lr": 8.02331647558977e-05, "train/loss": 4.405129432678223, "train/global_grad_norm": 0.3341665267944336}
83
- {"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 3167.7519225156866, "train/update_time": 2886.596782598179, "train/lr": 7.182548487420554e-05, "train/loss": 4.428231239318848, "train/global_grad_norm": 0.3340260088443756}
84
- {"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 3202.7426801156253, "train/update_time": 2921.299470563419, "train/lr": 6.384894043444556e-05, "train/loss": 4.434932708740234, "train/global_grad_norm": 0.3256409764289856}
85
- {"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 3243.933703172952, "train/update_time": 2956.025475362316, "train/lr": 5.6311563140726166e-05, "train/loss": 4.3828253746032715, "train/global_grad_norm": 0.32974866032600403}
86
- {"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 3278.91815946484, "train/update_time": 2990.718675683718, "train/lr": 4.922094249306547e-05, "train/loss": 4.348251819610596, "train/global_grad_norm": 0.3138902485370636}
87
- {"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 3320.089195527602, "train/update_time": 3025.4415654302575, "train/lr": 4.2584218145409916e-05, "train/loss": 4.42707633972168, "train/global_grad_norm": 0.30230483412742615}
88
- {"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 3355.086739927996, "train/update_time": 3060.151473065838, "train/lr": 3.6408072716606236e-05, "train/loss": 4.3467206954956055, "train/global_grad_norm": 0.31371477246284485}
89
- {"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 3396.2516544437967, "train/update_time": 3094.865460576024, "train/lr": 3.069872506157217e-05, "train/loss": 4.3787126541137695, "train/global_grad_norm": 0.30077674984931946}
90
- {"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 3431.2398387999274, "train/update_time": 3129.5652695768513, "train/lr": 2.5461924009435368e-05, "train/loss": 4.3460774421691895, "train/global_grad_norm": 0.2794691026210785}
91
- {"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 3472.951562038623, "train/update_time": 3164.3004304990172, "train/lr": 2.0702942574950812e-05, "train/loss": 4.386535167694092, "train/global_grad_norm": 0.268690288066864}
92
- {"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 3507.942943904549, "train/update_time": 3199.003928860184, "train/lr": 1.642657264902142e-05, "train/loss": 4.388347148895264, "train/global_grad_norm": 0.29469677805900574}
93
- {"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 3549.1294620479457, "train/update_time": 3233.7340904702432, "train/lr": 1.2637120173670358e-05, "train/loss": 4.373073101043701, "train/global_grad_norm": 0.29699528217315674}
94
- {"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 3584.1315822978504, "train/update_time": 3268.4432333246805, "train/lr": 9.338400806321978e-06, "train/loss": 4.398288726806641, "train/global_grad_norm": 0.2372455894947052}
95
- {"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 3625.2997248456813, "train/update_time": 3303.175303027965, "train/lr": 6.533736077758867e-06, "train/loss": 4.377623558044434, "train/global_grad_norm": 0.256507933139801}
96
- {"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 3660.2930417479947, "train/update_time": 3337.873345853295, "train/lr": 4.2259500476214406e-06, "train/loss": 4.373324394226074, "train/global_grad_norm": 0.23570659756660461}
97
- {"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 3701.4854015167803, "train/update_time": 3372.612820140552, "train/lr": 2.417366460819359e-06, "train/loss": 4.369635581970215, "train/global_grad_norm": 0.2412668913602829}
98
- {"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 3736.479068840854, "train/update_time": 3407.321344117634, "train/lr": 1.1098064077174619e-06, "train/loss": 4.4287004470825195, "train/global_grad_norm": 0.24484556913375854}
99
- {"step": 2076180480, "train/token_count": 2076180480, "train/batch_count": 990, "train/flop_count": 0, "train/total_time": 3777.7033449816518, "train/update_time": 3442.088609050028, "train/lr": 3.0458649045211895e-07, "train/loss": 4.342052936553955, "train/global_grad_norm": 0.22195476293563843}
100
- {"step": 2097152000, "train/token_count": 2097152000, "train/batch_count": 1000, "train/flop_count": 0, "train/total_time": 3812.7206143867224, "train/update_time": 3476.8136685648933, "train/lr": 2.517497224463483e-09, "train/loss": 4.377318382263184, "train/global_grad_norm": 0.22657278180122375}
 
1
+ {"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 38.55579723298433, "train/update_time": 38.35399063504883, "train/lr": 0.0009000000000000001, "train/loss": 9.772618293762207, "train/global_grad_norm": 1.2446180582046509}
2
+ {"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 73.29413136799121, "train/update_time": 72.96587482298492, "train/lr": 0.0009997960964140947, "train/loss": 8.16772174835205, "train/global_grad_norm": 0.9763324856758118}
3
+ {"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 191.7932936270081, "train/update_time": 107.60015929004294, "train/lr": 0.0009990914580222257, "train/loss": 7.603027820587158, "train/global_grad_norm": 0.5199674367904663}
4
+ {"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 226.53613719300483, "train/update_time": 142.21563291802886, "train/lr": 0.0009978842768382998, "train/loss": 7.273867607116699, "train/global_grad_norm": 0.6734169721603394}
5
+ {"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 345.1667736689851, "train/update_time": 176.83618369704345, "train/lr": 0.0009961757683914405, "train/loss": 7.0204386711120605, "train/global_grad_norm": 0.7655511498451233}
6
+ {"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 379.9190621979942, "train/update_time": 211.44736236400786, "train/lr": 0.00099396765300483, "train/loss": 6.766915321350098, "train/global_grad_norm": 0.30172044038772583}
7
+ {"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 498.6147063600074, "train/update_time": 246.0684660130646, "train/lr": 0.0009912621540634887, "train/loss": 6.6051344871521, "train/global_grad_norm": 0.827389121055603}
8
+ {"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 533.3516867249855, "train/update_time": 280.6721391470637, "train/lr": 0.000988061995775515, "train/loss": 6.440863132476807, "train/global_grad_norm": 1.4310520887374878}
9
+ {"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 651.8656425169902, "train/update_time": 315.29572922710213, "train/lr": 0.0009843704004290394, "train/loss": 6.236248970031738, "train/global_grad_norm": 0.6141589283943176}
10
+ {"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 686.6137485890067, "train/update_time": 349.9022945231409, "train/lr": 0.0009801910851476522, "train/loss": 6.1219682693481445, "train/global_grad_norm": 1.1012554168701172}
11
+ {"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 805.6500484560092, "train/update_time": 384.52516269215266, "train/lr": 0.0009755282581475768, "train/loss": 5.997668266296387, "train/global_grad_norm": 0.7333498597145081}
12
+ {"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 840.405497522006, "train/update_time": 419.1452622152283, "train/lr": 0.0009703866145003512, "train/loss": 5.855091571807861, "train/global_grad_norm": 0.6099743247032166}
13
+ {"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 959.1070550300064, "train/update_time": 453.81502685521264, "train/lr": 0.0009647713314052896, "train/loss": 5.792905330657959, "train/global_grad_norm": 0.7265439629554749}
14
+ {"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 993.8301178629918, "train/update_time": 488.4094896201277, "train/lr": 0.0009586880629764817, "train/loss": 5.712881088256836, "train/global_grad_norm": 0.9564601182937622}
15
+ {"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 1111.7393740309926, "train/update_time": 523.0301243920985, "train/lr": 0.0009521429345495787, "train/loss": 5.5812788009643555, "train/global_grad_norm": 0.6320960521697998}
16
+ {"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 1146.4763450330065, "train/update_time": 557.6233584931178, "train/lr": 0.0009451425365140996, "train/loss": 5.537780284881592, "train/global_grad_norm": 1.0332707166671753}
17
+ {"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 1264.3617091320048, "train/update_time": 592.2433790900977, "train/lr": 0.000937693917677468, "train/loss": 5.429712772369385, "train/global_grad_norm": 1.0364717245101929}
18
+ {"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 1299.0948956199863, "train/update_time": 626.843196902104, "train/lr": 0.0009298045781674596, "train/loss": 5.393295764923096, "train/global_grad_norm": 0.912891685962677}
19
+ {"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 1417.3040110229922, "train/update_time": 661.6429489921429, "train/lr": 0.0009214824618802108, "train/loss": 5.366145133972168, "train/global_grad_norm": 1.039188027381897}
20
+ {"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 1452.0386730069877, "train/update_time": 696.2396770050982, "train/lr": 0.000912735948481387, "train/loss": 5.265153884887695, "train/global_grad_norm": 0.851448655128479}
21
+ {"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 1570.467384151998, "train/update_time": 730.8712040171376, "train/lr": 0.0009035738449685707, "train/loss": 5.228988170623779, "train/global_grad_norm": 1.3591398000717163}
22
+ {"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 1605.2216201199917, "train/update_time": 765.4900175441289, "train/lr": 0.0008940053768033609, "train/loss": 5.1907196044921875, "train/global_grad_norm": 1.3637722730636597}
23
+ {"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 1723.0966956730117, "train/update_time": 800.1077135011437, "train/lr": 0.0008840401786221159, "train/loss": 5.119633197784424, "train/global_grad_norm": 1.0941485166549683}
24
+ {"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 1757.860138426011, "train/update_time": 834.7357104731782, "train/lr": 0.0008736882845346905, "train/loss": 5.059532642364502, "train/global_grad_norm": 0.9115816950798035}
25
+ {"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 1875.7684185520047, "train/update_time": 869.3637212922331, "train/lr": 0.0008629601180209381, "train/loss": 5.0564866065979, "train/global_grad_norm": 0.9306532740592957}
26
+ {"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 1910.5157937250042, "train/update_time": 903.9701039092615, "train/lr": 0.0008518664814351503, "train/loss": 5.0055952072143555, "train/global_grad_norm": 1.2287901639938354}
27
+ {"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 2028.4397037760064, "train/update_time": 938.5902288242651, "train/lr": 0.0008404185451290017, "train/loss": 4.980170249938965, "train/global_grad_norm": 0.8189780712127686}
28
+ {"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 2063.1963952730002, "train/update_time": 973.2086714253237, "train/lr": 0.0008286278362039527, "train/loss": 4.936854362487793, "train/global_grad_norm": 1.3344489336013794}
29
+ {"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 2181.072865936003, "train/update_time": 1007.8210269463598, "train/lr": 0.0008165062269044352, "train/loss": 4.8954973220825195, "train/global_grad_norm": 0.9711341261863708}
30
+ {"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 2215.8320906269946, "train/update_time": 1042.4422394274152, "train/lr": 0.0008040659226635089, "train/loss": 4.872994422912598, "train/global_grad_norm": 1.2242389917373657}
31
+ {"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 2334.1208405600046, "train/update_time": 1077.064158729394, "train/lr": 0.0007913194498130252, "train/loss": 4.887465476989746, "train/global_grad_norm": 1.0431791543960571}
32
+ {"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 2368.8792440719844, "train/update_time": 1111.6830330224184, "train/lr": 0.000778279642970672, "train/loss": 4.8178582191467285, "train/global_grad_norm": 0.9318476319313049}
33
+ {"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 2486.7425695779966, "train/update_time": 1146.306156033359, "train/lr": 0.0007649596321166025, "train/loss": 4.83030891418457, "train/global_grad_norm": 0.7958810329437256}
34
+ {"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 2521.492483378999, "train/update_time": 1180.931988580327, "train/lr": 0.0007513728293726579, "train/loss": 4.7904372215271, "train/global_grad_norm": 0.9025549292564392}
35
+ {"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 2639.4180885159876, "train/update_time": 1215.5536740703392, "train/lr": 0.0007375329154974975, "train/loss": 4.776428699493408, "train/global_grad_norm": 0.9148038625717163}
36
+ {"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 2674.1708125350124, "train/update_time": 1250.16888089836, "train/lr": 0.0007234538261112341, "train/loss": 4.702690601348877, "train/global_grad_norm": 0.863443911075592}
37
+ {"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 2792.2849394810037, "train/update_time": 1284.7876818493824, "train/lr": 0.0007091497376634464, "train/loss": 4.724478721618652, "train/global_grad_norm": 0.7980408668518066}
38
+ {"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 2827.0179851859866, "train/update_time": 1319.3769752274675, "train/lr": 0.0006946350531586958, "train/loss": 4.699609756469727, "train/global_grad_norm": 0.7278746962547302}
39
+ {"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 2945.1225905850006, "train/update_time": 1354.0406499354867, "train/lr": 0.0006799243876539214, "train/loss": 4.7111992835998535, "train/global_grad_norm": 1.1111260652542114}
40
+ {"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 2979.819751534, "train/update_time": 1388.5897308025742, "train/lr": 0.0006650325535423166, "train/loss": 4.619029521942139, "train/global_grad_norm": 0.6627530455589294}
41
+ {"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 3098.7063870650018, "train/update_time": 1423.2231051865965, "train/lr": 0.0006499745456385053, "train/loss": 4.639301300048828, "train/global_grad_norm": 0.7888188362121582}
42
+ {"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 3133.447895410005, "train/update_time": 1457.8368608065357, "train/lr": 0.0006347655260800339, "train/loss": 4.638882160186768, "train/global_grad_norm": 0.7990217804908752}
43
+ {"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 3252.0970968430047, "train/update_time": 1492.454597274569, "train/lr": 0.0006194208090603844, "train/loss": 4.6314873695373535, "train/global_grad_norm": 0.6108959913253784}
44
+ {"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 3286.8489470180066, "train/update_time": 1527.068307258567, "train/lr": 0.0006039558454088796, "train/loss": 4.656696319580078, "train/global_grad_norm": 0.9010829329490662}
45
+ {"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 3405.285965647985, "train/update_time": 1561.695067133638, "train/lr": 0.0005883862070330078, "train/loss": 4.604307651519775, "train/global_grad_norm": 0.7386929392814636}
46
+ {"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 3440.121589771996, "train/update_time": 1596.3996783556067, "train/lr": 0.0005727275712388317, "train/loss": 4.573764324188232, "train/global_grad_norm": 0.7288416624069214}
47
+ {"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 3557.970528264006, "train/update_time": 1631.0234597446106, "train/lr": 0.0005569957049452703, "train/loss": 4.603250026702881, "train/global_grad_norm": 0.7999058961868286}
48
+ {"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 3592.7624590400083, "train/update_time": 1665.6897427196673, "train/lr": 0.0005412064488081482, "train/loss": 4.578767776489258, "train/global_grad_norm": 0.8067259192466736}
49
+ {"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 3710.7506003380113, "train/update_time": 1700.3189989306557, "train/lr": 0.0005253757012699972, "train/loss": 4.573814868927002, "train/global_grad_norm": 0.7487918138504028}
50
+ {"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 3745.514067212993, "train/update_time": 1734.9429978527187, "train/lr": 0.0005095194025516734, "train/loss": 4.552258491516113, "train/global_grad_norm": 0.9750821590423584}
51
+ {"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 3864.004582177993, "train/update_time": 1769.5580677387188, "train/lr": 0.0004936535186019053, "train/loss": 4.5562920570373535, "train/global_grad_norm": 0.6866552233695984}
52
+ {"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 3898.7206719240057, "train/update_time": 1804.14956625973, "train/lr": 0.00047779402502093696, "train/loss": 4.551402568817139, "train/global_grad_norm": 0.787746250629425}
53
+ {"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 4016.6126302230114, "train/update_time": 1838.7692771856673, "train/lr": 0.0004619568909744525, "train/loss": 4.5086894035339355, "train/global_grad_norm": 0.8107492923736572}
54
+ {"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 4051.3472030170087, "train/update_time": 1873.3693814776198, "train/lr": 0.00044615806311398067, "train/loss": 4.518401622772217, "train/global_grad_norm": 0.5787344574928284}
55
+ {"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 4169.554661403992, "train/update_time": 1907.990586347587, "train/lr": 0.0004304134495199673, "train/loss": 4.477138042449951, "train/global_grad_norm": 0.7278950214385986}
56
+ {"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 4204.282005440997, "train/update_time": 1942.5947593615856, "train/lr": 0.0004147389036836882, "train/loss": 4.5172038078308105, "train/global_grad_norm": 1.0125855207443237}
57
+ {"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 4323.021760789008, "train/update_time": 1977.2172263615066, "train/lr": 0.0003991502085441259, "train/loss": 4.482260704040527, "train/global_grad_norm": 0.6572685241699219}
58
+ {"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 4357.762582869007, "train/update_time": 2011.834883902513, "train/lr": 0.0003836630605958888, "train/loss": 4.521332740783691, "train/global_grad_norm": 0.5961792469024658}
59
+ {"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 4475.897989705001, "train/update_time": 2046.4480303055025, "train/lr": 0.00036829305408417155, "train/loss": 4.499192714691162, "train/global_grad_norm": 0.6105663180351257}
60
+ {"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 4510.643669799989, "train/update_time": 2081.064080615528, "train/lr": 0.000353055665302672, "train/loss": 4.497852802276611, "train/global_grad_norm": 0.5793102979660034}
61
+ {"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 4629.746722721, "train/update_time": 2115.698471894517, "train/lr": 0.0003379662370102746, "train/loss": 4.479738235473633, "train/global_grad_norm": 0.5910903811454773}
62
+ {"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 4664.52624743199, "train/update_time": 2150.3571494565113, "train/lr": 0.00032303996298219405, "train/loss": 4.459190845489502, "train/global_grad_norm": 0.8015880584716797}
63
+ {"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 4782.869420437986, "train/update_time": 2184.9856218354835, "train/lr": 0.00030829187271113034, "train/loss": 4.47064733505249, "train/global_grad_norm": 0.5343803763389587}
64
+ {"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 4817.616289013007, "train/update_time": 2219.6062358425115, "train/lr": 0.0002937368162738445, "train/loss": 4.469577312469482, "train/global_grad_norm": 0.5645431280136108}
65
+ {"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 4935.988276930002, "train/update_time": 2254.239767934516, "train/lr": 0.0002793894493783894, "train/loss": 4.448817729949951, "train/global_grad_norm": 0.5171424150466919}
66
+ {"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 4970.727602478, "train/update_time": 2288.853339902591, "train/lr": 0.00026526421860705474, "train/loss": 4.462049961090088, "train/global_grad_norm": 0.5886797308921814}
67
+ {"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 5089.18058301101, "train/update_time": 2323.4854529426375, "train/lr": 0.0002513753468698824, "train/loss": 4.417477607727051, "train/global_grad_norm": 0.5379060506820679}
68
+ {"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 5123.909539049986, "train/update_time": 2358.095918665669, "train/lr": 0.00023773681908340283, "train/loss": 4.435017108917236, "train/global_grad_norm": 0.6538751125335693}
69
+ {"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 5242.36110436701, "train/update_time": 2392.718186916667, "train/lr": 0.00022436236808900823, "train/loss": 4.440553665161133, "train/global_grad_norm": 0.4718034267425537}
70
+ {"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 5277.0870255730115, "train/update_time": 2427.330438608711, "train/lr": 0.00021126546082514682, "train/loss": 4.433934688568115, "train/global_grad_norm": 0.49807751178741455}
71
+ {"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 5395.478063126997, "train/update_time": 2461.9552631227416, "train/lr": 0.00019845928476725522, "train/loss": 4.4274396896362305, "train/global_grad_norm": 0.4243120551109314}
72
+ {"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 5430.249512759008, "train/update_time": 2496.5911681566795, "train/lr": 0.0001859567346490913, "train/loss": 4.42873477935791, "train/global_grad_norm": 0.5035505890846252}
73
+ {"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 5548.174287694012, "train/update_time": 2531.217117117718, "train/lr": 0.00017377039947882782, "train/loss": 4.433116912841797, "train/global_grad_norm": 0.4461386501789093}
74
+ {"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 5582.925107155985, "train/update_time": 2565.8480685587565, "train/lr": 0.00016191254986299043, "train/loss": 4.430774211883545, "train/global_grad_norm": 0.4564478397369385}
75
+ {"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 5700.939995903987, "train/update_time": 2600.5040378217527, "train/lr": 0.00015039512565099468, "train/loss": 4.417412757873535, "train/global_grad_norm": 0.41517674922943115}
76
+ {"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 5735.700029758998, "train/update_time": 2635.1292617027066, "train/lr": 0.00013922972391273224, "train/loss": 4.381819248199463, "train/global_grad_norm": 0.42953088879585266}
77
+ {"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 5853.721849002992, "train/update_time": 2669.7465154657257, "train/lr": 0.00012842758726130281, "train/loss": 4.441737651824951, "train/global_grad_norm": 0.42887547612190247}
78
+ {"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 5888.469226193993, "train/update_time": 2704.3640278247476, "train/lr": 0.00011799959253265679, "train/loss": 4.363431453704834, "train/global_grad_norm": 0.3946261703968048}
79
+ {"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 6006.954968397011, "train/update_time": 2738.9892221787595, "train/lr": 0.00010795623983354214, "train/loss": 4.401818752288818, "train/global_grad_norm": 0.3415951728820801}
80
+ {"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 6041.680216562003, "train/update_time": 2773.58810844674, "train/lr": 9.830764196878872e-05, "train/loss": 4.37397575378418, "train/global_grad_norm": 0.397240549325943}
81
+ {"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 6160.038256887987, "train/update_time": 2808.195452109765, "train/lr": 8.906351425856951e-05, "train/loss": 4.353301048278809, "train/global_grad_norm": 0.42882615327835083}
82
+ {"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 6194.783387269999, "train/update_time": 2842.820589903771, "train/lr": 8.02331647558977e-05, "train/loss": 4.367002487182617, "train/global_grad_norm": 0.3661589026451111}
83
+ {"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 6312.718072921998, "train/update_time": 2877.4299484348157, "train/lr": 7.182548487420554e-05, "train/loss": 4.391452789306641, "train/global_grad_norm": 0.31681889295578003}
84
+ {"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 6347.4543838310055, "train/update_time": 2912.0468749527645, "train/lr": 6.384894043444556e-05, "train/loss": 4.364299774169922, "train/global_grad_norm": 0.31542593240737915}
85
+ {"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 6465.570202954987, "train/update_time": 2946.6770034248184, "train/lr": 5.6311563140726166e-05, "train/loss": 4.427130699157715, "train/global_grad_norm": 0.3284689784049988}
86
+ {"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 6500.321636478999, "train/update_time": 2981.296865779761, "train/lr": 4.922094249306547e-05, "train/loss": 4.397003650665283, "train/global_grad_norm": 0.3037455081939697}
87
+ {"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 6618.489081607986, "train/update_time": 3015.9109904506768, "train/lr": 4.2584218145409916e-05, "train/loss": 4.349207878112793, "train/global_grad_norm": 0.31362658739089966}
88
+ {"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 6653.224929338001, "train/update_time": 3050.5236358706316, "train/lr": 3.6408072716606236e-05, "train/loss": 4.384881019592285, "train/global_grad_norm": 0.2942019999027252}
89
+ {"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 6771.968496516987, "train/update_time": 3085.1793869955873, "train/lr": 3.069872506157217e-05, "train/loss": 4.42058801651001, "train/global_grad_norm": 0.2945297360420227}
90
+ {"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 6806.725144091994, "train/update_time": 3119.802880719566, "train/lr": 2.5461924009435368e-05, "train/loss": 4.3490190505981445, "train/global_grad_norm": 0.28907138109207153}
91
+ {"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 6925.848634602007, "train/update_time": 3154.4264403765555, "train/lr": 2.0702942574950812e-05, "train/loss": 4.378040313720703, "train/global_grad_norm": 0.2773731052875519}
92
+ {"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 6960.596640536998, "train/update_time": 3189.047608219611, "train/lr": 1.642657264902142e-05, "train/loss": 4.41549015045166, "train/global_grad_norm": 0.26614734530448914}
93
+ {"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 7079.1912472849945, "train/update_time": 3223.6655410806416, "train/lr": 1.2637120173670358e-05, "train/loss": 4.391122341156006, "train/global_grad_norm": 0.2462288737297058}
94
+ {"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 7113.936619379994, "train/update_time": 3258.2859199085797, "train/lr": 9.338400806321978e-06, "train/loss": 4.345850467681885, "train/global_grad_norm": 0.24034488201141357}
95
+ {"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 7231.90277238001, "train/update_time": 3292.912114331586, "train/lr": 6.533736077758867e-06, "train/loss": 4.3816142082214355, "train/global_grad_norm": 0.25654977560043335}
96
+ {"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 7266.675624558004, "train/update_time": 3327.5515688046, "train/lr": 4.2259500476214406e-06, "train/loss": 4.384517669677734, "train/global_grad_norm": 0.25040480494499207}
97
+ {"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 7384.794404487009, "train/update_time": 3362.1696882025863, "train/lr": 2.417366460819359e-06, "train/loss": 4.396021366119385, "train/global_grad_norm": 0.23735737800598145}
98
+ {"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 7419.518831352005, "train/update_time": 3396.797874707612, "train/lr": 1.1098064077174619e-06, "train/loss": 4.376190185546875, "train/global_grad_norm": 0.22627969086170197}
 
 
metrics/jsonlines/train_data_info.jsonl CHANGED
@@ -1 +1 @@
1
- {"step": 0, "train_data_info/vocab_size": 50277, "train_data_info/global_tokens_per_batch": 2097152, "train_data_info/local_tokens_per_batch": 2097152, "train_data_info/batch_len": 2048, "train_data_info/seq_len": 2048, "train_data_info/total_tokens": 2097152000, "train_data_info/global_batch_size": 1024, "train_data_info/local_batch_size": 1024}
 
1
+ {"step": 0, "train_data_info/vocab_size": 50277, "train_data_info/global_tokens_per_batch": 2097152, "train_data_info/local_tokens_per_batch": 2097152, "train_data_info/batch_len": 2048, "train_data_info/seq_len": 2048, "train_data_info/total_tokens": 2055208960, "train_data_info/global_batch_size": 1024, "train_data_info/local_batch_size": 1024}
metrics/jsonlines/train_eval.jsonl CHANGED
@@ -1,20 +1,19 @@
1
- {"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 192.62442071689293, "train_eval/train_update_time": 178.92916662013158, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.315832251080282, "train_eval/perplexity_len_2048": 4088.086336187953, "train_eval/loss_avg_len_1024": 8.314682703830403, "train_eval/perplexity_len_1024": 4083.389587867191, "train_eval/loss_avg_len_512": 8.31643846753548, "train_eval/perplexity_len_512": 4090.5653527296113}
2
- {"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 379.894994311966, "train_eval/train_update_time": 352.43726607831195, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.5421175704316195, "train_eval/perplexity_len_2048": 693.7540968632107, "train_eval/loss_avg_len_1024": 6.542326334629033, "train_eval/perplexity_len_1024": 693.8989429992624, "train_eval/loss_avg_len_512": 6.550862833563443, "train_eval/perplexity_len_512": 699.847765520239}
3
- {"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 573.9008510387503, "train_eval/train_update_time": 525.9637654465623, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.869219247170731, "train_eval/perplexity_len_2048": 353.9725073167194, "train_eval/loss_avg_len_1024": 5.8752450715943265, "train_eval/perplexity_len_1024": 356.11192289429863, "train_eval/loss_avg_len_512": 5.893379021769796, "train_eval/perplexity_len_512": 362.6285462462386}
4
- {"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 761.2827736386098, "train_eval/train_update_time": 699.5182051258162, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.457584580215244, "train_eval/perplexity_len_2048": 234.5302506521384, "train_eval/loss_avg_len_1024": 5.464846401979521, "train_eval/perplexity_len_1024": 236.23956639223803, "train_eval/loss_avg_len_512": 5.485704620736287, "train_eval/perplexity_len_512": 241.21885193572683}
5
- {"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 955.4245519889519, "train_eval/train_update_time": 873.0867990762927, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.188593013403133, "train_eval/perplexity_len_2048": 179.21622063900182, "train_eval/loss_avg_len_1024": 5.198872200285332, "train_eval/perplexity_len_1024": 181.06791833216104, "train_eval/loss_avg_len_512": 5.222964342518463, "train_eval/perplexity_len_512": 185.48320567765194}
6
- {"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1142.651667741593, "train_eval/train_update_time": 1046.612968060188, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.984366537193437, "train_eval/perplexity_len_2048": 146.11098982692764, "train_eval/loss_avg_len_1024": 4.992764958058906, "train_eval/perplexity_len_1024": 147.3432587259727, "train_eval/loss_avg_len_512": 5.018679606585719, "train_eval/perplexity_len_512": 151.21151323577098}
7
- {"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1336.7052892716601, "train_eval/train_update_time": 1220.2067804792896, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.845627412213998, "train_eval/perplexity_len_2048": 127.1830531673866, "train_eval/loss_avg_len_1024": 4.8556679495035135, "train_eval/perplexity_len_1024": 128.46647167899766, "train_eval/loss_avg_len_512": 4.881734219078862, "train_eval/perplexity_len_512": 131.85913834962972}
8
- {"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1524.0658300309442, "train_eval/train_update_time": 1393.7609866121784, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.737207169464346, "train_eval/perplexity_len_2048": 114.11505222000658, "train_eval/loss_avg_len_1024": 4.749944012476372, "train_eval/perplexity_len_1024": 115.57781343047871, "train_eval/loss_avg_len_512": 4.7793406943716406, "train_eval/perplexity_len_512": 119.02584975715152}
9
- {"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1718.264860745985, "train_eval/train_update_time": 1567.3857053546235, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.653518524739175, "train_eval/perplexity_len_2048": 104.95361857807349, "train_eval/loss_avg_len_1024": 4.667017787478289, "train_eval/perplexity_len_1024": 106.38002108070093, "train_eval/loss_avg_len_512": 4.694421348995893, "train_eval/perplexity_len_512": 109.33552322319544}
10
- {"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1905.7458327147178, "train_eval/train_update_time": 1741.0785799045116, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.591110766174298, "train_eval/perplexity_len_2048": 98.60389522711793, "train_eval/loss_avg_len_1024": 4.606421255296846, "train_eval/perplexity_len_1024": 100.12518522224212, "train_eval/loss_avg_len_512": 4.633000077941105, "train_eval/perplexity_len_512": 102.82207609183887}
11
- {"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2099.768523535691, "train_eval/train_update_time": 1914.6644405797124, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.541835638668454, "train_eval/perplexity_len_2048": 93.86294052050336, "train_eval/loss_avg_len_1024": 4.556959745584936, "train_eval/perplexity_len_1024": 95.29332303224535, "train_eval/loss_avg_len_512": 4.585887224406106, "train_eval/perplexity_len_512": 98.09017654547195}
12
- {"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2287.0779132805765, "train_eval/train_update_time": 2088.2027000347152, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.501627793064781, "train_eval/perplexity_len_2048": 90.1637798869589, "train_eval/loss_avg_len_1024": 4.514900127739493, "train_eval/perplexity_len_1024": 91.3684403940161, "train_eval/loss_avg_len_512": 4.543377172959445, "train_eval/perplexity_len_512": 94.00774504386287}
13
- {"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2481.136555264704, "train_eval/train_update_time": 2261.7757655889727, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.470823071599252, "train_eval/perplexity_len_2048": 87.42865344904568, "train_eval/loss_avg_len_1024": 4.484475021407034, "train_eval/perplexity_len_1024": 88.63040952975192, "train_eval/loss_avg_len_512": 4.515942591798957, "train_eval/perplexity_len_512": 91.46373837302495}
14
- {"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2668.4236754849553, "train_eval/train_update_time": 2435.308184158057, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.440055598446761, "train_eval/perplexity_len_2048": 84.77965515993932, "train_eval/loss_avg_len_1024": 4.45260463735638, "train_eval/perplexity_len_1024": 85.8502618440064, "train_eval/loss_avg_len_512": 4.47954113858701, "train_eval/perplexity_len_512": 88.1941944767248}
15
- {"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2862.4817315279506, "train_eval/train_update_time": 2608.8764487369917, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.425654130871717, "train_eval/perplexity_len_2048": 83.56745339835582, "train_eval/loss_avg_len_1024": 4.439360764590346, "train_eval/perplexity_len_1024": 84.72076784601799, "train_eval/loss_avg_len_512": 4.470357606542384, "train_eval/perplexity_len_512": 87.38796793547316}
16
- {"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3049.8850160585716, "train_eval/train_update_time": 2782.4521241658367, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.4093976501172625, "train_eval/perplexity_len_2048": 82.21992342423559, "train_eval/loss_avg_len_1024": 4.424986717673674, "train_eval/perplexity_len_1024": 83.51169798504992, "train_eval/loss_avg_len_512": 4.45332414098084, "train_eval/perplexity_len_512": 85.91205364561085}
17
- {"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3243.933703172952, "train_eval/train_update_time": 2956.025475362316, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.402028011444163, "train_eval/perplexity_len_2048": 81.61621956884866, "train_eval/loss_avg_len_1024": 4.4154639075870366, "train_eval/perplexity_len_1024": 82.72020653778047, "train_eval/loss_avg_len_512": 4.443720440966718, "train_eval/perplexity_len_512": 85.0909292819203}
18
- {"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3431.2398387999274, "train_eval/train_update_time": 3129.5652695768513, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.393143219807316, "train_eval/perplexity_len_2048": 80.8942883172351, "train_eval/loss_avg_len_1024": 4.408541841555816, "train_eval/perplexity_len_1024": 82.14958901053713, "train_eval/loss_avg_len_512": 4.4367809793916235, "train_eval/perplexity_len_512": 84.50248814153937}
19
- {"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3625.2997248456813, "train_eval/train_update_time": 3303.175303027965, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.384034915962776, "train_eval/perplexity_len_2048": 80.16082393891234, "train_eval/loss_avg_len_1024": 4.396927037636997, "train_eval/perplexity_len_1024": 81.20095739175648, "train_eval/loss_avg_len_512": 4.4252965194482385, "train_eval/perplexity_len_512": 83.53757406530342}
20
- {"step": 2097152000, "train_eval/train_token_count": 2097152000, "train_eval/train_batch_count": 1000, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3812.7206143867224, "train_eval/train_update_time": 3476.8136685648933, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.382946988567537, "train_eval/perplexity_len_2048": 80.07366220394087, "train_eval/loss_avg_len_1024": 4.400711219589484, "train_eval/perplexity_len_1024": 81.50881872350878, "train_eval/loss_avg_len_512": 4.429669063854925, "train_eval/perplexity_len_512": 83.90364556623013}
 
1
+ {"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 345.1667736689851, "train_eval/train_update_time": 176.83618369704345, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.311106187544976, "train_eval/perplexity_len_2048": 4068.8113636742964, "train_eval/loss_avg_len_1024": 8.310348202390452, "train_eval/perplexity_len_1024": 4065.7284336192884, "train_eval/loss_avg_len_512": 8.311592859712693, "train_eval/perplexity_len_512": 4070.7920228474427}
2
+ {"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 686.6137485890067, "train_eval/train_update_time": 349.9022945231409, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.5199917088173605, "train_eval/perplexity_len_2048": 678.5727591454385, "train_eval/loss_avg_len_1024": 6.520226818787997, "train_eval/perplexity_len_1024": 678.7323171230164, "train_eval/loss_avg_len_512": 6.528576857324952, "train_eval/perplexity_len_512": 684.4234857990103}
3
+ {"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1111.7393740309926, "train_eval/train_update_time": 523.0301243920985, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.841284574894999, "train_eval/perplexity_len_2048": 344.22123473765475, "train_eval/loss_avg_len_1024": 5.846098003384759, "train_eval/perplexity_len_1024": 345.8821130885547, "train_eval/loss_avg_len_512": 5.863969622435542, "train_eval/perplexity_len_512": 352.1191534491973}
4
+ {"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1452.0386730069877, "train_eval/train_update_time": 696.2396770050982, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.426266934516625, "train_eval/perplexity_len_2048": 227.29913709313033, "train_eval/loss_avg_len_1024": 5.4348094870870405, "train_eval/perplexity_len_1024": 229.24916918814984, "train_eval/loss_avg_len_512": 5.45689960326592, "train_eval/perplexity_len_512": 234.36965784398754}
5
+ {"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1875.7684185520047, "train_eval/train_update_time": 869.3637212922331, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.1507950319237175, "train_eval/perplexity_len_2048": 172.56863336563757, "train_eval/loss_avg_len_1024": 5.159889481119135, "train_eval/perplexity_len_1024": 174.1452082080109, "train_eval/loss_avg_len_512": 5.183720081899664, "train_eval/perplexity_len_512": 178.3450366054668}
6
+ {"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2215.8320906269946, "train_eval/train_update_time": 1042.4422394274152, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.964018276003353, "train_eval/perplexity_len_2048": 143.16792986712343, "train_eval/loss_avg_len_1024": 4.973020432846388, "train_eval/perplexity_len_1024": 144.46256855394853, "train_eval/loss_avg_len_512": 4.997246175480032, "train_eval/perplexity_len_512": 148.00501753900681}
7
+ {"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2639.4180885159876, "train_eval/train_update_time": 1215.5536740703392, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.823759766676467, "train_eval/perplexity_len_2048": 124.432047871971, "train_eval/loss_avg_len_1024": 4.835695269690004, "train_eval/perplexity_len_1024": 125.92610538184168, "train_eval/loss_avg_len_512": 4.86114800382129, "train_eval/perplexity_len_512": 129.17240743830268}
8
+ {"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2979.819751534, "train_eval/train_update_time": 1388.5897308025742, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.721651509671638, "train_eval/perplexity_len_2048": 112.35365267475348, "train_eval/loss_avg_len_1024": 4.73291551488037, "train_eval/perplexity_len_1024": 113.62635923386715, "train_eval/loss_avg_len_512": 4.758365783066256, "train_eval/perplexity_len_512": 116.55529353968126}
9
+ {"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3405.285965647985, "train_eval/train_update_time": 1561.695067133638, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.641861000917015, "train_eval/perplexity_len_2048": 103.73722312419866, "train_eval/loss_avg_len_1024": 4.656030186131794, "train_eval/perplexity_len_1024": 105.21755785416276, "train_eval/loss_avg_len_512": 4.684107663316537, "train_eval/perplexity_len_512": 108.2136661880101}
10
+ {"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3745.514067212993, "train_eval/train_update_time": 1734.9429978527187, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.580971560182552, "train_eval/perplexity_len_2048": 97.60918134794116, "train_eval/loss_avg_len_1024": 4.593989343002031, "train_eval/perplexity_len_1024": 98.88814303333918, "train_eval/loss_avg_len_512": 4.622232269574961, "train_eval/perplexity_len_512": 101.72084723154869}
11
+ {"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4169.554661403992, "train_eval/train_update_time": 1907.990586347587, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.528221095977534, "train_eval/perplexity_len_2048": 92.59369918490609, "train_eval/loss_avg_len_1024": 4.539339858165622, "train_eval/perplexity_len_1024": 93.628971312955, "train_eval/loss_avg_len_512": 4.565371130157291, "train_eval/perplexity_len_512": 96.09825235868954}
12
+ {"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4510.643669799989, "train_eval/train_update_time": 2081.064080615528, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.489654328562519, "train_eval/perplexity_len_2048": 89.09064446424271, "train_eval/loss_avg_len_1024": 4.500532126115468, "train_eval/perplexity_len_1024": 90.06504451373176, "train_eval/loss_avg_len_512": 4.527173429640897, "train_eval/perplexity_len_512": 92.49674268118629}
13
+ {"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4935.988276930002, "train_eval/train_update_time": 2254.239767934516, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.46553069598358, "train_eval/perplexity_len_2048": 86.96717042332567, "train_eval/loss_avg_len_1024": 4.480923175239441, "train_eval/perplexity_len_1024": 88.3161663514099, "train_eval/loss_avg_len_512": 4.508107499028265, "train_eval/perplexity_len_512": 90.74991159895295}
14
+ {"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5277.0870255730115, "train_eval/train_update_time": 2427.330438608711, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.4373238263875106, "train_eval/perplexity_len_2048": 84.54837251634272, "train_eval/loss_avg_len_1024": 4.452436186897358, "train_eval/perplexity_len_1024": 85.83580154594769, "train_eval/loss_avg_len_512": 4.48180135221679, "train_eval/perplexity_len_512": 88.39375763988248}
15
+ {"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5700.939995903987, "train_eval/train_update_time": 2600.5040378217527, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.417549327801252, "train_eval/perplexity_len_2048": 82.89289292787865, "train_eval/loss_avg_len_1024": 4.432807433758753, "train_eval/perplexity_len_1024": 84.1673798737353, "train_eval/loss_avg_len_512": 4.461872710547686, "train_eval/perplexity_len_512": 86.64962691935622}
16
+ {"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6041.680216562003, "train_eval/train_update_time": 2773.58810844674, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.4033073637809235, "train_eval/perplexity_len_2048": 81.72070229092803, "train_eval/loss_avg_len_1024": 4.419483624122986, "train_eval/perplexity_len_1024": 83.05338751777346, "train_eval/loss_avg_len_512": 4.448178964478576, "train_eval/perplexity_len_512": 85.4711561862065}
17
+ {"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6465.570202954987, "train_eval/train_update_time": 2946.6770034248184, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.387876043058186, "train_eval/perplexity_len_2048": 80.46932396633433, "train_eval/loss_avg_len_1024": 4.399068052637449, "train_eval/perplexity_len_1024": 81.37499610284682, "train_eval/loss_avg_len_512": 4.427956537717109, "train_eval/perplexity_len_512": 83.76008134396787}
18
+ {"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6806.725144091994, "train_eval/train_update_time": 3119.802880719566, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.387491640440776, "train_eval/perplexity_len_2048": 80.43839729210855, "train_eval/loss_avg_len_1024": 4.402047658253414, "train_eval/perplexity_len_1024": 81.61782308289827, "train_eval/loss_avg_len_512": 4.431004746926119, "train_eval/perplexity_len_512": 84.01578912267684}
19
+ {"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7231.90277238001, "train_eval/train_update_time": 3292.912114331586, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.379340490187296, "train_eval/perplexity_len_2048": 79.78539679771428, "train_eval/loss_avg_len_1024": 4.395910125486225, "train_eval/perplexity_len_1024": 81.11842512267636, "train_eval/loss_avg_len_512": 4.423583207665869, "train_eval/perplexity_len_512": 83.39457069504783}
 
metrics/jsonlines/val.jsonl CHANGED
@@ -1,50 +1,49 @@
1
- {"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 75.34514997294173, "val/train_update_time": 74.81576991919428, "val/loss": 8.063457628035453, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.1555146356113255, "val/val_tokens_per_second": 332709.7929638187, "val/loss_avg_len_2048": 8.063457628035453, "val/perplexity_len_2048": 3176.253472917075, "val/loss_avg_len_1024": 8.072010921318084, "val/perplexity_len_1024": 3203.5374178641846, "val/loss_avg_len_512": 8.077348537478596, "val/perplexity_len_512": 3220.682386883231}
2
- {"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 151.4883820596151, "val/train_update_time": 144.2230770494789, "val/loss": 7.259054527005647, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.139596086926758, "val/val_tokens_per_second": 333572.4322909243, "val/loss_avg_len_2048": 7.259054527005647, "val/perplexity_len_2048": 1420.9124675451194, "val/loss_avg_len_1024": 7.26913984442316, "val/perplexity_len_1024": 1435.3153274292524, "val/loss_avg_len_512": 7.272929334567859, "val/perplexity_len_512": 1440.7647594803482}
3
- {"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 227.60141262365505, "val/train_update_time": 213.61536543117836, "val/loss": 6.787688369745388, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.167570411693305, "val/val_tokens_per_second": 332059.44371824723, "val/loss_avg_len_2048": 6.787688369745388, "val/perplexity_len_2048": 886.8610955329386, "val/loss_avg_len_1024": 6.796155215400271, "val/perplexity_len_1024": 894.4018898686921, "val/loss_avg_len_512": 6.801723655236885, "val/perplexity_len_512": 899.3962053497152}
4
- {"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 303.7567360489629, "val/train_update_time": 283.02572092972696, "val/loss": 6.416513715648092, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.147497796919197, "val/val_tokens_per_second": 333143.67367912683, "val/loss_avg_len_2048": 6.416513715648092, "val/perplexity_len_2048": 611.8662516960949, "val/loss_avg_len_1024": 6.424904120774008, "val/perplexity_len_1024": 617.0216551479506, "val/loss_avg_len_512": 6.434457702171057, "val/perplexity_len_512": 622.9446697051466}
5
- {"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 379.894994311966, "val/train_update_time": 352.43726607831195, "val/loss": 6.1374158065575175, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.152517178095877, "val/val_tokens_per_second": 332871.88653308706, "val/loss_avg_len_2048": 6.1374158065575175, "val/perplexity_len_2048": 462.85591481844574, "val/loss_avg_len_1024": 6.146038910266012, "val/perplexity_len_1024": 466.8644274453744, "val/loss_avg_len_512": 6.156553246997297, "val/perplexity_len_512": 471.7990941560618}
6
- {"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 456.560782736633, "val/train_update_time": 421.85785795981064, "val/loss": 5.904801503770985, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.174046735279262, "val/val_tokens_per_second": 331711.12688497745, "val/loss_avg_len_2048": 5.904801503770985, "val/perplexity_len_2048": 366.79441125354293, "val/loss_avg_len_1024": 5.9155219648145145, "val/perplexity_len_1024": 370.74776949925194, "val/loss_avg_len_512": 5.930047927273438, "val/perplexity_len_512": 376.1725423170029}
7
- {"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 532.7168368049897, "val/train_update_time": 491.2538420544006, "val/loss": 5.705157285411376, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.181066912133247, "val/val_tokens_per_second": 331334.3843568233, "val/loss_avg_len_2048": 5.705157285411376, "val/perplexity_len_2048": 300.4127268722159, "val/loss_avg_len_1024": 5.716063063982874, "val/perplexity_len_1024": 303.7068916176884, "val/loss_avg_len_512": 5.731860842535273, "val/perplexity_len_512": 308.5428842304303}
8
- {"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 608.8816099567339, "val/train_update_time": 560.655514428392, "val/loss": 5.553795693997759, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.204883788712323, "val/val_tokens_per_second": 330062.587751545, "val/loss_avg_len_2048": 5.553795693997759, "val/perplexity_len_2048": 258.21580634789547, "val/loss_avg_len_1024": 5.564789746439084, "val/perplexity_len_1024": 261.0703069771625, "val/loss_avg_len_512": 5.58031883784756, "val/perplexity_len_512": 265.1561341211652}
9
- {"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 685.1029845466837, "val/train_update_time": 630.085222561378, "val/loss": 5.417899778988678, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.160940235015005, "val/val_tokens_per_second": 332416.7938459173, "val/loss_avg_len_2048": 5.417899778988678, "val/perplexity_len_2048": 225.4052242426706, "val/loss_avg_len_1024": 5.429507627706044, "val/perplexity_len_1024": 228.0369387072969, "val/loss_avg_len_512": 5.447417815888301, "val/perplexity_len_512": 232.15791678018914}
10
- {"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 761.2827736386098, "val/train_update_time": 699.5182051258162, "val/loss": 5.30057149405405, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.174438063055277, "val/val_tokens_per_second": 331690.10346936656, "val/loss_avg_len_2048": 5.30057149405405, "val/perplexity_len_2048": 200.4513339922801, "val/loss_avg_len_1024": 5.312365423660726, "val/perplexity_len_1024": 202.82943895069067, "val/loss_avg_len_512": 5.329608726950363, "val/perplexity_len_512": 206.35721634080795}
11
- {"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 837.9679110529833, "val/train_update_time": 768.9465256030671, "val/loss": 5.197990671986248, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.2184945940971375, "val/val_tokens_per_second": 329340.1592636343, "val/loss_avg_len_2048": 5.197990671986248, "val/perplexity_len_2048": 180.908372170655, "val/loss_avg_len_1024": 5.210480375316925, "val/perplexity_len_1024": 183.1820331887663, "val/loss_avg_len_512": 5.228735287578776, "val/perplexity_len_512": 186.55671366571542}
12
- {"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 914.1905289897695, "val/train_update_time": 838.3647430227138, "val/loss": 5.112300290121231, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.2173577500507236, "val/val_tokens_per_second": 329400.3791213867, "val/loss_avg_len_2048": 5.112300290121231, "val/perplexity_len_2048": 166.05188339934293, "val/loss_avg_len_1024": 5.1255862217117105, "val/perplexity_len_1024": 168.27275788769515, "val/loss_avg_len_512": 5.144388560552151, "val/perplexity_len_512": 171.46661116237564}
13
- {"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 990.4083167887293, "val/train_update_time": 907.7796153570525, "val/loss": 5.0311500638867725, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.124237969983369, "val/val_tokens_per_second": 334408.95178107545, "val/loss_avg_len_2048": 5.0311500638867725, "val/perplexity_len_2048": 153.10899660758514, "val/loss_avg_len_1024": 5.044079258794897, "val/perplexity_len_1024": 155.1014251603314, "val/loss_avg_len_512": 5.064098207587376, "val/perplexity_len_512": 158.23768017318574}
14
- {"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 1066.5353656006046, "val/train_update_time": 977.1970872837119, "val/loss": 4.965538669798989, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.118477249983698, "val/val_tokens_per_second": 334723.8073011478, "val/loss_avg_len_2048": 4.965538669798989, "val/perplexity_len_2048": 143.38576705662052, "val/loss_avg_len_1024": 4.9786326410686605, "val/perplexity_len_1024": 145.27560189118765, "val/loss_avg_len_512": 4.999514865677805, "val/perplexity_len_512": 148.34117624731914}
15
- {"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 1142.651667741593, "val/train_update_time": 1046.612968060188, "val/loss": 4.902305538183544, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.1723534651100636, "val/val_tokens_per_second": 331802.1256521609, "val/loss_avg_len_2048": 4.902305538183544, "val/perplexity_len_2048": 134.5997470831922, "val/loss_avg_len_1024": 4.916661380067468, "val/perplexity_len_1024": 136.54597622406206, "val/loss_avg_len_512": 4.937983042396605, "val/perplexity_len_512": 139.48862300231212}
16
- {"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 1219.3308540079743, "val/train_update_time": 1116.0336452010088, "val/loss": 4.8476240368089645, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.166513349860907, "val/val_tokens_per_second": 332116.3652465287, "val/loss_avg_len_2048": 4.8476240368089645, "val/perplexity_len_2048": 127.43724365645022, "val/loss_avg_len_1024": 4.862438888111152, "val/perplexity_len_1024": 129.33926174137744, "val/loss_avg_len_512": 4.884251133620738, "val/perplexity_len_512": 132.19143453755416}
17
- {"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 1295.5154801327735, "val/train_update_time": 1185.4742206893861, "val/loss": 4.799447789161467, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.1656147059984505, "val/val_tokens_per_second": 332164.77150405233, "val/loss_avg_len_2048": 4.799447789161467, "val/perplexity_len_2048": 121.44333667227617, "val/loss_avg_len_1024": 4.814338387086057, "val/perplexity_len_1024": 123.26523145670933, "val/loss_avg_len_512": 4.836129650367424, "val/perplexity_len_512": 125.98081713080528}
18
- {"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 1371.6966382889077, "val/train_update_time": 1254.908473377116, "val/loss": 4.759981225318275, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.152246905025095, "val/val_tokens_per_second": 332886.5098582461, "val/loss_avg_len_2048": 4.759981225318275, "val/perplexity_len_2048": 116.74373405196414, "val/loss_avg_len_1024": 4.775360995794646, "val/perplexity_len_1024": 118.55310406886426, "val/loss_avg_len_512": 4.797722522171214, "val/perplexity_len_512": 121.2339951292825}
19
- {"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 1447.851369981654, "val/train_update_time": 1324.3309777188115, "val/loss": 4.7207601572135465, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.199615178164095, "val/val_tokens_per_second": 330343.08439229266, "val/loss_avg_len_2048": 4.7207601572135465, "val/perplexity_len_2048": 112.2535505900136, "val/loss_avg_len_1024": 4.735622255532071, "val/perplexity_len_1024": 113.9343329339478, "val/loss_avg_len_512": 4.759110804993659, "val/perplexity_len_512": 116.64216214461288}
20
- {"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 1524.0658300309442, "val/train_update_time": 1393.7609866121784, "val/loss": 4.68513719416922, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.1629162901081145, "val/val_tokens_per_second": 332310.20893260784, "val/loss_avg_len_2048": 4.68513719416922, "val/perplexity_len_2048": 108.32513286537875, "val/loss_avg_len_1024": 4.700483996457421, "val/perplexity_len_1024": 110.0003993738755, "val/loss_avg_len_512": 4.724669764202089, "val/perplexity_len_512": 112.69327687426379}
21
- {"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 1600.7871080269106, "val/train_update_time": 1463.2190964017063, "val/loss": 4.653494824603666, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.228256469126791, "val/val_tokens_per_second": 328823.9670527139, "val/loss_avg_len_2048": 4.653494824603666, "val/perplexity_len_2048": 104.9511311925668, "val/loss_avg_len_1024": 4.668328476385213, "val/perplexity_len_1024": 106.51954360959127, "val/loss_avg_len_512": 4.6926654254719615, "val/perplexity_len_512": 109.14370686268144}
22
- {"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 1677.0360736018047, "val/train_update_time": 1532.649858857505, "val/loss": 4.633331594252493, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.194473575334996, "val/val_tokens_per_second": 330617.2792720719, "val/loss_avg_len_2048": 4.633331594252493, "val/perplexity_len_2048": 102.8561689380891, "val/loss_avg_len_1024": 4.648075810411945, "val/perplexity_len_1024": 104.38393772592255, "val/loss_avg_len_512": 4.671412990957498, "val/perplexity_len_512": 106.84861194121774}
23
- {"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 1753.2634326359257, "val/train_update_time": 1602.0879544354975, "val/loss": 4.60316979375016, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.166054365690798, "val/val_tokens_per_second": 332141.0870775801, "val/loss_avg_len_2048": 4.60316979375016, "val/perplexity_len_2048": 99.80016072131704, "val/loss_avg_len_1024": 4.618668203471229, "val/perplexity_len_1024": 101.35895269898852, "val/loss_avg_len_512": 4.643125790521503, "val/perplexity_len_512": 103.86851189443864}
24
- {"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 1829.5858452958055, "val/train_update_time": 1671.6610745475627, "val/loss": 4.578808519658168, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.15863545704633, "val/val_tokens_per_second": 332541.19590027124, "val/loss_avg_len_2048": 4.578808519658168, "val/perplexity_len_2048": 97.39827691278425, "val/loss_avg_len_1024": 4.593942633476109, "val/perplexity_len_1024": 98.8835241229332, "val/loss_avg_len_512": 4.618188925094903, "val/perplexity_len_512": 101.31038518432233}
25
- {"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 1905.7458327147178, "val/train_update_time": 1741.0785799045116, "val/loss": 4.558685332912952, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.169239658862352, "val/val_tokens_per_second": 331969.5964571531, "val/loss_avg_len_2048": 4.558685332912952, "val/perplexity_len_2048": 95.4579019397147, "val/loss_avg_len_1024": 4.573577723344788, "val/perplexity_len_1024": 96.8901365130558, "val/loss_avg_len_512": 4.598281057875603, "val/perplexity_len_512": 99.31345475312078}
26
- {"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 1982.4242434157059, "val/train_update_time": 1810.507148906123, "val/loss": 4.538695250431076, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.162291600368917, "val/val_tokens_per_second": 332343.896202087, "val/loss_avg_len_2048": 4.538695250431076, "val/perplexity_len_2048": 93.56863680200442, "val/loss_avg_len_1024": 4.5537685374030845, "val/perplexity_len_1024": 94.98970690890154, "val/loss_avg_len_512": 4.579602122461424, "val/perplexity_len_512": 97.47560313747051}
27
- {"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 2058.594140216708, "val/train_update_time": 1879.9355728020892, "val/loss": 4.521687246968504, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.149397691246122, "val/val_tokens_per_second": 333040.74688735744, "val/loss_avg_len_2048": 4.521687246968504, "val/perplexity_len_2048": 91.99067810489285, "val/loss_avg_len_1024": 4.537178126424551, "val/perplexity_len_1024": 93.42678920425934, "val/loss_avg_len_512": 4.56306814334169, "val/perplexity_len_512": 95.87719399545632}
28
- {"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 2134.75423188461, "val/train_update_time": 1949.358494934626, "val/loss": 4.506162730196957, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.153092973865569, "val/val_tokens_per_second": 332840.73695921764, "val/loss_avg_len_2048": 4.506162730196957, "val/perplexity_len_2048": 90.57359550202705, "val/loss_avg_len_1024": 4.521539099274204, "val/perplexity_len_1024": 91.97705090747833, "val/loss_avg_len_512": 4.5466752190310515, "val/perplexity_len_512": 94.31829874682157}
29
- {"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 2210.907703721896, "val/train_update_time": 2018.7735858242959, "val/loss": 4.492135846415069, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.156746225897223, "val/val_tokens_per_second": 332643.2379794158, "val/loss_avg_len_2048": 4.492135846415069, "val/perplexity_len_2048": 89.31199902303008, "val/loss_avg_len_1024": 4.50722113605868, "val/perplexity_len_1024": 90.66950987563759, "val/loss_avg_len_512": 4.533337791632861, "val/perplexity_len_512": 93.06868711127815}
30
- {"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 2287.0779132805765, "val/train_update_time": 2088.2027000347152, "val/loss": 4.477499908717815, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.162066364195198, "val/val_tokens_per_second": 332356.0440536542, "val/loss_avg_len_2048": 4.477499908717815, "val/perplexity_len_2048": 88.01435346353732, "val/loss_avg_len_1024": 4.492435104093515, "val/perplexity_len_1024": 89.33873032408911, "val/loss_avg_len_512": 4.519270285155624, "val/perplexity_len_512": 91.76860862360576}
31
- {"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 2363.7743464875966, "val/train_update_time": 2157.6409200155176, "val/loss": 4.466432400722988, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.170192903839052, "val/val_tokens_per_second": 331918.30983529676, "val/loss_avg_len_2048": 4.466432400722988, "val/perplexity_len_2048": 87.04562449895947, "val/loss_avg_len_1024": 4.481367357213051, "val/perplexity_len_1024": 88.35540351405655, "val/loss_avg_len_512": 4.507999668482691, "val/perplexity_len_512": 90.74012651404949}
32
- {"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 2439.9411435807124, "val/train_update_time": 2227.0531149646267, "val/loss": 4.454004628794268, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.17543909791857, "val/val_tokens_per_second": 331636.3367084743, "val/loss_avg_len_2048": 4.454004628794268, "val/perplexity_len_2048": 85.97053564703114, "val/loss_avg_len_1024": 4.468851906768977, "val/perplexity_len_1024": 87.25648690227555, "val/loss_avg_len_512": 4.4957866591122, "val/perplexity_len_512": 89.63865632255882}
33
- {"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 2516.129153979942, "val/train_update_time": 2296.475023902487, "val/loss": 4.443331138379872, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.151692705694586, "val/val_tokens_per_second": 332916.4992432373, "val/loss_avg_len_2048": 4.443331138379872, "val/perplexity_len_2048": 85.05780961023889, "val/loss_avg_len_1024": 4.458127858203836, "val/perplexity_len_1024": 86.32574368684402, "val/loss_avg_len_512": 4.4850040839873255, "val/perplexity_len_512": 88.67731296924332}
34
- {"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 2592.2952946918085, "val/train_update_time": 2365.9068522513844, "val/loss": 4.434330381236039, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.142985037062317, "val/val_tokens_per_second": 333388.4076949322, "val/loss_avg_len_2048": 4.434330381236039, "val/perplexity_len_2048": 84.29566002975973, "val/loss_avg_len_1024": 4.449462922280654, "val/perplexity_len_1024": 85.58096802584274, "val/loss_avg_len_512": 4.4760593949873, "val/perplexity_len_512": 87.88765885346187}
35
- {"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 2668.4236754849553, "val/train_update_time": 2435.308184158057, "val/loss": 4.425996998540591, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.174667646177113, "val/val_tokens_per_second": 331677.770749, "val/loss_avg_len_2048": 4.425996998540591, "val/perplexity_len_2048": 83.59611088887902, "val/loss_avg_len_1024": 4.441106808731332, "val/perplexity_len_1024": 84.86882326436888, "val/loss_avg_len_512": 4.468099794309586, "val/perplexity_len_512": 87.19088488445355}
36
- {"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 2745.12359233154, "val/train_update_time": 2504.7536877635866, "val/loss": 4.418173751161899, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.165099292062223, "val/val_tokens_per_second": 332192.54110584565, "val/loss_avg_len_2048": 4.418173751161899, "val/perplexity_len_2048": 82.94466935017593, "val/loss_avg_len_1024": 4.433283036370762, "val/perplexity_len_1024": 84.20741962019845, "val/loss_avg_len_512": 4.460683553887531, "val/perplexity_len_512": 86.54664817946744}
37
- {"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 2821.2778452136554, "val/train_update_time": 2574.1644478179514, "val/loss": 4.4118071908499115, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.198323728982359, "val/val_tokens_per_second": 330411.91288926767, "val/loss_avg_len_2048": 4.4118071908499115, "val/perplexity_len_2048": 82.41827455035343, "val/loss_avg_len_1024": 4.426996861943975, "val/perplexity_len_1024": 83.67973738138976, "val/loss_avg_len_512": 4.45429577216506, "val/perplexity_len_512": 85.995569042545}
38
- {"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 2897.4854577975348, "val/train_update_time": 2643.5932636465877, "val/loss": 4.404871653239242, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.196773157920688, "val/val_tokens_per_second": 330494.5893302961, "val/loss_avg_len_2048": 4.404871653239242, "val/perplexity_len_2048": 81.8486371615448, "val/loss_avg_len_1024": 4.420073672283441, "val/perplexity_len_1024": 83.10240747694402, "val/loss_avg_len_512": 4.447471359204501, "val/perplexity_len_512": 85.41069773818887}
39
- {"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 2973.6870251647197, "val/train_update_time": 2713.0167279425077, "val/loss": 4.400027697131039, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.18758595502004, "val/val_tokens_per_second": 330985.301034637, "val/loss_avg_len_2048": 4.400027697131039, "val/perplexity_len_2048": 81.45312465159279, "val/loss_avg_len_1024": 4.415215738720261, "val/perplexity_len_1024": 82.69968050493104, "val/loss_avg_len_512": 4.442661017941312, "val/perplexity_len_512": 85.00082972738426}
40
- {"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 3049.8850160585716, "val/train_update_time": 2782.4521241658367, "val/loss": 4.3953972183647565, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.156525855883956, "val/val_tokens_per_second": 332655.1447912254, "val/loss_avg_len_2048": 4.3953972183647565, "val/perplexity_len_2048": 81.07682957298736, "val/loss_avg_len_1024": 4.410693335567601, "val/perplexity_len_1024": 82.32652362799742, "val/loss_avg_len_512": 4.438157716050744, "val/perplexity_len_512": 84.61890593468434}
41
- {"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 3126.5600167936645, "val/train_update_time": 2851.8765539969318, "val/loss": 4.391358733704314, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.179119745269418, "val/val_tokens_per_second": 331438.794590103, "val/loss_avg_len_2048": 4.391358733704314, "val/perplexity_len_2048": 80.75006230684902, "val/loss_avg_len_1024": 4.40662108054664, "val/perplexity_len_1024": 81.99195072424767, "val/loss_avg_len_512": 4.4341642428375785, "val/perplexity_len_512": 84.28165644710379}
42
- {"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 3202.7426801156253, "val/train_update_time": 2921.299470563419, "val/loss": 4.388228010935243, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.171495800837874, "val/val_tokens_per_second": 331848.23681188485, "val/loss_avg_len_2048": 4.388228010935243, "val/perplexity_len_2048": 80.4976515683695, "val/loss_avg_len_1024": 4.403468796270899, "val/perplexity_len_1024": 81.73389573227782, "val/loss_avg_len_512": 4.431057547293603, "val/perplexity_len_512": 84.02022530433199}
43
- {"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 3278.91815946484, "val/train_update_time": 2990.718675683718, "val/loss": 4.385408209751826, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.151151229161769, "val/val_tokens_per_second": 332945.80537879013, "val/loss_avg_len_2048": 4.385408209751826, "val/perplexity_len_2048": 80.27098392425174, "val/loss_avg_len_1024": 4.400786334525794, "val/perplexity_len_1024": 81.51494148318838, "val/loss_avg_len_512": 4.428590759981423, "val/perplexity_len_512": 83.81322070171765}
44
- {"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 3355.086739927996, "val/train_update_time": 3060.151473065838, "val/loss": 4.383290394572541, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.158799132332206, "val/val_tokens_per_second": 332532.3583372764, "val/loss_avg_len_2048": 4.383290394572541, "val/perplexity_len_2048": 80.10116470237571, "val/loss_avg_len_1024": 4.3987123784068975, "val/perplexity_len_1024": 81.34605826024934, "val/loss_avg_len_512": 4.4265458052326, "val/perplexity_len_512": 83.64200158536958}
45
- {"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 3431.2398387999274, "val/train_update_time": 3129.5652695768513, "val/loss": 4.381882622797973, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.165901881176978, "val/val_tokens_per_second": 332149.3010214862, "val/loss_avg_len_2048": 4.381882622797973, "val/perplexity_len_2048": 79.98847987946432, "val/loss_avg_len_1024": 4.397319140967912, "val/perplexity_len_1024": 81.2328028005607, "val/loss_avg_len_512": 4.42517344260402, "val/perplexity_len_512": 83.52729315699766}
46
- {"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 3507.942943904549, "val/train_update_time": 3199.003928860184, "val/loss": 4.38069165666448, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.16073360806331, "val/val_tokens_per_second": 332427.9428864657, "val/loss_avg_len_2048": 4.38069165666448, "val/perplexity_len_2048": 79.89327301418783, "val/loss_avg_len_1024": 4.396049066869542, "val/perplexity_len_1024": 81.12969661189547, "val/loss_avg_len_512": 4.423885196769611, "val/perplexity_len_512": 83.41975874977655}
47
- {"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 3584.1315822978504, "val/train_update_time": 3268.4432333246805, "val/loss": 4.379863000438549, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.141148541122675, "val/val_tokens_per_second": 333488.1067093683, "val/loss_avg_len_2048": 4.379863000438549, "val/perplexity_len_2048": 79.82709637872192, "val/loss_avg_len_1024": 4.3952099743997675, "val/perplexity_len_1024": 81.06164984714975, "val/loss_avg_len_512": 4.422956028172746, "val/perplexity_len_512": 83.3422837288405}
48
- {"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 3660.2930417479947, "val/train_update_time": 3337.873345853295, "val/loss": 4.379406773897819, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.161604348570108, "val/val_tokens_per_second": 332380.9651093986, "val/loss_avg_len_2048": 4.379406773897819, "val/perplexity_len_2048": 79.79068544513324, "val/loss_avg_len_1024": 4.394829520185478, "val/perplexity_len_1024": 81.03081546675509, "val/loss_avg_len_512": 4.422623653534799, "val/perplexity_len_512": 83.31458747048238}
49
- {"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 3736.479068840854, "val/train_update_time": 3407.321344117634, "val/loss": 4.3791818157639355, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.1622035671025515, "val/val_tokens_per_second": 332348.6440684015, "val/loss_avg_len_2048": 4.3791818157639355, "val/perplexity_len_2048": 79.772737900233, "val/loss_avg_len_1024": 4.394583802897483, "val/perplexity_len_1024": 81.01090724053236, "val/loss_avg_len_512": 4.422379010090604, "val/perplexity_len_512": 83.2942075958549}
50
- {"step": 2097152000, "val/train_token_count": 2097152000, "val/train_batch_count": 1000, "val/train_flop_count": 0, "val/train_total_time": 3812.7206143867224, "val/train_update_time": 3476.8136685648933, "val/loss": 4.37912868698081, "val/val_token_count": 2048000, "val/val_seq_count": 1000, "val/val_time": 6.211853195913136, "val/val_tokens_per_second": 329692.2730478253, "val/loss_avg_len_2048": 4.37912868698081, "val/perplexity_len_2048": 79.76849978432571, "val/loss_avg_len_1024": 4.394532114933245, "val/perplexity_len_1024": 81.00672005987042, "val/loss_avg_len_512": 4.422331890467555, "val/perplexity_len_512": 83.29028289665668}
 
1
+ {"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 73.29413136799121, "val/train_update_time": 72.96587482298492, "val/loss": 8.066625093784602, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.72365917899879, "val/val_tokens_per_second": 489228.4976750561, "val/loss_avg_len_2048": 8.066625093784602, "val/perplexity_len_2048": 3186.3300972596303, "val/loss_avg_len_1024": 8.063385806354136, "val/perplexity_len_1024": 3176.0253572442853, "val/loss_avg_len_512": 8.063155076403358, "val/perplexity_len_512": 3175.2926376033747}
2
+ {"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 226.53613719300483, "val/train_update_time": 142.21563291802886, "val/loss": 7.245292661319813, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.87901507099741, "val/val_tokens_per_second": 488322.3767628933, "val/loss_avg_len_2048": 7.245292661319813, "val/perplexity_len_2048": 1401.4919984598723, "val/loss_avg_len_1024": 7.242302788104816, "val/perplexity_len_1024": 1397.3079730422762, "val/loss_avg_len_512": 7.245893973641005, "val/perplexity_len_512": 1402.334986290682}
3
+ {"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 379.9190621979942, "val/train_update_time": 211.44736236400786, "val/loss": 6.768975664408669, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.94667539399234, "val/val_tokens_per_second": 487928.7929838769, "val/loss_avg_len_2048": 6.768975664408669, "val/perplexity_len_2048": 870.4198351684087, "val/loss_avg_len_1024": 6.765464797357005, "val/perplexity_len_1024": 867.369265054543, "val/loss_avg_len_512": 6.77120095622167, "val/perplexity_len_512": 872.3589300272525}
4
+ {"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 533.3516867249855, "val/train_update_time": 280.6721391470637, "val/loss": 6.397165876515117, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.74760113700177, "val/val_tokens_per_second": 489088.6359000778, "val/loss_avg_len_2048": 6.397165876515117, "val/perplexity_len_2048": 600.141749520851, "val/loss_avg_len_1024": 6.395533386002807, "val/perplexity_len_1024": 599.162823070194, "val/loss_avg_len_512": 6.406499122676719, "val/perplexity_len_512": 605.7692407307555}
5
+ {"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 686.6137485890067, "val/train_update_time": 349.9022945231409, "val/loss": 6.105244160742686, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.8132187850133, "val/val_tokens_per_second": 488705.726778794, "val/loss_avg_len_2048": 6.105244160742686, "val/perplexity_len_2048": 448.2020614450007, "val/loss_avg_len_1024": 6.106694530989509, "val/perplexity_len_1024": 448.85259202054465, "val/loss_avg_len_512": 6.122553787205275, "val/perplexity_len_512": 456.0278066748302}
6
+ {"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 840.405497522006, "val/train_update_time": 419.1452622152283, "val/loss": 5.87325499422655, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.90133277099812, "val/val_tokens_per_second": 488192.48332797043, "val/loss_avg_len_2048": 5.87325499422655, "val/perplexity_len_2048": 355.4039373228419, "val/loss_avg_len_1024": 5.876565324747097, "val/perplexity_len_1024": 356.5823912835879, "val/loss_avg_len_512": 5.895211068405025, "val/perplexity_len_512": 363.2935075883324}
7
+ {"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 993.8301178629918, "val/train_update_time": 488.4094896201277, "val/loss": 5.695340925325057, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.15111422500922, "val/val_tokens_per_second": 492597.1273116209, "val/loss_avg_len_2048": 5.695340925325057, "val/perplexity_len_2048": 297.4781941501539, "val/loss_avg_len_1024": 5.700811359035084, "val/perplexity_len_1024": 299.10998813246323, "val/loss_avg_len_512": 5.721176919134427, "val/perplexity_len_512": 305.26398264558094}
8
+ {"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 1146.4763450330065, "val/train_update_time": 557.6233584931178, "val/loss": 5.5228234780823815, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.12564284997643, "val/val_tokens_per_second": 492748.0690155242, "val/loss_avg_len_2048": 5.5228234780823815, "val/perplexity_len_2048": 250.34087223176758, "val/loss_avg_len_1024": 5.5305087140662135, "val/perplexity_len_1024": 252.27221280945514, "val/loss_avg_len_512": 5.553297413158045, "val/perplexity_len_512": 258.08717440915774}
9
+ {"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 1299.0948956199863, "val/train_update_time": 626.843196902104, "val/loss": 5.387703346484853, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.26296444798936, "val/val_tokens_per_second": 491935.4033519414, "val/loss_avg_len_2048": 5.387703346484853, "val/perplexity_len_2048": 218.70052899266585, "val/loss_avg_len_1024": 5.396013065259485, "val/perplexity_len_1024": 220.52544063467434, "val/loss_avg_len_512": 5.419521161240525, "val/perplexity_len_512": 225.77098871461365}
10
+ {"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 1452.0386730069877, "val/train_update_time": 696.2396770050982, "val/loss": 5.267773023286882, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.21875834499951, "val/val_tokens_per_second": 492196.7212030774, "val/loss_avg_len_2048": 5.267773023286882, "val/perplexity_len_2048": 193.9834843624152, "val/loss_avg_len_1024": 5.2777240617804235, "val/perplexity_len_1024": 195.92345784933923, "val/loss_avg_len_512": 5.30280932833273, "val/perplexity_len_512": 200.9004131536537}
11
+ {"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 1605.2216201199917, "val/train_update_time": 765.4900175441289, "val/loss": 5.168812291965983, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.11411714999122, "val/val_tokens_per_second": 492816.3999634607, "val/loss_avg_len_2048": 5.168812291965983, "val/perplexity_len_2048": 175.70602604634044, "val/loss_avg_len_1024": 5.179184607096157, "val/perplexity_len_1024": 177.53798874168763, "val/loss_avg_len_512": 5.204846120559331, "val/perplexity_len_512": 182.15284103478766}
12
+ {"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 1757.860138426011, "val/train_update_time": 834.7357104731782, "val/loss": 5.07917964521309, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.13903681200463, "val/val_tokens_per_second": 492668.6857416863, "val/loss_avg_len_2048": 5.07917964521309, "val/perplexity_len_2048": 160.64221824644585, "val/loss_avg_len_1024": 5.090464194297372, "val/perplexity_len_1024": 162.46525999969987, "val/loss_avg_len_512": 5.1169825646744105, "val/perplexity_len_512": 166.8312069871893}
13
+ {"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 1910.5157937250042, "val/train_update_time": 903.9701039092615, "val/loss": 5.006576894561109, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.17826742198667, "val/val_tokens_per_second": 492436.32104283245, "val/loss_avg_len_2048": 5.006576894561109, "val/perplexity_len_2048": 149.3924736958235, "val/loss_avg_len_1024": 5.018725027570501, "val/perplexity_len_1024": 151.21838156759458, "val/loss_avg_len_512": 5.045682267305069, "val/perplexity_len_512": 155.3502534484518}
14
+ {"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 2063.1963952730002, "val/train_update_time": 973.2086714253237, "val/loss": 4.939421573723574, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.11001823598053, "val/val_tokens_per_second": 492840.70524084335, "val/loss_avg_len_2048": 4.939421573723574, "val/perplexity_len_2048": 139.6894261524396, "val/loss_avg_len_1024": 4.951981827717834, "val/perplexity_len_1024": 141.45502580357572, "val/loss_avg_len_512": 4.979078589006793, "val/perplexity_len_512": 145.3404016939073}
15
+ {"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 2215.8320906269946, "val/train_update_time": 1042.4422394274152, "val/loss": 4.880953582810099, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.08043931302382, "val/val_tokens_per_second": 493016.17009599815, "val/loss_avg_len_2048": 4.880953582810099, "val/perplexity_len_2048": 131.75624449043048, "val/loss_avg_len_1024": 4.8935686632201545, "val/perplexity_len_1024": 133.42888819459714, "val/loss_avg_len_512": 4.920950583540276, "val/perplexity_len_512": 137.13290753355847}
16
+ {"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 2368.8792440719844, "val/train_update_time": 1111.6830330224184, "val/loss": 4.8325701735513285, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.09534997900482, "val/val_tokens_per_second": 492927.70305858395, "val/loss_avg_len_2048": 4.8325701735513285, "val/perplexity_len_2048": 125.53318846749659, "val/loss_avg_len_1024": 4.8456610950137025, "val/perplexity_len_1024": 127.18733712083956, "val/loss_avg_len_512": 4.873299267485551, "val/perplexity_len_512": 130.75159052334683}
17
+ {"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 2521.492483378999, "val/train_update_time": 1180.931988580327, "val/loss": 4.789877079297254, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.1720448200067, "val/val_tokens_per_second": 492473.1631720955, "val/loss_avg_len_2048": 4.789877079297254, "val/perplexity_len_2048": 120.28658204324753, "val/loss_avg_len_1024": 4.803124652416493, "val/perplexity_len_1024": 121.89068913738296, "val/loss_avg_len_512": 4.830635423418787, "val/perplexity_len_512": 125.29054791458057}
18
+ {"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 2674.1708125350124, "val/train_update_time": 1250.16888089836, "val/loss": 4.743234224908077, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.35634245700203, "val/val_tokens_per_second": 491384.3241278074, "val/loss_avg_len_2048": 4.743234224908077, "val/perplexity_len_2048": 114.80490677425529, "val/loss_avg_len_1024": 4.7571246879515705, "val/perplexity_len_1024": 116.41072706320881, "val/loss_avg_len_512": 4.7850962978590275, "val/perplexity_len_512": 119.71289062387633}
19
+ {"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 2827.0179851859866, "val/train_update_time": 1319.3769752274675, "val/loss": 4.708184829720389, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.28989867499331, "val/val_tokens_per_second": 491776.3216381208, "val/loss_avg_len_2048": 4.708184829720389, "val/perplexity_len_2048": 110.85076416463089, "val/loss_avg_len_1024": 4.722111135172844, "val/perplexity_len_1024": 112.40530514816416, "val/loss_avg_len_512": 4.750208974112105, "val/perplexity_len_512": 115.6084411743891}
20
+ {"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 2979.819751534, "val/train_update_time": 1388.5897308025742, "val/loss": 4.67620419087892, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.66028405699763, "val/val_tokens_per_second": 489599.1026290804, "val/loss_avg_len_2048": 4.67620419087892, "val/perplexity_len_2048": 107.36177335014543, "val/loss_avg_len_1024": 4.690579603694846, "val/perplexity_len_1024": 108.91628980025381, "val/loss_avg_len_512": 4.718876949522924, "val/perplexity_len_512": 112.04235276734202}
21
+ {"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 3133.447895410005, "val/train_update_time": 1457.8368608065357, "val/loss": 4.64510413077313, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.88997037999798, "val/val_tokens_per_second": 488258.60605818214, "val/loss_avg_len_2048": 4.64510413077313, "val/perplexity_len_2048": 104.07420254835952, "val/loss_avg_len_1024": 4.659685420518555, "val/perplexity_len_1024": 105.60285643905125, "val/loss_avg_len_512": 4.688136402057856, "val/perplexity_len_512": 108.65051015156381}
22
+ {"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 3286.8489470180066, "val/train_update_time": 1527.068307258567, "val/loss": 4.622256983703119, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.67413021399989, "val/val_tokens_per_second": 489518.0851625608, "val/loss_avg_len_2048": 4.622256983703119, "val/perplexity_len_2048": 101.72336120466873, "val/loss_avg_len_1024": 4.637227634538524, "val/perplexity_len_1024": 103.25768236429546, "val/loss_avg_len_512": 4.66592828974668, "val/perplexity_len_512": 106.26418340294687}
23
+ {"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 3440.121589771996, "val/train_update_time": 1596.3996783556067, "val/loss": 4.594907666088967, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.08767745201476, "val/val_tokens_per_second": 492973.2212536021, "val/loss_avg_len_2048": 4.594907666088967, "val/perplexity_len_2048": 98.97899600791816, "val/loss_avg_len_1024": 4.609972692025034, "val/perplexity_len_1024": 100.48140565521302, "val/loss_avg_len_512": 4.6386030704602605, "val/perplexity_len_512": 103.39980440729035}
24
+ {"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 3592.7624590400083, "val/train_update_time": 1665.6897427196673, "val/loss": 4.575895742433099, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.21338147099596, "val/val_tokens_per_second": 492228.52473885607, "val/loss_avg_len_2048": 4.575895742433099, "val/perplexity_len_2048": 97.1149902058559, "val/loss_avg_len_1024": 4.590654579546535, "val/perplexity_len_1024": 98.55892370715293, "val/loss_avg_len_512": 4.619258114893082, "val/perplexity_len_512": 101.41876314259017}
25
+ {"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 3745.514067212993, "val/train_update_time": 1734.9429978527187, "val/loss": 4.556744415998249, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.29024019200006, "val/val_tokens_per_second": 491774.30519565445, "val/loss_avg_len_2048": 4.556744415998249, "val/perplexity_len_2048": 95.27280576944895, "val/loss_avg_len_1024": 4.572066484152572, "val/perplexity_len_1024": 96.74382292667758, "val/loss_avg_len_512": 4.601123036310449, "val/perplexity_len_512": 99.59610289954561}
26
+ {"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 3898.7206719240057, "val/train_update_time": 1804.14956625973, "val/loss": 4.5398546514194225, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.14271029800875, "val/val_tokens_per_second": 492646.9182107115, "val/loss_avg_len_2048": 4.5398546514194225, "val/perplexity_len_2048": 93.67718328428124, "val/loss_avg_len_1024": 4.555266555744037, "val/perplexity_len_1024": 95.1321098665439, "val/loss_avg_len_512": 4.584480615995732, "val/perplexity_len_512": 97.95229907070892}
27
+ {"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 4051.3472030170087, "val/train_update_time": 1873.3693814776198, "val/loss": 4.519077338180738, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.45039679700858, "val/val_tokens_per_second": 490830.50017885934, "val/loss_avg_len_2048": 4.519077338180738, "val/perplexity_len_2048": 91.75090385627789, "val/loss_avg_len_1024": 4.534679728145711, "val/perplexity_len_1024": 93.19366321696087, "val/loss_avg_len_512": 4.56380435053194, "val/perplexity_len_512": 95.94780546420296}
28
+ {"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 4204.282005440997, "val/train_update_time": 1942.5947593615856, "val/loss": 4.504149057779786, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.98285058300826, "val/val_tokens_per_second": 487718.6201189412, "val/loss_avg_len_2048": 4.504149057779786, "val/perplexity_len_2048": 90.39139346022084, "val/loss_avg_len_1024": 4.519979343756846, "val/perplexity_len_1024": 91.83370101930686, "val/loss_avg_len_512": 4.549490065394714, "val/perplexity_len_512": 94.58416427684044}
29
+ {"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 4357.762582869007, "val/train_update_time": 2011.834883902513, "val/loss": 4.487737312592589, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.38956990098814, "val/val_tokens_per_second": 491188.52691809647, "val/loss_avg_len_2048": 4.487737312592589, "val/perplexity_len_2048": 88.9200198733193, "val/loss_avg_len_1024": 4.503381963831792, "val/perplexity_len_1024": 90.32208135719293, "val/loss_avg_len_512": 4.532881353686472, "val/perplexity_len_512": 93.02621672414543}
30
+ {"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 4510.643669799989, "val/train_update_time": 2081.064080615528, "val/loss": 4.477823902255832, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.90338259798591, "val/val_tokens_per_second": 488180.5563937209, "val/loss_avg_len_2048": 4.477823902255832, "val/perplexity_len_2048": 88.0428741653243, "val/loss_avg_len_1024": 4.493580668732151, "val/perplexity_len_1024": 89.44113225722718, "val/loss_avg_len_512": 4.523143095735833, "val/perplexity_len_512": 92.12470015446381}
31
+ {"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 4664.52624743199, "val/train_update_time": 2150.3571494565113, "val/loss": 4.464658180295001, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.58210265901289, "val/val_tokens_per_second": 490057.0660097311, "val/loss_avg_len_2048": 4.464658180295001, "val/perplexity_len_2048": 86.89132329645581, "val/loss_avg_len_1024": 4.480764351781691, "val/perplexity_len_1024": 88.30214078631853, "val/loss_avg_len_512": 4.510652843934484, "val/perplexity_len_512": 90.98119564814961}
32
+ {"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 4817.616289013007, "val/train_update_time": 2219.6062358425115, "val/loss": 4.4529901164986425, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.61640725701, "val/val_tokens_per_second": 489856.0144314992, "val/loss_avg_len_2048": 4.4529901164986425, "val/perplexity_len_2048": 85.88336170854886, "val/loss_avg_len_1024": 4.469235860132379, "val/perplexity_len_1024": 87.28999575640728, "val/loss_avg_len_512": 4.499290181878302, "val/perplexity_len_512": 89.95325818129203}
33
+ {"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 4970.727602478, "val/train_update_time": 2288.853339902591, "val/loss": 4.443838785698568, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.69381960498868, "val/val_tokens_per_second": 489402.9235769104, "val/loss_avg_len_2048": 4.443838785698568, "val/perplexity_len_2048": 85.10099994102198, "val/loss_avg_len_1024": 4.460096235682024, "val/perplexity_len_1024": 86.49583268126037, "val/loss_avg_len_512": 4.49035777461771, "val/perplexity_len_512": 89.15333697445978}
34
+ {"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 5123.909539049986, "val/train_update_time": 2358.095918665669, "val/loss": 4.434908325502579, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.67913438100368, "val/val_tokens_per_second": 489488.8110756854, "val/loss_avg_len_2048": 4.434908325502579, "val/perplexity_len_2048": 84.3443923040809, "val/loss_avg_len_1024": 4.451088077992527, "val/perplexity_len_1024": 85.72016350138246, "val/loss_avg_len_512": 4.481291017688159, "val/perplexity_len_512": 88.34865876197952}
35
+ {"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 5277.0870255730115, "val/train_update_time": 2427.330438608711, "val/loss": 4.426706585733639, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.18017689298722, "val/val_tokens_per_second": 492425.01675243815, "val/loss_avg_len_2048": 4.426706585733639, "val/perplexity_len_2048": 83.65545066943865, "val/loss_avg_len_1024": 4.443149887683336, "val/perplexity_len_1024": 85.04239422006852, "val/loss_avg_len_512": 4.473612986021303, "val/perplexity_len_512": 87.67291248266899}
36
+ {"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 5430.249512759008, "val/train_update_time": 2496.5911681566795, "val/loss": 4.419161863370403, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.169069185009, "val/val_tokens_per_second": 492490.78294822294, "val/loss_avg_len_2048": 4.419161863370403, "val/perplexity_len_2048": 83.02666849609794, "val/loss_avg_len_1024": 4.4357142435611685, "val/perplexity_len_1024": 84.41239437134591, "val/loss_avg_len_512": 4.466251606992445, "val/perplexity_len_512": 87.02988861829601}
37
+ {"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 5582.925107155985, "val/train_update_time": 2565.8480685587565, "val/loss": 4.412361895313021, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.22933418498724, "val/val_tokens_per_second": 492134.17842513864, "val/loss_avg_len_2048": 4.412361895313021, "val/perplexity_len_2048": 82.46400501736277, "val/loss_avg_len_1024": 4.428987180554634, "val/perplexity_len_1024": 83.8464525731744, "val/loss_avg_len_512": 4.459533594012447, "val/perplexity_len_512": 86.44718020978645}
38
+ {"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 5735.700029758998, "val/train_update_time": 2635.1292617027066, "val/loss": 4.406846508690622, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.27378682699054, "val/val_tokens_per_second": 491871.4707318212, "val/loss_avg_len_2048": 4.406846508690622, "val/perplexity_len_2048": 82.0104361010047, "val/loss_avg_len_1024": 4.423472210842371, "val/perplexity_len_1024": 83.38531467630756, "val/loss_avg_len_512": 4.45401080738604, "val/perplexity_len_512": 85.97106682551633}
39
+ {"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 5888.469226193993, "val/train_update_time": 2704.3640278247476, "val/loss": 4.401876839681971, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.7315334439918, "val/val_tokens_per_second": 489182.48974143335, "val/loss_avg_len_2048": 4.401876839681971, "val/perplexity_len_2048": 81.60388243364973, "val/loss_avg_len_1024": 4.418596094172914, "val/perplexity_len_1024": 82.97970785018886, "val/loss_avg_len_512": 4.449346278101858, "val/perplexity_len_512": 85.5709860862857}
40
+ {"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 6041.680216562003, "val/train_update_time": 2773.58810844674, "val/loss": 4.3977887128185715, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.1582510540029, "val/val_tokens_per_second": 492554.85151317826, "val/loss_avg_len_2048": 4.3977887128185715, "val/perplexity_len_2048": 81.27095639533198, "val/loss_avg_len_1024": 4.414443706510216, "val/perplexity_len_1024": 82.63585832737921, "val/loss_avg_len_512": 4.445159159805254, "val/perplexity_len_512": 85.21343931247905}
41
+ {"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 6194.783387269999, "val/train_update_time": 2842.820589903771, "val/loss": 4.393867793466989, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.19375617799233, "val/val_tokens_per_second": 492344.6407728776, "val/loss_avg_len_2048": 4.393867793466989, "val/perplexity_len_2048": 80.95292342793084, "val/loss_avg_len_1024": 4.410636982592754, "val/perplexity_len_1024": 82.32188441420018, "val/loss_avg_len_512": 4.441507733502612, "val/perplexity_len_512": 84.90285609977245}
42
+ {"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 6347.4543838310055, "val/train_update_time": 2912.0468749527645, "val/loss": 4.390602651420748, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.35589144201367, "val/val_tokens_per_second": 491386.9828684362, "val/loss_avg_len_2048": 4.390602651420748, "val/perplexity_len_2048": 80.68903169033332, "val/loss_avg_len_1024": 4.407449438422453, "val/perplexity_len_1024": 82.05989754063832, "val/loss_avg_len_512": 4.438302557166107, "val/perplexity_len_512": 84.6311631190514}
43
+ {"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 6500.321636478999, "val/train_update_time": 2981.296865779761, "val/loss": 4.388172285466153, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.42446340899915, "val/val_tokens_per_second": 490983.0800971214, "val/loss_avg_len_2048": 4.388172285466153, "val/perplexity_len_2048": 80.49316592395871, "val/loss_avg_len_1024": 4.405054118391453, "val/perplexity_len_1024": 81.8635730481807, "val/loss_avg_len_512": 4.43599369634632, "val/perplexity_len_512": 84.43598694640616}
44
+ {"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 6653.224929338001, "val/train_update_time": 3050.5236358706316, "val/loss": 4.386037354692467, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.96182978598517, "val/val_tokens_per_second": 487840.72601091646, "val/loss_avg_len_2048": 4.386037354692467, "val/perplexity_len_2048": 80.32150189756531, "val/loss_avg_len_1024": 4.402890470500942, "val/perplexity_len_1024": 81.68664057985016, "val/loss_avg_len_512": 4.433789289280586, "val/perplexity_len_512": 84.25006066407553}
45
+ {"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 6806.725144091994, "val/train_update_time": 3119.802880719566, "val/loss": 4.384412398562184, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.91942880197894, "val/val_tokens_per_second": 488087.2115639818, "val/loss_avg_len_2048": 4.384412398562184, "val/perplexity_len_2048": 80.1910889670049, "val/loss_avg_len_1024": 4.401321436325087, "val/perplexity_len_1024": 81.55857194737422, "val/loss_avg_len_512": 4.432314658900629, "val/perplexity_len_512": 84.12591452243974}
46
+ {"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 6960.596640536998, "val/train_update_time": 3189.047608219611, "val/loss": 4.383261151254853, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.85132250699098, "val/val_tokens_per_second": 488483.6490990946, "val/loss_avg_len_2048": 4.383261151254853, "val/perplexity_len_2048": 80.09882231281891, "val/loss_avg_len_1024": 4.400117161350698, "val/perplexity_len_1024": 81.46041211780742, "val/loss_avg_len_512": 4.431040566734597, "val/perplexity_len_512": 84.0187986060516}
47
+ {"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 7113.936619379994, "val/train_update_time": 3258.2859199085797, "val/loss": 4.382433253489226, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.20848171401303, "val/val_tokens_per_second": 492257.5097666033, "val/loss_avg_len_2048": 4.382433253489226, "val/perplexity_len_2048": 80.03253611967847, "val/loss_avg_len_1024": 4.399324055639003, "val/perplexity_len_1024": 81.39583101288655, "val/loss_avg_len_512": 4.430300414302852, "val/perplexity_len_512": 83.9566348960995}
48
+ {"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 7266.675624558004, "val/train_update_time": 3327.5515688046, "val/loss": 4.381986520714452, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 83.39674543499132, "val/val_tokens_per_second": 491146.2645976847, "val/loss_avg_len_2048": 4.381986520714452, "val/perplexity_len_2048": 79.99679094761, "val/loss_avg_len_1024": 4.398876096244017, "val/perplexity_len_1024": 81.35937715120615, "val/loss_avg_len_512": 4.429859576323722, "val/perplexity_len_512": 83.91963177962609}
49
+ {"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 7419.518831352005, "val/train_update_time": 3396.797874707612, "val/loss": 4.3817854977080835, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 84.2198099019879, "val/val_tokens_per_second": 486346.38391689357, "val/loss_avg_len_2048": 4.3817854977080835, "val/perplexity_len_2048": 79.98071136843066, "val/loss_avg_len_1024": 4.398686848510337, "val/perplexity_len_1024": 81.34398153030583, "val/loss_avg_len_512": 4.4296826414794666, "val/perplexity_len_512": 83.90478478616119}
 
metrics/npz/train_eval/step-000000104857600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:96d101f90ef1e56a6b4f4f318316a6208f38edecdaa412bcfbd8da4ec9661fbd
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a51d4164dc4390387d6c1f28c29dae3b0409dbb0029030a79fd2f136c3f09a9
3
  size 20540
metrics/npz/train_eval/step-000000209715200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4cfda294cd893636052c9b1891cf2d4c53be5566f3c461477b9dd007e4c0b1da
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c26aa18e5615d75520cc77b9a6e6759e29f757fb91b9f6477fbcbf5b62f1a7b
3
  size 20540
metrics/npz/train_eval/step-000000314572800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:596038064af19c26cf29a35967d8370b72ac239febad44c496e473e5243432f3
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:090974fb978109d91b3ef21c3340b1831246b4958cd4a39650e67232878f33ce
3
  size 20540
metrics/npz/train_eval/step-000000419430400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d86d423ded941d6856e13f192405296602d652276c3710e9dd012288db96e78c
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c22e0ff82e135f1c1073aa676296905782a11112794e04b311fb947e963c80a6
3
  size 20540
metrics/npz/train_eval/step-000000524288000.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02042f331cc991c7af80627a768260a8e9d1c22b59335bbc6ba0b14154b99135
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b1740d1b697ea2ad755ac9078c7101d8bae35c85f72e1805bb7dcd3521193a0
3
  size 20540
metrics/npz/train_eval/step-000000629145600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c92abfcdf590936e0b24602e95a41034737f71e0c694d6021caeff864838717
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:477545ebd381fa88a8679dcbae5f2ec198d76fd796eeb2ba0da1af34cf6745b1
3
  size 20540
metrics/npz/train_eval/step-000000734003200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c22f674223e29897d6843b4825bfb19bd1dd1f7c671b9ce16c0335d42f904b5b
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e598ddac14dbc0bf035727e4fb9df3bbb920f4094dcb20593c9f3ed4b8e76b8
3
  size 20540
metrics/npz/train_eval/step-000000838860800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6535fd69cd2341171dac801f6d7eb8665aa5de6b771c6d9722223e2393998f78
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6404280af516e7df0d4be476e4cf7bda1360a9c5041a2d1337be8aec15e1c4d
3
  size 20540
metrics/npz/train_eval/step-000000943718400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:70b714daad59f2ee28166252e08317cd89b9b70a2a4b580e81f151379a5f87c9
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a63e4c563cfbe723eb97b8cc7a222f56e7f9cde5da3ded66fc315b34474b3f6c
3
  size 20540
metrics/npz/train_eval/step-000001048576000.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1cf52d7e914fa3ca97b1628367e6ff5a2eadec957077c3268e37261187190fd3
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db2d799619ba478ffb8786553e755d9ecb3494bdd381f91242c3e61221490935
3
  size 20540
metrics/npz/train_eval/step-000001153433600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9f148ce0ff141ff7cc0da60e8daf7bffb0532fc1917778bb5b16dc9cddd14a71
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0977e97fa4947a5aaaa298256d5fa703442176c676c051c418ba57ca073f51a1
3
  size 20540
metrics/npz/train_eval/step-000001258291200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1080933db8f7236aa34ebbb3182e7477f996bf1e4174f9c2ab8b280d07c7cfea
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97eb10f5b7dfe6acc615229f78746d04cf64204f7fdd3931988f54d1306027bf
3
  size 20540
metrics/npz/train_eval/step-000001363148800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ac97b23dbab9718f8fd7a6bd1fec4895767908327edd2106fe22ec5855db440
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:570a43ae3248f83ba6e02b684a5818cb77228e3d4ef091614c44d811e21f4eb7
3
  size 20540
metrics/npz/train_eval/step-000001468006400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b74d8079574b093b95ba48250a0ed1e049aa375fe3e715ca1e1a5a12843b5198
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b53251c79c14aa7ef820f81660df31e266896dc496afd464579aa2b5cc8a8ad7
3
  size 20540
metrics/npz/train_eval/step-000001572864000.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:31c91d3f4cab39f4d587602866b30269bb31361060add04c2efe0843d451166e
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ede060d6102a48fdae1b6cfdd10408c952adcd49abeed642b89355fe2f5dc86
3
  size 20540
metrics/npz/train_eval/step-000001677721600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a0fb587ee5b71e750ea58dff638639e58bf8214278a6f6ed9fca730e2cf518b4
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0960585fca32d99fe37c5872ed0a9b35642f905b3a5ebc6e66259c9672240f9
3
  size 20540
metrics/npz/train_eval/step-000001782579200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4a4ba7253fa2d9b47c51718f81b1fd75e22c6dc8d5dd5d249516c144fe8aa6cb
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2edca23008bccebd5889eaa267684fc44029c93c3b03556af684ec08cbdeebb
3
  size 20540
metrics/npz/train_eval/step-000001887436800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c63bb8a7999c8b2acfc3e8b47956b70344a6046cda5c034df324566828ee572a
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfe593ddd3f47a00cc4ef99beb62b100f2b2210361fe4a870d55ae10580758aa
3
  size 20540
metrics/npz/train_eval/step-000001992294400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:28e935339284c88da88c0cae112fc9c1502cd2d6275ac6d785097053611e161e
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b815f2619795c6ebcfafdf3be4ff18328df4ae04100471bacc90374f84c70d24
3
  size 20540
metrics/npz/val/step-000000041943040.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b534cb3102ef15fb4e9054d587475f4aee3867aafe9b5e085f6a32d8fffa7bb5
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:335edce24f2126c24a463d4572aeec86b71d9b561adcd09bc3441a009de956c0
3
  size 21142
metrics/npz/val/step-000000083886080.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:736b63b4ab20c592cd78b448df8930c1cfdfb8db332fc7cb0f5fb30078efbf4d
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0dd23142abe5f7181034e5f085d2ca167c68a5bce942b91ec27f3b1700f8dae1
3
  size 21142
metrics/npz/val/step-000000125829120.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b93b51d736f963e177a3733bbcbde6ff5a363e95561c2714d2f31de4ad49e54a
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17b5b83ca4e4583cfa0729d4e2064353a3950f7dca953ed87883f8e2435e2674
3
  size 21142
metrics/npz/val/step-000000167772160.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7b0569e5fa2930d66177602a430aa91af1d0013bc11286f9cc60b7a51c5b827d
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fe685bbece2ec360cbbad3e38efb200d7be826f8b520629ed5e2d9b7ae53fc9
3
  size 21142
metrics/npz/val/step-000000209715200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed15575179e8b370daf20e86309dfbf0de74c0b853d77fc7a3f9aa8087d6e4c7
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44e89cda40363cdecf06af5c26dca9a0c739c8b55501bf57d69399306d3a7ee5
3
  size 21142
metrics/npz/val/step-000000251658240.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:70426f112c6d3b12e3c2e6baa3692483b6da6c688347104596d78a48fb9b5fdc
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ee710b87975ced627a329473c3cd1eb5320c76ec9e93720ae98401078caf4b5
3
  size 21142
metrics/npz/val/step-000000293601280.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dbad611bcf7bf98c87a8194c9c3de04eac8c0b238c414294c9b3e6de683723c5
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3df282db83c775378e697e10a8c6399e8b8b34073def39031df797483681bc21
3
  size 21142
metrics/npz/val/step-000000335544320.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b243cb6d27dadf0d0bc4889454bb811d67821afdc307400f0a155a9240fa1b4c
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36c64e84ddedb00766216c9b3cee5c66be139f2ead3493e6c1e97c88391fde65
3
  size 21142
metrics/npz/val/step-000000377487360.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9df89fab07b311ebec553cf6f7cd5d74f3add1075768d060e5f37a8a2d5a660f
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:926fa1d5c599f6e300195d68e2d8534512f27ca9bc643addef9693fe87bb4521
3
  size 21142
metrics/npz/val/step-000000419430400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ead713b21ddbd16aa5b1d625edfa860b022e1cd215b8d11ebe659ee6d05f98a
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:591381c3c1404825232d0b329093400130055d7a8db1bb43c0cdae6dcdde7bdf
3
  size 21142
metrics/npz/val/step-000000461373440.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c7fdedfa1de5cf90523195295432b96127e471383a6bedb796c016c8d5ca38d
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9782a956b1ee686d4cbbe9e0818204b1b7ca1247440c498e6c029b9ce6be2bd3
3
  size 21142