Lanni-ni commited on
Commit
e83f87e
·
verified ·
1 Parent(s): 4db6d3c

add remote code + model files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. checkpoints/step-000000209715200.pt +1 -1
  2. checkpoints/step-000000419430400.pt +1 -1
  3. checkpoints/step-000000629145600.pt +1 -1
  4. checkpoints/step-000000838860800.pt +1 -1
  5. checkpoints/step-000001048576000.pt +1 -1
  6. checkpoints/step-000001258291200.pt +1 -1
  7. checkpoints/step-000001468006400.pt +1 -1
  8. checkpoints/step-000001677721600.pt +1 -1
  9. checkpoints/step-000001887436800.pt +1 -1
  10. logs/2025-10-25_23-30-02.log +336 -0
  11. metrics/jsonlines/checkpoint.jsonl +9 -9
  12. metrics/jsonlines/throughput.jsonl +0 -0
  13. metrics/jsonlines/train.jsonl +98 -98
  14. metrics/jsonlines/train_eval.jsonl +19 -19
  15. metrics/jsonlines/val.jsonl +49 -49
  16. metrics/npz/train_eval/step-000000104857600.npz +1 -1
  17. metrics/npz/train_eval/step-000000209715200.npz +1 -1
  18. metrics/npz/train_eval/step-000000314572800.npz +1 -1
  19. metrics/npz/train_eval/step-000000419430400.npz +1 -1
  20. metrics/npz/train_eval/step-000000524288000.npz +1 -1
  21. metrics/npz/train_eval/step-000000629145600.npz +1 -1
  22. metrics/npz/train_eval/step-000000734003200.npz +1 -1
  23. metrics/npz/train_eval/step-000000838860800.npz +1 -1
  24. metrics/npz/train_eval/step-000000943718400.npz +1 -1
  25. metrics/npz/train_eval/step-000001048576000.npz +1 -1
  26. metrics/npz/train_eval/step-000001153433600.npz +1 -1
  27. metrics/npz/train_eval/step-000001258291200.npz +1 -1
  28. metrics/npz/train_eval/step-000001363148800.npz +1 -1
  29. metrics/npz/train_eval/step-000001468006400.npz +1 -1
  30. metrics/npz/train_eval/step-000001572864000.npz +1 -1
  31. metrics/npz/train_eval/step-000001677721600.npz +1 -1
  32. metrics/npz/train_eval/step-000001782579200.npz +1 -1
  33. metrics/npz/train_eval/step-000001887436800.npz +1 -1
  34. metrics/npz/train_eval/step-000001992294400.npz +1 -1
  35. metrics/npz/val/step-000000041943040.npz +1 -1
  36. metrics/npz/val/step-000000083886080.npz +1 -1
  37. metrics/npz/val/step-000000125829120.npz +1 -1
  38. metrics/npz/val/step-000000167772160.npz +1 -1
  39. metrics/npz/val/step-000000209715200.npz +1 -1
  40. metrics/npz/val/step-000000251658240.npz +1 -1
  41. metrics/npz/val/step-000000293601280.npz +1 -1
  42. metrics/npz/val/step-000000335544320.npz +1 -1
  43. metrics/npz/val/step-000000377487360.npz +1 -1
  44. metrics/npz/val/step-000000419430400.npz +1 -1
  45. metrics/npz/val/step-000000461373440.npz +1 -1
  46. metrics/npz/val/step-000000503316480.npz +1 -1
  47. metrics/npz/val/step-000000545259520.npz +1 -1
  48. metrics/npz/val/step-000000587202560.npz +1 -1
  49. metrics/npz/val/step-000000629145600.npz +1 -1
  50. metrics/npz/val/step-000000671088640.npz +1 -1
checkpoints/step-000000209715200.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3868b3ff12d7606779d8c9ef0d2e601fff4ca109e4fe5ee189a2a88f98e0d668
3
  size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c76bafdf260002e33ccc2496056b2ec692050dffb45c2ab145deda8b0c47e849
3
  size 329410370
checkpoints/step-000000419430400.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7d3c52b53ad1e8f8f016b4fb337c67dac4db0ed98bdedd7b5b4c4a190a9a6760
3
  size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c48a7b84490d2d0075b35bb0e305586cadb5fbcec67d648b918f38745f445831
3
  size 329410370
checkpoints/step-000000629145600.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eebf356a4a3371aaadac1b72df2be12bd08528c1891780e11100cb9ab1757174
3
  size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9118a021df5fba9797e8ea3041dfb88c3d060b28c2c2049c7c63003395da40a9
3
  size 329410370
checkpoints/step-000000838860800.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d3711d8e099bdebe58cf8d6bb088f7eb5502ef326a4c2611481f1d8f2f3bb265
3
  size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc5afe5e3a7c1be2237bc906cd232b7dcb544c5378e5a75ef84d5a558cece1a7
3
  size 329410370
checkpoints/step-000001048576000.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:392903cf6a2c32be47babc2276a5f9ee1fa86a0702615c67cf014098a3df88a0
3
  size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2dcf9d8d189648809a8d7edf99c39245f16b33442591cc11bc79462e4152357b
3
  size 329410370
checkpoints/step-000001258291200.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f46256e1ef9866e5b4e50466760df0d1ca72b7e1a9b32552e8496efb0b6920a1
3
  size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c16a5b164d28e547c299dffe98636483e32b8c04c0c926cf9c71d8e1d5215cf
3
  size 329410370
checkpoints/step-000001468006400.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c637ab66eb22621a5ce70a2d17c5fc6e2b7e3e160c62b1ebb39ca056ad711e9a
3
  size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7ac2eb93bb1ef7c0bc025c2855a12385ab5e1e3d5b34e96a0ccd15ae587a74e
3
  size 329410370
checkpoints/step-000001677721600.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f7f3ae06a371dd0f45131ed310fa03b9b83c50423614058ecc231cefe047fd5e
3
  size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a2884dd89eb6202c9f0b572e5eaaec3aea12732f0df686fe07dbebe031aad3c
3
  size 329410370
checkpoints/step-000001887436800.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a5f34c367f3e1a4f877fd872fbe5951d30ef354080bb79bd4b7f712fb1d1a55d
3
  size 329410370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b36a74cfc772001dd3cb75c15e507d806eaaf651063a658b23c93a08c580b57c
3
  size 329410370
logs/2025-10-25_23-30-02.log ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2025-10-25 23:30:02][train:372][INFO] All outputs will be saved to `/workspace/forgetting-transformer/alibi_2_4_256`
2
+ [2025-10-25 23:30:02][train:375][INFO] Configuration:
3
+ [2025-10-25 23:30:02][train:380][INFO] Configuration saved to /workspace/forgetting-transformer/alibi_2_4_256/config.yaml.
4
+ [2025-10-25 23:30:02][train:387][INFO] creating datamodule
5
+ [2025-10-25 23:30:02][train:419][INFO] creating model
6
+ [2025-10-25 23:30:03][train:440][INFO] creating optimizer
7
+ [2025-10-25 23:30:03][checkpoint:39][INFO] Not resuming. Deleting existing checkpoints...
8
+ [2025-10-25 23:30:03][logger:256][INFO] Setting up wandb logger...
9
+ [2025-10-25 23:30:03][logger:272][INFO] Not resuming. Creating a new wandb run.
10
+ [2025-10-25 23:30:04][logger:288][INFO] wandb initialized. Run id: t14066ds
11
+ [2025-10-25 23:30:04][logger:186][INFO] Setting up jsonlines logger...
12
+ [2025-10-25 23:30:04][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/resume.jsonl since we are not resuming
13
+ [2025-10-25 23:30:04][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/train_data_info.jsonl since we are not resuming
14
+ [2025-10-25 23:30:04][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/val_data_info.jsonl since we are not resuming
15
+ [2025-10-25 23:30:04][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/model_info.jsonl since we are not resuming
16
+ [2025-10-25 23:30:04][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/train.jsonl since we are not resuming
17
+ [2025-10-25 23:30:04][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/throughput.jsonl since we are not resuming
18
+ [2025-10-25 23:30:04][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/norm.jsonl since we are not resuming
19
+ [2025-10-25 23:30:04][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/val.jsonl since we are not resuming
20
+ [2025-10-25 23:30:04][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/train_eval.jsonl since we are not resuming
21
+ [2025-10-25 23:30:04][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/checkpoint.jsonl since we are not resuming
22
+ [2025-10-25 23:30:04][logger:113][INFO] Setting up npz logger...
23
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000041943040.npz since we are not resuming
24
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000083886080.npz since we are not resuming
25
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000125829120.npz since we are not resuming
26
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000167772160.npz since we are not resuming
27
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000209715200.npz since we are not resuming
28
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000251658240.npz since we are not resuming
29
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000293601280.npz since we are not resuming
30
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000335544320.npz since we are not resuming
31
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000377487360.npz since we are not resuming
32
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000419430400.npz since we are not resuming
33
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000461373440.npz since we are not resuming
34
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000503316480.npz since we are not resuming
35
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000545259520.npz since we are not resuming
36
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000587202560.npz since we are not resuming
37
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000629145600.npz since we are not resuming
38
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000671088640.npz since we are not resuming
39
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000713031680.npz since we are not resuming
40
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000754974720.npz since we are not resuming
41
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000796917760.npz since we are not resuming
42
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000838860800.npz since we are not resuming
43
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000880803840.npz since we are not resuming
44
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000922746880.npz since we are not resuming
45
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000000964689920.npz since we are not resuming
46
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001006632960.npz since we are not resuming
47
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001048576000.npz since we are not resuming
48
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001090519040.npz since we are not resuming
49
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001132462080.npz since we are not resuming
50
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001174405120.npz since we are not resuming
51
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001216348160.npz since we are not resuming
52
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001258291200.npz since we are not resuming
53
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001300234240.npz since we are not resuming
54
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001342177280.npz since we are not resuming
55
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001384120320.npz since we are not resuming
56
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001426063360.npz since we are not resuming
57
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001468006400.npz since we are not resuming
58
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001509949440.npz since we are not resuming
59
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001551892480.npz since we are not resuming
60
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001593835520.npz since we are not resuming
61
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001635778560.npz since we are not resuming
62
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001677721600.npz since we are not resuming
63
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001719664640.npz since we are not resuming
64
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001761607680.npz since we are not resuming
65
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001803550720.npz since we are not resuming
66
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001845493760.npz since we are not resuming
67
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001887436800.npz since we are not resuming
68
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001929379840.npz since we are not resuming
69
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000001971322880.npz since we are not resuming
70
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000002013265920.npz since we are not resuming
71
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/val/step-000002055208960.npz since we are not resuming
72
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000000104857600.npz since we are not resuming
73
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000000209715200.npz since we are not resuming
74
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000000314572800.npz since we are not resuming
75
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000000419430400.npz since we are not resuming
76
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000000524288000.npz since we are not resuming
77
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000000629145600.npz since we are not resuming
78
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000000734003200.npz since we are not resuming
79
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000000838860800.npz since we are not resuming
80
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000000943718400.npz since we are not resuming
81
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000001048576000.npz since we are not resuming
82
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000001153433600.npz since we are not resuming
83
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000001258291200.npz since we are not resuming
84
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000001363148800.npz since we are not resuming
85
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000001468006400.npz since we are not resuming
86
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000001572864000.npz since we are not resuming
87
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000001677721600.npz since we are not resuming
88
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000001782579200.npz since we are not resuming
89
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000001887436800.npz since we are not resuming
90
+ [2025-10-25 23:30:04][logger:127][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/npz/train_eval/step-000001992294400.npz since we are not resuming
91
+ [2025-10-25 23:30:04][logger:171][INFO] [step: 0] [train_data_info/vocab_size: 50277] [train_data_info/global_tokens_per_batch: 2097152] [train_data_info/local_tokens_per_batch: 2097152] [train_data_info/batch_len: 2048] [train_data_info/seq_len: 2048] [train_data_info/total_tokens: 2055208960] [train_data_info/global_batch_size: 1024] [train_data_info/local_batch_size: 1024]
92
+ [2025-10-25 23:30:04][logger:171][INFO] [step: 0] [val_data_info/vocab_size: 50277] [val_data_info/global_tokens_per_batch: 2048] [val_data_info/local_tokens_per_batch: 2048] [val_data_info/batch_len: 2048] [val_data_info/seq_len: 2048] [val_data_info/total_tokens: 2147483648] [val_data_info/global_batch_size: 1] [val_data_info/local_batch_size: 1]
93
+ [2025-10-25 23:30:04][logger:171][INFO] [step: 0] [model_info/total_params: 27447040] [model_info/trainable_params: 27447040] [model_info/embedding_params: 12870912] [model_info/flops_per_token: 0] [model_info/non_embedding_params: 14576128]
94
+ [2025-10-25 23:31:17][utils:57][INFO] [P: 1.00%] [S: 20971520/2097152000] [T: 0:01:13] [ETA: 2:00:37] [loss: 9.762] [tokens/s: 302926.081] [batches/s: 0.144] [MFU: 0.000] [TFLOPS: 0.000]
95
+ [2025-10-25 23:32:26][utils:57][INFO] [P: 2.00%] [S: 41943040/2097152000] [T: 0:02:22] [ETA: 1:56:11] [loss: 8.127] [tokens/s: 303066.016] [batches/s: 0.145] [MFU: 0.000] [TFLOPS: 0.000]
96
+ [2025-10-25 23:32:26][train:194][INFO] Running validation...
97
+ [2025-10-25 23:33:57][logger:171][INFO] [step: 41943040] [val/train_token_count: 41943040] [val/train_batch_count: 20] [val/train_flop_count: 0] [val/train_total_time: 142.274] [val/train_update_time: 141.950] [val/loss: 8.017] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.296] [val/val_tokens_per_second: 453619.578] [val/loss_avg_len_2048: 8.017] [val/perplexity_len_2048: 3033.047] [val/loss_avg_len_1024: 8.016] [val/perplexity_len_1024: 3029.391] [val/loss_avg_len_512: 8.017] [val/perplexity_len_512: 3030.800]
98
+ [2025-10-25 23:35:06][utils:57][INFO] [P: 3.00%] [S: 62914560/2097152000] [T: 0:05:01] [ETA: 2:42:35] [loss: 7.520] [tokens/s: 209051.322] [batches/s: 0.100] [MFU: 0.000] [TFLOPS: 0.000]
99
+ [2025-10-25 23:36:15][utils:57][INFO] [P: 4.00%] [S: 83886080/2097152000] [T: 0:06:10] [ETA: 2:28:20] [loss: 7.193] [tokens/s: 227164.362] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000]
100
+ [2025-10-25 23:36:15][train:194][INFO] Running validation...
101
+ [2025-10-25 23:37:45][logger:171][INFO] [step: 83886080] [val/train_token_count: 83886080] [val/train_batch_count: 40] [val/train_flop_count: 0] [val/train_total_time: 370.841] [val/train_update_time: 279.983] [val/loss: 7.169] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.330] [val/val_tokens_per_second: 453450.719] [val/loss_avg_len_2048: 7.169] [val/perplexity_len_2048: 1298.380] [val/loss_avg_len_1024: 7.169] [val/perplexity_len_1024: 1298.934] [val/loss_avg_len_512: 7.173] [val/perplexity_len_512: 1303.240]
102
+ [2025-10-25 23:38:54][utils:57][INFO] [P: 5.00%] [S: 104857600/2097152000] [T: 0:08:50] [ETA: 2:47:55] [loss: 6.947] [tokens/s: 197807.885] [batches/s: 0.094] [MFU: 0.000] [TFLOPS: 0.000]
103
+ [2025-10-25 23:38:54][logger:171][INFO] [step: 104857600] [train_eval/train_token_count: 104857600] [train_eval/train_batch_count: 50] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 530.295] [train_eval/train_update_time: 348.975] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 8.263] [train_eval/perplexity_len_2048: 3876.799] [train_eval/loss_avg_len_1024: 8.264] [train_eval/perplexity_len_1024: 3880.087] [train_eval/loss_avg_len_512: 8.264] [train_eval/perplexity_len_512: 3883.218]
104
+ [2025-10-25 23:40:03][utils:57][INFO] [P: 6.00%] [S: 125829120/2097152000] [T: 0:09:59] [ETA: 2:36:30] [loss: 6.680] [tokens/s: 210212.973] [batches/s: 0.100] [MFU: 0.000] [TFLOPS: 0.000]
105
+ [2025-10-25 23:40:03][train:194][INFO] Running validation...
106
+ [2025-10-25 23:41:34][logger:171][INFO] [step: 125829120] [val/train_token_count: 125829120] [val/train_batch_count: 60] [val/train_flop_count: 0] [val/train_total_time: 599.401] [val/train_update_time: 417.947] [val/loss: 6.680] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.348] [val/val_tokens_per_second: 453356.595] [val/loss_avg_len_2048: 6.680] [val/perplexity_len_2048: 796.683] [val/loss_avg_len_1024: 6.682] [val/perplexity_len_1024: 797.888] [val/loss_avg_len_512: 6.688] [val/perplexity_len_512: 802.766]
107
+ [2025-10-25 23:42:43][utils:57][INFO] [P: 7.00%] [S: 146800640/2097152000] [T: 0:12:38] [ETA: 2:48:02] [loss: 6.480] [tokens/s: 193430.688] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000]
108
+ [2025-10-25 23:43:52][utils:57][INFO] [P: 8.00%] [S: 167772160/2097152000] [T: 0:13:47] [ETA: 2:38:41] [loss: 6.282] [tokens/s: 202736.609] [batches/s: 0.097] [MFU: 0.000] [TFLOPS: 0.000]
109
+ [2025-10-25 23:43:52][train:194][INFO] Running validation...
110
+ [2025-10-25 23:45:22][logger:171][INFO] [step: 167772160] [val/train_token_count: 167772160] [val/train_batch_count: 80] [val/train_flop_count: 0] [val/train_total_time: 827.992] [val/train_update_time: 555.907] [val/loss: 6.256] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.297] [val/val_tokens_per_second: 453613.474] [val/loss_avg_len_2048: 6.256] [val/perplexity_len_2048: 521.387] [val/loss_avg_len_1024: 6.259] [val/perplexity_len_1024: 522.894] [val/loss_avg_len_512: 6.268] [val/perplexity_len_512: 527.534]
111
+ [2025-10-25 23:46:31][utils:57][INFO] [P: 9.00%] [S: 188743680/2097152000] [T: 0:16:27] [ETA: 2:46:24] [loss: 6.123] [tokens/s: 191111.243] [batches/s: 0.091] [MFU: 0.000] [TFLOPS: 0.000]
112
+ [2025-10-25 23:47:41][utils:57][INFO] [P: 10.00%] [S: 209715200/2097152000] [T: 0:17:36] [ETA: 2:38:28] [loss: 5.972] [tokens/s: 198537.821] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
113
+ [2025-10-25 23:47:41][logger:171][INFO] [step: 209715200] [train_eval/train_token_count: 209715200] [train_eval/train_batch_count: 100] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1056.534] [train_eval/train_update_time: 693.881] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.399] [train_eval/perplexity_len_2048: 601.304] [train_eval/loss_avg_len_1024: 6.403] [train_eval/perplexity_len_1024: 603.875] [train_eval/loss_avg_len_512: 6.410] [train_eval/perplexity_len_512: 607.701]
114
+ [2025-10-25 23:47:41][train:194][INFO] Running validation...
115
+ [2025-10-25 23:49:11][logger:171][INFO] [step: 209715200] [val/train_token_count: 209715200] [val/train_batch_count: 100] [val/train_flop_count: 0] [val/train_total_time: 1056.534] [val/train_update_time: 693.881] [val/loss: 5.960] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.285] [val/val_tokens_per_second: 453676.331] [val/loss_avg_len_2048: 5.960] [val/perplexity_len_2048: 387.490] [val/loss_avg_len_1024: 5.964] [val/perplexity_len_1024: 389.067] [val/loss_avg_len_512: 5.975] [val/perplexity_len_512: 393.380]
116
+ [2025-10-25 23:49:11][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000209715200.pt...
117
+ [2025-10-25 23:49:11][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000209715200.pt.
118
+ [2025-10-25 23:49:11][logger:171][INFO] [step: 209715200] [checkpoint/checkpoint_time: 0.425]
119
+ [2025-10-25 23:50:20][utils:57][INFO] [P: 11.00%] [S: 230686720/2097152000] [T: 0:20:16] [ETA: 2:44:01] [loss: 5.850] [tokens/s: 182711.177] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
120
+ [2025-10-25 23:51:29][utils:57][INFO] [P: 12.00%] [S: 251658240/2097152000] [T: 0:21:25] [ETA: 2:37:06] [loss: 5.718] [tokens/s: 198492.689] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
121
+ [2025-10-25 23:51:29][train:194][INFO] Running validation...
122
+ [2025-10-25 23:53:00][logger:171][INFO] [step: 251658240] [val/train_token_count: 251658240] [val/train_batch_count: 120] [val/train_flop_count: 0] [val/train_total_time: 1285.478] [val/train_update_time: 831.841] [val/loss: 5.730] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.295] [val/val_tokens_per_second: 453622.098] [val/loss_avg_len_2048: 5.730] [val/perplexity_len_2048: 307.853] [val/loss_avg_len_1024: 5.735] [val/perplexity_len_1024: 309.410] [val/loss_avg_len_512: 5.747] [val/perplexity_len_512: 313.342]
123
+ [2025-10-25 23:54:09][utils:57][INFO] [P: 13.00%] [S: 272629760/2097152000] [T: 0:24:04] [ETA: 2:41:09] [loss: 5.644] [tokens/s: 182721.034] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
124
+ [2025-10-25 23:55:18][utils:57][INFO] [P: 14.00%] [S: 293601280/2097152000] [T: 0:25:14] [ETA: 2:35:00] [loss: 5.570] [tokens/s: 198502.780] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
125
+ [2025-10-25 23:55:18][train:194][INFO] Running validation...
126
+ [2025-10-25 23:56:49][logger:171][INFO] [step: 293601280] [val/train_token_count: 293601280] [val/train_batch_count: 140] [val/train_flop_count: 0] [val/train_total_time: 1514.021] [val/train_update_time: 969.814] [val/loss: 5.542] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.563] [val/val_tokens_per_second: 452284.018] [val/loss_avg_len_2048: 5.542] [val/perplexity_len_2048: 255.165] [val/loss_avg_len_1024: 5.548] [val/perplexity_len_1024: 256.708] [val/loss_avg_len_512: 5.562] [val/perplexity_len_512: 260.306]
127
+ [2025-10-25 23:57:58][utils:57][INFO] [P: 15.00%] [S: 314572800/2097152000] [T: 0:27:53] [ETA: 2:38:04] [loss: 5.445] [tokens/s: 182680.714] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
128
+ [2025-10-25 23:57:58][logger:171][INFO] [step: 314572800] [train_eval/train_token_count: 314572800] [train_eval/train_batch_count: 150] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1673.720] [train_eval/train_update_time: 1038.809] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.693] [train_eval/perplexity_len_2048: 296.814] [train_eval/loss_avg_len_1024: 5.699] [train_eval/perplexity_len_1024: 298.566] [train_eval/loss_avg_len_512: 5.711] [train_eval/perplexity_len_512: 302.082]
129
+ [2025-10-25 23:59:07][utils:57][INFO] [P: 16.00%] [S: 335544320/2097152000] [T: 0:29:02] [ETA: 2:32:29] [loss: 5.405] [tokens/s: 198459.104] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
130
+ [2025-10-25 23:59:07][train:194][INFO] Running validation...
131
+ [2025-10-26 00:00:37][logger:171][INFO] [step: 335544320] [val/train_token_count: 335544320] [val/train_batch_count: 160] [val/train_flop_count: 0] [val/train_total_time: 1742.828] [val/train_update_time: 1107.798] [val/loss: 5.396] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.408] [val/val_tokens_per_second: 453055.432] [val/loss_avg_len_2048: 5.396] [val/perplexity_len_2048: 220.467] [val/loss_avg_len_1024: 5.403] [val/perplexity_len_1024: 222.034] [val/loss_avg_len_512: 5.418] [val/perplexity_len_512: 225.426]
132
+ [2025-10-26 00:01:46][utils:57][INFO] [P: 17.00%] [S: 356515840/2097152000] [T: 0:31:42] [ETA: 2:34:47] [loss: 5.298] [tokens/s: 182674.312] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
133
+ [2025-10-26 00:02:55][utils:57][INFO] [P: 18.00%] [S: 377487360/2097152000] [T: 0:32:51] [ETA: 2:29:41] [loss: 5.267] [tokens/s: 198435.832] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
134
+ [2025-10-26 00:02:55][train:194][INFO] Running validation...
135
+ [2025-10-26 00:04:26][logger:171][INFO] [step: 377487360] [val/train_token_count: 377487360] [val/train_batch_count: 180] [val/train_flop_count: 0] [val/train_total_time: 1971.492] [val/train_update_time: 1245.788] [val/loss: 5.258] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.322] [val/val_tokens_per_second: 453486.621] [val/loss_avg_len_2048: 5.258] [val/perplexity_len_2048: 192.005] [val/loss_avg_len_1024: 5.266] [val/perplexity_len_1024: 193.543] [val/loss_avg_len_512: 5.282] [val/perplexity_len_512: 196.771]
136
+ [2025-10-26 00:05:35][utils:57][INFO] [P: 19.00%] [S: 398458880/2097152000] [T: 0:35:30] [ETA: 2:31:24] [loss: 5.241] [tokens/s: 182665.333] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
137
+ [2025-10-26 00:06:44][utils:57][INFO] [P: 20.00%] [S: 419430400/2097152000] [T: 0:36:40] [ETA: 2:26:40] [loss: 5.149] [tokens/s: 198504.975] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
138
+ [2025-10-26 00:06:44][logger:171][INFO] [step: 419430400] [train_eval/train_token_count: 419430400] [train_eval/train_batch_count: 200] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2200.083] [train_eval/train_update_time: 1383.789] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.297] [train_eval/perplexity_len_2048: 199.721] [train_eval/loss_avg_len_1024: 5.305] [train_eval/perplexity_len_1024: 201.409] [train_eval/loss_avg_len_512: 5.320] [train_eval/perplexity_len_512: 204.484]
139
+ [2025-10-26 00:06:44][train:194][INFO] Running validation...
140
+ [2025-10-26 00:08:15][logger:171][INFO] [step: 419430400] [val/train_token_count: 419430400] [val/train_batch_count: 200] [val/train_flop_count: 0] [val/train_total_time: 2200.083] [val/train_update_time: 1383.789] [val/loss: 5.151] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.410] [val/val_tokens_per_second: 453049.139] [val/loss_avg_len_2048: 5.151] [val/perplexity_len_2048: 172.553] [val/loss_avg_len_1024: 5.159] [val/perplexity_len_1024: 174.054] [val/loss_avg_len_512: 5.177] [val/perplexity_len_512: 177.220]
141
+ [2025-10-26 00:08:15][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000419430400.pt...
142
+ [2025-10-26 00:08:15][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000419430400.pt.
143
+ [2025-10-26 00:08:15][logger:171][INFO] [step: 419430400] [checkpoint/checkpoint_time: 0.421]
144
+ [2025-10-26 00:09:24][utils:57][INFO] [P: 21.00%] [S: 440401920/2097152000] [T: 0:39:20] [ETA: 2:27:58] [loss: 5.102] [tokens/s: 182638.988] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
145
+ [2025-10-26 00:10:33][utils:57][INFO] [P: 22.00%] [S: 461373440/2097152000] [T: 0:40:29] [ETA: 2:23:32] [loss: 5.073] [tokens/s: 198400.131] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
146
+ [2025-10-26 00:10:33][train:194][INFO] Running validation...
147
+ [2025-10-26 00:12:04][logger:171][INFO] [step: 461373440] [val/train_token_count: 461373440] [val/train_batch_count: 220] [val/train_flop_count: 0] [val/train_total_time: 2429.164] [val/train_update_time: 1521.777] [val/loss: 5.064] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.792] [val/val_tokens_per_second: 451142.338] [val/loss_avg_len_2048: 5.064] [val/perplexity_len_2048: 158.159] [val/loss_avg_len_1024: 5.073] [val/perplexity_len_1024: 159.659] [val/loss_avg_len_512: 5.092] [val/perplexity_len_512: 162.754]
148
+ [2025-10-26 00:13:13][utils:57][INFO] [P: 23.00%] [S: 482344960/2097152000] [T: 0:43:09] [ETA: 2:24:27] [loss: 5.013] [tokens/s: 182557.417] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
149
+ [2025-10-26 00:14:22][utils:57][INFO] [P: 24.00%] [S: 503316480/2097152000] [T: 0:44:18] [ETA: 2:20:17] [loss: 4.965] [tokens/s: 198351.326] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
150
+ [2025-10-26 00:14:22][train:194][INFO] Running validation...
151
+ [2025-10-26 00:15:53][logger:171][INFO] [step: 503316480] [val/train_token_count: 503316480] [val/train_batch_count: 240] [val/train_flop_count: 0] [val/train_total_time: 2658.232] [val/train_update_time: 1659.785] [val/loss: 4.985] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 91.045] [val/val_tokens_per_second: 449885.595] [val/loss_avg_len_2048: 4.985] [val/perplexity_len_2048: 146.276] [val/loss_avg_len_1024: 4.996] [val/perplexity_len_1024: 147.785] [val/loss_avg_len_512: 5.016] [val/perplexity_len_512: 150.831]
152
+ [2025-10-26 00:17:02][utils:57][INFO] [P: 25.00%] [S: 524288000/2097152000] [T: 0:46:58] [ETA: 2:20:55] [loss: 4.962] [tokens/s: 182476.527] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
153
+ [2025-10-26 00:17:02][logger:171][INFO] [step: 524288000] [train_eval/train_token_count: 524288000] [train_eval/train_batch_count: 250] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2818.422] [train_eval/train_update_time: 1728.795] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.045] [train_eval/perplexity_len_2048: 155.292] [train_eval/loss_avg_len_1024: 5.053] [train_eval/perplexity_len_1024: 156.556] [train_eval/loss_avg_len_512: 5.071] [train_eval/perplexity_len_512: 159.272]
154
+ [2025-10-26 00:18:12][utils:57][INFO] [P: 26.00%] [S: 545259520/2097152000] [T: 0:48:07] [ETA: 2:16:58] [loss: 4.912] [tokens/s: 198222.713] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
155
+ [2025-10-26 00:18:12][train:194][INFO] Running validation...
156
+ [2025-10-26 00:19:42][logger:171][INFO] [step: 545259520] [val/train_token_count: 545259520] [val/train_batch_count: 260] [val/train_flop_count: 0] [val/train_total_time: 2887.564] [val/train_update_time: 1797.818] [val/loss: 4.916] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.709] [val/val_tokens_per_second: 451552.828] [val/loss_avg_len_2048: 4.916] [val/perplexity_len_2048: 136.521] [val/loss_avg_len_1024: 4.927] [val/perplexity_len_1024: 137.983] [val/loss_avg_len_512: 4.948] [val/perplexity_len_512: 140.922]
157
+ [2025-10-26 00:20:51][utils:57][INFO] [P: 27.00%] [S: 566231040/2097152000] [T: 0:50:47] [ETA: 2:17:19] [loss: 4.898] [tokens/s: 182422.584] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
158
+ [2025-10-26 00:22:01][utils:57][INFO] [P: 28.00%] [S: 587202560/2097152000] [T: 0:51:56] [ETA: 2:13:33] [loss: 4.849] [tokens/s: 198153.306] [batches/s: 0.094] [MFU: 0.000] [TFLOPS: 0.000]
159
+ [2025-10-26 00:22:01][train:194][INFO] Running validation...
160
+ [2025-10-26 00:23:31][logger:171][INFO] [step: 587202560] [val/train_token_count: 587202560] [val/train_batch_count: 280] [val/train_flop_count: 0] [val/train_total_time: 3116.515] [val/train_update_time: 1935.813] [val/loss: 4.863] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.400] [val/val_tokens_per_second: 453094.867] [val/loss_avg_len_2048: 4.863] [val/perplexity_len_2048: 129.424] [val/loss_avg_len_1024: 4.874] [val/perplexity_len_1024: 130.908] [val/loss_avg_len_512: 4.897] [val/perplexity_len_512: 133.851]
161
+ [2025-10-26 00:24:40][utils:57][INFO] [P: 29.00%] [S: 608174080/2097152000] [T: 0:54:36] [ETA: 2:13:40] [loss: 4.817] [tokens/s: 182416.353] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
162
+ [2025-10-26 00:25:49][utils:57][INFO] [P: 30.00%] [S: 629145600/2097152000] [T: 0:55:45] [ETA: 2:10:05] [loss: 4.797] [tokens/s: 198238.394] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
163
+ [2025-10-26 00:25:49][logger:171][INFO] [step: 629145600] [train_eval/train_token_count: 629145600] [train_eval/train_batch_count: 300] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3345.162] [train_eval/train_update_time: 2073.822] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.881] [train_eval/perplexity_len_2048: 131.799] [train_eval/loss_avg_len_1024: 4.889] [train_eval/perplexity_len_1024: 132.870] [train_eval/loss_avg_len_512: 4.908] [train_eval/perplexity_len_512: 135.408]
164
+ [2025-10-26 00:25:49][train:194][INFO] Running validation...
165
+ [2025-10-26 00:27:19][logger:171][INFO] [step: 629145600] [val/train_token_count: 629145600] [val/train_batch_count: 300] [val/train_flop_count: 0] [val/train_total_time: 3345.162] [val/train_update_time: 2073.822] [val/loss: 4.812] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.229] [val/val_tokens_per_second: 453957.229] [val/loss_avg_len_2048: 4.812] [val/perplexity_len_2048: 122.919] [val/loss_avg_len_1024: 4.823] [val/perplexity_len_1024: 124.363] [val/loss_avg_len_512: 4.846] [val/perplexity_len_512: 127.252]
166
+ [2025-10-26 00:27:19][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000629145600.pt...
167
+ [2025-10-26 00:27:20][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000629145600.pt.
168
+ [2025-10-26 00:27:20][logger:171][INFO] [step: 629145600] [checkpoint/checkpoint_time: 0.417]
169
+ [2025-10-26 00:28:29][utils:57][INFO] [P: 31.00%] [S: 650117120/2097152000] [T: 0:58:24] [ETA: 2:10:01] [loss: 4.811] [tokens/s: 182446.882] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
170
+ [2025-10-26 00:29:38][utils:57][INFO] [P: 32.00%] [S: 671088640/2097152000] [T: 0:59:34] [ETA: 2:06:34] [loss: 4.743] [tokens/s: 198268.603] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
171
+ [2025-10-26 00:29:38][train:194][INFO] Running validation...
172
+ [2025-10-26 00:31:09][logger:171][INFO] [step: 671088640] [val/train_token_count: 671088640] [val/train_batch_count: 320] [val/train_flop_count: 0] [val/train_total_time: 3574.040] [val/train_update_time: 2211.816] [val/loss: 4.761] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.520] [val/val_tokens_per_second: 452498.071] [val/loss_avg_len_2048: 4.761] [val/perplexity_len_2048: 116.815] [val/loss_avg_len_1024: 4.773] [val/perplexity_len_1024: 118.254] [val/loss_avg_len_512: 4.797] [val/perplexity_len_512: 121.108]
173
+ [2025-10-26 00:32:18][utils:57][INFO] [P: 33.00%] [S: 692060160/2097152000] [T: 1:02:13] [ETA: 2:06:20] [loss: 4.760] [tokens/s: 182492.029] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
174
+ [2025-10-26 00:33:27][utils:57][INFO] [P: 34.00%] [S: 713031680/2097152000] [T: 1:03:22] [ETA: 2:03:01] [loss: 4.725] [tokens/s: 198368.660] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
175
+ [2025-10-26 00:33:27][train:194][INFO] Running validation...
176
+ [2025-10-26 00:34:57][logger:171][INFO] [step: 713031680] [val/train_token_count: 713031680] [val/train_batch_count: 340] [val/train_flop_count: 0] [val/train_total_time: 3802.836] [val/train_update_time: 2349.827] [val/loss: 4.719] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.552] [val/val_tokens_per_second: 452336.201] [val/loss_avg_len_2048: 4.719] [val/perplexity_len_2048: 112.082] [val/loss_avg_len_1024: 4.732] [val/perplexity_len_1024: 113.528] [val/loss_avg_len_512: 4.757] [val/perplexity_len_512: 116.347]
177
+ [2025-10-26 00:36:07][utils:57][INFO] [P: 35.00%] [S: 734003200/2097152000] [T: 1:06:02] [ETA: 2:02:38] [loss: 4.704] [tokens/s: 182574.869] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
178
+ [2025-10-26 00:36:07][logger:171][INFO] [step: 734003200] [train_eval/train_token_count: 734003200] [train_eval/train_batch_count: 350] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3962.515] [train_eval/train_update_time: 2418.827] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.753] [train_eval/perplexity_len_2048: 115.879] [train_eval/loss_avg_len_1024: 4.764] [train_eval/perplexity_len_1024: 117.193] [train_eval/loss_avg_len_512: 4.786] [train_eval/perplexity_len_512: 119.779]
179
+ [2025-10-26 00:37:16][utils:57][INFO] [P: 36.00%] [S: 754974720/2097152000] [T: 1:07:11] [ETA: 1:59:27] [loss: 4.631] [tokens/s: 198404.891] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
180
+ [2025-10-26 00:37:16][train:194][INFO] Running validation...
181
+ [2025-10-26 00:38:46][logger:171][INFO] [step: 754974720] [val/train_token_count: 754974720] [val/train_batch_count: 360] [val/train_flop_count: 0] [val/train_total_time: 4031.641] [val/train_update_time: 2487.827] [val/loss: 4.676] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.490] [val/val_tokens_per_second: 452647.181] [val/loss_avg_len_2048: 4.676] [val/perplexity_len_2048: 107.379] [val/loss_avg_len_1024: 4.690] [val/perplexity_len_1024: 108.835] [val/loss_avg_len_512: 4.715] [val/perplexity_len_512: 111.660]
182
+ [2025-10-26 00:39:55][utils:57][INFO] [P: 37.00%] [S: 775946240/2097152000] [T: 1:09:51] [ETA: 1:58:56] [loss: 4.656] [tokens/s: 182612.630] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
183
+ [2025-10-26 00:41:04][utils:57][INFO] [P: 38.00%] [S: 796917760/2097152000] [T: 1:11:00] [ETA: 1:55:51] [loss: 4.634] [tokens/s: 198387.213] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
184
+ [2025-10-26 00:41:04][train:194][INFO] Running validation...
185
+ [2025-10-26 00:42:35][logger:171][INFO] [step: 796917760] [val/train_token_count: 796917760] [val/train_batch_count: 380] [val/train_flop_count: 0] [val/train_total_time: 4260.374] [val/train_update_time: 2625.819] [val/loss: 4.640] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.780] [val/val_tokens_per_second: 451199.036] [val/loss_avg_len_2048: 4.640] [val/perplexity_len_2048: 103.564] [val/loss_avg_len_1024: 4.654] [val/perplexity_len_1024: 105.014] [val/loss_avg_len_512: 4.680] [val/perplexity_len_512: 107.815]
186
+ [2025-10-26 00:43:44][utils:57][INFO] [P: 39.00%] [S: 817889280/2097152000] [T: 1:13:40] [ETA: 1:55:13] [loss: 4.640] [tokens/s: 182547.275] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
187
+ [2025-10-26 00:44:53][utils:57][INFO] [P: 40.00%] [S: 838860800/2097152000] [T: 1:14:49] [ETA: 1:52:14] [loss: 4.548] [tokens/s: 198354.370] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
188
+ [2025-10-26 00:44:53][logger:171][INFO] [step: 838860800] [train_eval/train_token_count: 838860800] [train_eval/train_batch_count: 400] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4489.443] [train_eval/train_update_time: 2763.837] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.653] [train_eval/perplexity_len_2048: 104.915] [train_eval/loss_avg_len_1024: 4.664] [train_eval/perplexity_len_1024: 106.073] [train_eval/loss_avg_len_512: 4.688] [train_eval/perplexity_len_512: 108.620]
189
+ [2025-10-26 00:44:53][train:194][INFO] Running validation...
190
+ [2025-10-26 00:46:24][logger:171][INFO] [step: 838860800] [val/train_token_count: 838860800] [val/train_batch_count: 400] [val/train_flop_count: 0] [val/train_total_time: 4489.443] [val/train_update_time: 2763.837] [val/loss: 4.608] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.745] [val/val_tokens_per_second: 451376.404] [val/loss_avg_len_2048: 4.608] [val/perplexity_len_2048: 100.291] [val/loss_avg_len_1024: 4.623] [val/perplexity_len_1024: 101.767] [val/loss_avg_len_512: 4.650] [val/perplexity_len_512: 104.609]
191
+ [2025-10-26 00:46:24][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000838860800.pt...
192
+ [2025-10-26 00:46:25][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000838860800.pt.
193
+ [2025-10-26 00:46:25][logger:171][INFO] [step: 838860800] [checkpoint/checkpoint_time: 0.428]
194
+ [2025-10-26 00:47:34][utils:57][INFO] [P: 41.00%] [S: 859832320/2097152000] [T: 1:17:29] [ETA: 1:51:31] [loss: 4.572] [tokens/s: 182455.996] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
195
+ [2025-10-26 00:48:43][utils:57][INFO] [P: 42.00%] [S: 880803840/2097152000] [T: 1:18:38] [ETA: 1:48:36] [loss: 4.565] [tokens/s: 198225.170] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
196
+ [2025-10-26 00:48:43][train:194][INFO] Running validation...
197
+ [2025-10-26 00:50:14][logger:171][INFO] [step: 880803840] [val/train_token_count: 880803840] [val/train_batch_count: 420] [val/train_flop_count: 0] [val/train_total_time: 4718.872] [val/train_update_time: 2901.840] [val/loss: 4.577] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.859] [val/val_tokens_per_second: 450810.091] [val/loss_avg_len_2048: 4.577] [val/perplexity_len_2048: 97.256] [val/loss_avg_len_1024: 4.593] [val/perplexity_len_1024: 98.753] [val/loss_avg_len_512: 4.621] [val/perplexity_len_512: 101.602]
198
+ [2025-10-26 00:51:23][utils:57][INFO] [P: 43.00%] [S: 901775360/2097152000] [T: 1:21:18] [ETA: 1:47:47] [loss: 4.560] [tokens/s: 182399.733] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
199
+ [2025-10-26 00:52:32][utils:57][INFO] [P: 44.00%] [S: 922746880/2097152000] [T: 1:22:27] [ETA: 1:44:57] [loss: 4.587] [tokens/s: 198169.280] [batches/s: 0.094] [MFU: 0.000] [TFLOPS: 0.000]
200
+ [2025-10-26 00:52:32][train:194][INFO] Running validation...
201
+ [2025-10-26 00:54:03][logger:171][INFO] [step: 922746880] [val/train_token_count: 922746880] [val/train_batch_count: 440] [val/train_flop_count: 0] [val/train_total_time: 4947.998] [val/train_update_time: 3039.860] [val/loss: 4.550] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.510] [val/val_tokens_per_second: 452545.653] [val/loss_avg_len_2048: 4.550] [val/perplexity_len_2048: 94.613] [val/loss_avg_len_1024: 4.566] [val/perplexity_len_1024: 96.111] [val/loss_avg_len_512: 4.595] [val/perplexity_len_512: 98.972]
202
+ [2025-10-26 00:55:12][utils:57][INFO] [P: 45.00%] [S: 943718400/2097152000] [T: 1:25:07] [ETA: 1:44:02] [loss: 4.528] [tokens/s: 182405.880] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
203
+ [2025-10-26 00:55:12][logger:171][INFO] [step: 943718400] [train_eval/train_token_count: 943718400] [train_eval/train_batch_count: 450] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5107.660] [train_eval/train_update_time: 3108.876] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.569] [train_eval/perplexity_len_2048: 96.495] [train_eval/loss_avg_len_1024: 4.584] [train_eval/perplexity_len_1024: 97.918] [train_eval/loss_avg_len_512: 4.612] [train_eval/perplexity_len_512: 100.712]
204
+ [2025-10-26 00:56:21][utils:57][INFO] [P: 46.00%] [S: 964689920/2097152000] [T: 1:26:16] [ETA: 1:41:17] [loss: 4.497] [tokens/s: 198142.855] [batches/s: 0.094] [MFU: 0.000] [TFLOPS: 0.000]
205
+ [2025-10-26 00:56:21][train:194][INFO] Running validation...
206
+ [2025-10-26 00:57:52][logger:171][INFO] [step: 964689920] [val/train_token_count: 964689920] [val/train_batch_count: 460] [val/train_flop_count: 0] [val/train_total_time: 5176.881] [val/train_update_time: 3177.900] [val/loss: 4.520] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.758] [val/val_tokens_per_second: 451311.936] [val/loss_avg_len_2048: 4.520] [val/perplexity_len_2048: 91.875] [val/loss_avg_len_1024: 4.537] [val/perplexity_len_1024: 93.391] [val/loss_avg_len_512: 4.567] [val/perplexity_len_512: 96.279]
207
+ [2025-10-26 00:59:01][utils:57][INFO] [P: 47.00%] [S: 985661440/2097152000] [T: 1:28:56] [ETA: 1:40:18] [loss: 4.519] [tokens/s: 182344.566] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
208
+ [2025-10-26 01:00:10][utils:57][INFO] [P: 48.00%] [S: 1006632960/2097152000] [T: 1:30:05] [ETA: 1:37:36] [loss: 4.495] [tokens/s: 198140.860] [batches/s: 0.094] [MFU: 0.000] [TFLOPS: 0.000]
209
+ [2025-10-26 01:00:10][train:194][INFO] Running validation...
210
+ [2025-10-26 01:01:41][logger:171][INFO] [step: 1006632960] [val/train_token_count: 1006632960] [val/train_batch_count: 480] [val/train_flop_count: 0] [val/train_total_time: 5405.913] [val/train_update_time: 3315.925] [val/loss: 4.493] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.736] [val/val_tokens_per_second: 451418.210] [val/loss_avg_len_2048: 4.493] [val/perplexity_len_2048: 89.360] [val/loss_avg_len_1024: 4.510] [val/perplexity_len_1024: 90.887] [val/loss_avg_len_512: 4.541] [val/perplexity_len_512: 93.798]
211
+ [2025-10-26 01:02:50][utils:57][INFO] [P: 49.00%] [S: 1027604480/2097152000] [T: 1:32:45] [ETA: 1:36:32] [loss: 4.491] [tokens/s: 182349.959] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
212
+ [2025-10-26 01:03:59][utils:57][INFO] [P: 50.00%] [S: 1048576000/2097152000] [T: 1:33:54] [ETA: 1:33:54] [loss: 4.464] [tokens/s: 198227.583] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
213
+ [2025-10-26 01:03:59][logger:171][INFO] [step: 1048576000] [train_eval/train_token_count: 1048576000] [train_eval/train_batch_count: 500] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5634.922] [train_eval/train_update_time: 3453.965] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.500] [train_eval/perplexity_len_2048: 90.027] [train_eval/loss_avg_len_1024: 4.515] [train_eval/perplexity_len_1024: 91.341] [train_eval/loss_avg_len_512: 4.545] [train_eval/perplexity_len_512: 94.162]
214
+ [2025-10-26 01:03:59][train:194][INFO] Running validation...
215
+ [2025-10-26 01:05:29][logger:171][INFO] [step: 1048576000] [val/train_token_count: 1048576000] [val/train_batch_count: 500] [val/train_flop_count: 0] [val/train_total_time: 5634.922] [val/train_update_time: 3453.965] [val/loss: 4.469] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.435] [val/val_tokens_per_second: 452922.884] [val/loss_avg_len_2048: 4.469] [val/perplexity_len_2048: 87.242] [val/loss_avg_len_1024: 4.486] [val/perplexity_len_1024: 88.788] [val/loss_avg_len_512: 4.519] [val/perplexity_len_512: 91.727]
216
+ [2025-10-26 01:05:29][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001048576000.pt...
217
+ [2025-10-26 01:05:30][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001048576000.pt.
218
+ [2025-10-26 01:05:30][logger:171][INFO] [step: 1048576000] [checkpoint/checkpoint_time: 0.413]
219
+ [2025-10-26 01:06:39][utils:57][INFO] [P: 51.00%] [S: 1069547520/2097152000] [T: 1:36:34] [ETA: 1:32:47] [loss: 4.463] [tokens/s: 182402.852] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
220
+ [2025-10-26 01:07:48][utils:57][INFO] [P: 52.00%] [S: 1090519040/2097152000] [T: 1:37:44] [ETA: 1:30:12] [loss: 4.457] [tokens/s: 198223.784] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
221
+ [2025-10-26 01:07:48][train:194][INFO] Running validation...
222
+ [2025-10-26 01:09:19][logger:171][INFO] [step: 1090519040] [val/train_token_count: 1090519040] [val/train_batch_count: 520] [val/train_flop_count: 0] [val/train_total_time: 5864.051] [val/train_update_time: 3591.998] [val/loss: 4.446] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.546] [val/val_tokens_per_second: 452366.332] [val/loss_avg_len_2048: 4.446] [val/perplexity_len_2048: 85.290] [val/loss_avg_len_1024: 4.464] [val/perplexity_len_1024: 86.871] [val/loss_avg_len_512: 4.498] [val/perplexity_len_512: 89.857]
223
+ [2025-10-26 01:10:28][utils:57][INFO] [P: 53.00%] [S: 1111490560/2097152000] [T: 1:40:23] [ETA: 1:29:01] [loss: 4.414] [tokens/s: 182452.528] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
224
+ [2025-10-26 01:11:37][utils:57][INFO] [P: 54.00%] [S: 1132462080/2097152000] [T: 1:41:32] [ETA: 1:26:30] [loss: 4.424] [tokens/s: 198219.905] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
225
+ [2025-10-26 01:11:37][train:194][INFO] Running validation...
226
+ [2025-10-26 01:13:07][logger:171][INFO] [step: 1132462080] [val/train_token_count: 1132462080] [val/train_batch_count: 540] [val/train_flop_count: 0] [val/train_total_time: 6092.851] [val/train_update_time: 3730.021] [val/loss: 4.420] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.329] [val/val_tokens_per_second: 453455.804] [val/loss_avg_len_2048: 4.420] [val/perplexity_len_2048: 83.101] [val/loss_avg_len_1024: 4.439] [val/perplexity_len_1024: 84.710] [val/loss_avg_len_512: 4.474] [val/perplexity_len_512: 87.723]
227
+ [2025-10-26 01:14:16][utils:57][INFO] [P: 55.00%] [S: 1153433600/2097152000] [T: 1:44:12] [ETA: 1:25:15] [loss: 4.370] [tokens/s: 182483.006] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
228
+ [2025-10-26 01:14:16][logger:171][INFO] [step: 1153433600] [train_eval/train_token_count: 1153433600] [train_eval/train_batch_count: 550] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6252.331] [train_eval/train_update_time: 3799.045] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.434] [train_eval/perplexity_len_2048: 84.302] [train_eval/loss_avg_len_1024: 4.449] [train_eval/perplexity_len_1024: 85.517] [train_eval/loss_avg_len_512: 4.480] [train_eval/perplexity_len_512: 88.204]
229
+ [2025-10-26 01:15:25][utils:57][INFO] [P: 56.00%] [S: 1174405120/2097152000] [T: 1:45:21] [ETA: 1:22:46] [loss: 4.414] [tokens/s: 198315.814] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
230
+ [2025-10-26 01:15:25][train:194][INFO] Running validation...
231
+ [2025-10-26 01:16:56][logger:171][INFO] [step: 1174405120] [val/train_token_count: 1174405120] [val/train_batch_count: 560] [val/train_flop_count: 0] [val/train_total_time: 6321.474] [val/train_update_time: 3868.057] [val/loss: 4.398] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.346] [val/val_tokens_per_second: 453369.764] [val/loss_avg_len_2048: 4.398] [val/perplexity_len_2048: 81.305] [val/loss_avg_len_1024: 4.418] [val/perplexity_len_1024: 82.939] [val/loss_avg_len_512: 4.454] [val/perplexity_len_512: 86.012]
232
+ [2025-10-26 01:18:05][utils:57][INFO] [P: 57.00%] [S: 1195376640/2097152000] [T: 1:48:00] [ETA: 1:21:29] [loss: 4.362] [tokens/s: 182558.830] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
233
+ [2025-10-26 01:19:14][utils:57][INFO] [P: 58.00%] [S: 1216348160/2097152000] [T: 1:49:10] [ETA: 1:19:03] [loss: 4.410] [tokens/s: 198388.831] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
234
+ [2025-10-26 01:19:14][train:194][INFO] Running validation...
235
+ [2025-10-26 01:20:44][logger:171][INFO] [step: 1216348160] [val/train_token_count: 1216348160] [val/train_batch_count: 580] [val/train_flop_count: 0] [val/train_total_time: 6550.098] [val/train_update_time: 4006.073] [val/loss: 4.376] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.329] [val/val_tokens_per_second: 453451.140] [val/loss_avg_len_2048: 4.376] [val/perplexity_len_2048: 79.530] [val/loss_avg_len_1024: 4.397] [val/perplexity_len_1024: 81.202] [val/loss_avg_len_512: 4.435] [val/perplexity_len_512: 84.355]
236
+ [2025-10-26 01:21:54][utils:57][INFO] [P: 59.00%] [S: 1237319680/2097152000] [T: 1:51:49] [ETA: 1:17:42] [loss: 4.391] [tokens/s: 182619.459] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
237
+ [2025-10-26 01:23:03][utils:57][INFO] [P: 60.00%] [S: 1258291200/2097152000] [T: 1:52:58] [ETA: 1:15:19] [loss: 4.391] [tokens/s: 198482.788] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
238
+ [2025-10-26 01:23:03][logger:171][INFO] [step: 1258291200] [train_eval/train_token_count: 1258291200] [train_eval/train_batch_count: 600] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6778.730] [train_eval/train_update_time: 4144.121] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.379] [train_eval/perplexity_len_2048: 79.720] [train_eval/loss_avg_len_1024: 4.394] [train_eval/perplexity_len_1024: 80.957] [train_eval/loss_avg_len_512: 4.429] [train_eval/perplexity_len_512: 83.854]
239
+ [2025-10-26 01:23:03][train:194][INFO] Running validation...
240
+ [2025-10-26 01:24:33][logger:171][INFO] [step: 1258291200] [val/train_token_count: 1258291200] [val/train_batch_count: 600] [val/train_flop_count: 0] [val/train_total_time: 6778.730] [val/train_update_time: 4144.121] [val/loss: 4.356] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.371] [val/val_tokens_per_second: 453242.896] [val/loss_avg_len_2048: 4.356] [val/perplexity_len_2048: 77.911] [val/loss_avg_len_1024: 4.377] [val/perplexity_len_1024: 79.617] [val/loss_avg_len_512: 4.417] [val/perplexity_len_512: 82.825]
241
+ [2025-10-26 01:24:33][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001258291200.pt...
242
+ [2025-10-26 01:24:34][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001258291200.pt.
243
+ [2025-10-26 01:24:34][logger:171][INFO] [step: 1258291200] [checkpoint/checkpoint_time: 0.425]
244
+ [2025-10-26 01:25:43][utils:57][INFO] [P: 61.00%] [S: 1279262720/2097152000] [T: 1:55:38] [ETA: 1:13:56] [loss: 4.355] [tokens/s: 182630.663] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
245
+ [2025-10-26 01:26:52][utils:57][INFO] [P: 62.00%] [S: 1300234240/2097152000] [T: 1:56:47] [ETA: 1:11:35] [loss: 4.330] [tokens/s: 198437.564] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
246
+ [2025-10-26 01:26:52][train:194][INFO] Running validation...
247
+ [2025-10-26 01:28:22][logger:171][INFO] [step: 1300234240] [val/train_token_count: 1300234240] [val/train_batch_count: 620] [val/train_flop_count: 0] [val/train_total_time: 7007.788] [val/train_update_time: 4282.136] [val/loss: 4.335] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.333] [val/val_tokens_per_second: 453432.242] [val/loss_avg_len_2048: 4.335] [val/perplexity_len_2048: 76.355] [val/loss_avg_len_1024: 4.358] [val/perplexity_len_1024: 78.108] [val/loss_avg_len_512: 4.399] [val/perplexity_len_512: 81.393]
248
+ [2025-10-26 01:29:31][utils:57][INFO] [P: 63.00%] [S: 1321205760/2097152000] [T: 1:59:27] [ETA: 1:10:09] [loss: 4.340] [tokens/s: 182663.673] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
249
+ [2025-10-26 01:30:40][utils:57][INFO] [P: 64.00%] [S: 1342177280/2097152000] [T: 2:00:36] [ETA: 1:07:50] [loss: 4.330] [tokens/s: 198432.802] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
250
+ [2025-10-26 01:30:40][train:194][INFO] Running validation...
251
+ [2025-10-26 01:32:11][logger:171][INFO] [step: 1342177280] [val/train_token_count: 1342177280] [val/train_batch_count: 640] [val/train_flop_count: 0] [val/train_total_time: 7236.399] [val/train_update_time: 4420.153] [val/loss: 4.316] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.292] [val/val_tokens_per_second: 453637.946] [val/loss_avg_len_2048: 4.316] [val/perplexity_len_2048: 74.908] [val/loss_avg_len_1024: 4.340] [val/perplexity_len_1024: 76.702] [val/loss_avg_len_512: 4.383] [val/perplexity_len_512: 80.063]
252
+ [2025-10-26 01:33:20][utils:57][INFO] [P: 65.00%] [S: 1363148800/2097152000] [T: 2:03:15] [ETA: 1:06:22] [loss: 4.304] [tokens/s: 182657.918] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
253
+ [2025-10-26 01:33:20][logger:171][INFO] [step: 1363148800] [train_eval/train_token_count: 1363148800] [train_eval/train_batch_count: 650] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7395.903] [train_eval/train_update_time: 4489.238] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.333] [train_eval/perplexity_len_2048: 76.161] [train_eval/loss_avg_len_1024: 4.355] [train_eval/perplexity_len_1024: 77.844] [train_eval/loss_avg_len_512: 4.394] [train_eval/perplexity_len_512: 80.986]
254
+ [2025-10-26 01:34:29][utils:57][INFO] [P: 66.00%] [S: 1384120320/2097152000] [T: 2:04:25] [ETA: 1:04:05] [loss: 4.326] [tokens/s: 198431.420] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
255
+ [2025-10-26 01:34:29][train:194][INFO] Running validation...
256
+ [2025-10-26 01:35:59][logger:171][INFO] [step: 1384120320] [val/train_token_count: 1384120320] [val/train_batch_count: 660] [val/train_flop_count: 0] [val/train_total_time: 7465.047] [val/train_update_time: 4558.262] [val/loss: 4.300] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.423] [val/val_tokens_per_second: 452981.250] [val/loss_avg_len_2048: 4.300] [val/perplexity_len_2048: 73.676] [val/loss_avg_len_1024: 4.324] [val/perplexity_len_1024: 75.497] [val/loss_avg_len_512: 4.368] [val/perplexity_len_512: 78.908]
257
+ [2025-10-26 01:37:09][utils:57][INFO] [P: 67.00%] [S: 1405091840/2097152000] [T: 2:07:04] [ETA: 1:02:35] [loss: 4.270] [tokens/s: 182644.323] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
258
+ [2025-10-26 01:38:18][utils:57][INFO] [P: 68.00%] [S: 1426063360/2097152000] [T: 2:08:13] [ETA: 1:00:20] [loss: 4.284] [tokens/s: 198413.163] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
259
+ [2025-10-26 01:38:18][train:194][INFO] Running validation...
260
+ [2025-10-26 01:39:48][logger:171][INFO] [step: 1426063360] [val/train_token_count: 1426063360] [val/train_batch_count: 680] [val/train_flop_count: 0] [val/train_total_time: 7693.756] [val/train_update_time: 4696.292] [val/loss: 4.283] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.484] [val/val_tokens_per_second: 452677.817] [val/loss_avg_len_2048: 4.283] [val/perplexity_len_2048: 72.475] [val/loss_avg_len_1024: 4.308] [val/perplexity_len_1024: 74.326] [val/loss_avg_len_512: 4.354] [val/perplexity_len_512: 77.797]
261
+ [2025-10-26 01:40:57][utils:57][INFO] [P: 69.00%] [S: 1447034880/2097152000] [T: 2:10:53] [ETA: 0:58:48] [loss: 4.285] [tokens/s: 182623.466] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
262
+ [2025-10-26 01:42:07][utils:57][INFO] [P: 70.00%] [S: 1468006400/2097152000] [T: 2:12:02] [ETA: 0:56:35] [loss: 4.280] [tokens/s: 198479.390] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
263
+ [2025-10-26 01:42:07][logger:171][INFO] [step: 1468006400] [train_eval/train_token_count: 1468006400] [train_eval/train_batch_count: 700] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7922.504] [train_eval/train_update_time: 4834.296] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.286] [train_eval/perplexity_len_2048: 72.709] [train_eval/loss_avg_len_1024: 4.310] [train_eval/perplexity_len_1024: 74.408] [train_eval/loss_avg_len_512: 4.354] [train_eval/perplexity_len_512: 77.806]
264
+ [2025-10-26 01:42:07][train:194][INFO] Running validation...
265
+ [2025-10-26 01:43:37][logger:171][INFO] [step: 1468006400] [val/train_token_count: 1468006400] [val/train_batch_count: 700] [val/train_flop_count: 0] [val/train_total_time: 7922.504] [val/train_update_time: 4834.296] [val/loss: 4.268] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.493] [val/val_tokens_per_second: 452631.090] [val/loss_avg_len_2048: 4.268] [val/perplexity_len_2048: 71.372] [val/loss_avg_len_1024: 4.294] [val/perplexity_len_1024: 73.263] [val/loss_avg_len_512: 4.341] [val/perplexity_len_512: 76.801]
266
+ [2025-10-26 01:43:37][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001468006400.pt...
267
+ [2025-10-26 01:43:37][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001468006400.pt.
268
+ [2025-10-26 01:43:37][logger:171][INFO] [step: 1468006400] [checkpoint/checkpoint_time: 0.418]
269
+ [2025-10-26 01:44:47][utils:57][INFO] [P: 71.00%] [S: 1488977920/2097152000] [T: 2:14:42] [ETA: 0:55:01] [loss: 4.276] [tokens/s: 182603.460] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
270
+ [2025-10-26 01:45:56][utils:57][INFO] [P: 72.00%] [S: 1509949440/2097152000] [T: 2:15:51] [ETA: 0:52:50] [loss: 4.252] [tokens/s: 198359.914] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
271
+ [2025-10-26 01:45:56][train:194][INFO] Running validation...
272
+ [2025-10-26 01:47:26][logger:171][INFO] [step: 1509949440] [val/train_token_count: 1509949440] [val/train_batch_count: 720] [val/train_flop_count: 0] [val/train_total_time: 8151.724] [val/train_update_time: 4972.330] [val/loss: 4.254] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.510] [val/val_tokens_per_second: 452548.331] [val/loss_avg_len_2048: 4.254] [val/perplexity_len_2048: 70.376] [val/loss_avg_len_1024: 4.281] [val/perplexity_len_1024: 72.310] [val/loss_avg_len_512: 4.330] [val/perplexity_len_512: 75.922]
273
+ [2025-10-26 01:48:35][utils:57][INFO] [P: 73.00%] [S: 1530920960/2097152000] [T: 2:18:31] [ETA: 0:51:14] [loss: 4.270] [tokens/s: 182564.512] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
274
+ [2025-10-26 01:49:45][utils:57][INFO] [P: 74.00%] [S: 1551892480/2097152000] [T: 2:19:40] [ETA: 0:49:04] [loss: 4.255] [tokens/s: 198260.601] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
275
+ [2025-10-26 01:49:45][train:194][INFO] Running validation...
276
+ [2025-10-26 01:51:16][logger:171][INFO] [step: 1551892480] [val/train_token_count: 1551892480] [val/train_batch_count: 740] [val/train_flop_count: 0] [val/train_total_time: 8380.819] [val/train_update_time: 5110.375] [val/loss: 4.241] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.755] [val/val_tokens_per_second: 451326.849] [val/loss_avg_len_2048: 4.241] [val/perplexity_len_2048: 69.511] [val/loss_avg_len_1024: 4.269] [val/perplexity_len_1024: 71.462] [val/loss_avg_len_512: 4.319] [val/perplexity_len_512: 75.112]
277
+ [2025-10-26 01:52:25][utils:57][INFO] [P: 75.00%] [S: 1572864000/2097152000] [T: 2:22:20] [ETA: 0:47:26] [loss: 4.237] [tokens/s: 182421.846] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
278
+ [2025-10-26 01:52:25][logger:171][INFO] [step: 1572864000] [train_eval/train_token_count: 1572864000] [train_eval/train_batch_count: 750] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 8540.947] [train_eval/train_update_time: 5179.398] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.249] [train_eval/perplexity_len_2048: 70.062] [train_eval/loss_avg_len_1024: 4.274] [train_eval/perplexity_len_1024: 71.799] [train_eval/loss_avg_len_512: 4.322] [train_eval/perplexity_len_512: 75.318]
279
+ [2025-10-26 01:53:34][utils:57][INFO] [P: 76.00%] [S: 1593835520/2097152000] [T: 2:23:30] [ETA: 0:45:19] [loss: 4.199] [tokens/s: 198122.402] [batches/s: 0.094] [MFU: 0.000] [TFLOPS: 0.000]
280
+ [2025-10-26 01:53:34][train:194][INFO] Running validation...
281
+ [2025-10-26 01:55:05][logger:171][INFO] [step: 1593835520] [val/train_token_count: 1593835520] [val/train_batch_count: 760] [val/train_flop_count: 0] [val/train_total_time: 8610.334] [val/train_update_time: 5248.407] [val/loss: 4.231] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.346] [val/val_tokens_per_second: 453369.123] [val/loss_avg_len_2048: 4.231] [val/perplexity_len_2048: 68.757] [val/loss_avg_len_1024: 4.259] [val/perplexity_len_1024: 70.759] [val/loss_avg_len_512: 4.311] [val/perplexity_len_512: 74.496]
282
+ [2025-10-26 01:56:14][utils:57][INFO] [P: 77.00%] [S: 1614807040/2097152000] [T: 2:26:09] [ETA: 0:43:39] [loss: 4.263] [tokens/s: 182369.045] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
283
+ [2025-10-26 01:57:23][utils:57][INFO] [P: 78.00%] [S: 1635778560/2097152000] [T: 2:27:19] [ETA: 0:41:33] [loss: 4.185] [tokens/s: 198115.776] [batches/s: 0.094] [MFU: 0.000] [TFLOPS: 0.000]
284
+ [2025-10-26 01:57:23][train:194][INFO] Running validation...
285
+ [2025-10-26 01:58:54][logger:171][INFO] [step: 1635778560] [val/train_token_count: 1635778560] [val/train_batch_count: 780] [val/train_flop_count: 0] [val/train_total_time: 8839.132] [val/train_update_time: 5386.446] [val/loss: 4.220] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.385] [val/val_tokens_per_second: 453173.950] [val/loss_avg_len_2048: 4.220] [val/perplexity_len_2048: 68.063] [val/loss_avg_len_1024: 4.249] [val/perplexity_len_1024: 70.070] [val/loss_avg_len_512: 4.302] [val/perplexity_len_512: 73.822]
286
+ [2025-10-26 02:00:03][utils:57][INFO] [P: 79.00%] [S: 1656750080/2097152000] [T: 2:29:58] [ETA: 0:39:52] [loss: 4.214] [tokens/s: 182381.375] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
287
+ [2025-10-26 02:01:12][utils:57][INFO] [P: 80.00%] [S: 1677721600/2097152000] [T: 2:31:07] [ETA: 0:37:46] [loss: 4.192] [tokens/s: 198205.351] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
288
+ [2025-10-26 02:01:12][logger:171][INFO] [step: 1677721600] [train_eval/train_token_count: 1677721600] [train_eval/train_batch_count: 800] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 9067.838] [train_eval/train_update_time: 5524.482] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.222] [train_eval/perplexity_len_2048: 68.188] [train_eval/loss_avg_len_1024: 4.250] [train_eval/perplexity_len_1024: 70.083] [train_eval/loss_avg_len_512: 4.300] [train_eval/perplexity_len_512: 73.695]
289
+ [2025-10-26 02:01:12][train:194][INFO] Running validation...
290
+ [2025-10-26 02:02:42][logger:171][INFO] [step: 1677721600] [val/train_token_count: 1677721600] [val/train_batch_count: 800] [val/train_flop_count: 0] [val/train_total_time: 9067.838] [val/train_update_time: 5524.482] [val/loss: 4.212] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.442] [val/val_tokens_per_second: 452885.529] [val/loss_avg_len_2048: 4.212] [val/perplexity_len_2048: 67.497] [val/loss_avg_len_1024: 4.242] [val/perplexity_len_1024: 69.530] [val/loss_avg_len_512: 4.295] [val/perplexity_len_512: 73.334]
291
+ [2025-10-26 02:02:42][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001677721600.pt...
292
+ [2025-10-26 02:02:43][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001677721600.pt.
293
+ [2025-10-26 02:02:43][logger:171][INFO] [step: 1677721600] [checkpoint/checkpoint_time: 0.421]
294
+ [2025-10-26 02:03:52][utils:57][INFO] [P: 81.00%] [S: 1698693120/2097152000] [T: 2:33:47] [ETA: 0:36:04] [loss: 4.168] [tokens/s: 182382.490] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
295
+ [2025-10-26 02:05:01][utils:57][INFO] [P: 82.00%] [S: 1719664640/2097152000] [T: 2:34:56] [ETA: 0:34:00] [loss: 4.179] [tokens/s: 198141.293] [batches/s: 0.094] [MFU: 0.000] [TFLOPS: 0.000]
296
+ [2025-10-26 02:05:01][train:194][INFO] Running validation...
297
+ [2025-10-26 02:06:31][logger:171][INFO] [step: 1719664640] [val/train_token_count: 1719664640] [val/train_batch_count: 820] [val/train_flop_count: 0] [val/train_total_time: 9296.993] [val/train_update_time: 5662.514] [val/loss: 4.205] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.358] [val/val_tokens_per_second: 453306.293] [val/loss_avg_len_2048: 4.205] [val/perplexity_len_2048: 66.989] [val/loss_avg_len_1024: 4.234] [val/perplexity_len_1024: 69.027] [val/loss_avg_len_512: 4.288] [val/perplexity_len_512: 72.843]
298
+ [2025-10-26 02:07:41][utils:57][INFO] [P: 83.00%] [S: 1740636160/2097152000] [T: 2:37:36] [ETA: 0:32:16] [loss: 4.212] [tokens/s: 182420.625] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
299
+ [2025-10-26 02:08:50][utils:57][INFO] [P: 84.00%] [S: 1761607680/2097152000] [T: 2:38:45] [ETA: 0:30:14] [loss: 4.161] [tokens/s: 198271.199] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
300
+ [2025-10-26 02:08:50][train:194][INFO] Running validation...
301
+ [2025-10-26 02:10:20][logger:171][INFO] [step: 1761607680] [val/train_token_count: 1761607680] [val/train_batch_count: 840] [val/train_flop_count: 0] [val/train_total_time: 9525.670] [val/train_update_time: 5800.553] [val/loss: 4.198] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.729] [val/val_tokens_per_second: 451454.995] [val/loss_avg_len_2048: 4.198] [val/perplexity_len_2048: 66.576] [val/loss_avg_len_1024: 4.229] [val/perplexity_len_1024: 68.627] [val/loss_avg_len_512: 4.283] [val/perplexity_len_512: 72.466]
302
+ [2025-10-26 02:11:30][utils:57][INFO] [P: 85.00%] [S: 1782579200/2097152000] [T: 2:41:25] [ETA: 0:28:29] [loss: 4.230] [tokens/s: 182491.313] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
303
+ [2025-10-26 02:11:30][logger:171][INFO] [step: 1782579200] [train_eval/train_token_count: 1782579200] [train_eval/train_batch_count: 850] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 9685.575] [train_eval/train_update_time: 5869.583] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.195] [train_eval/perplexity_len_2048: 66.370] [train_eval/loss_avg_len_1024: 4.218] [train_eval/perplexity_len_1024: 67.917] [train_eval/loss_avg_len_512: 4.271] [train_eval/perplexity_len_512: 71.572]
304
+ [2025-10-26 02:12:39][utils:57][INFO] [P: 86.00%] [S: 1803550720/2097152000] [T: 2:42:34] [ETA: 0:26:27] [loss: 4.209] [tokens/s: 198284.923] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
305
+ [2025-10-26 02:12:39][train:194][INFO] Running validation...
306
+ [2025-10-26 02:14:09][logger:171][INFO] [step: 1803550720] [val/train_token_count: 1803550720] [val/train_batch_count: 860] [val/train_flop_count: 0] [val/train_total_time: 9754.736] [val/train_update_time: 5938.595] [val/loss: 4.193] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.749] [val/val_tokens_per_second: 451353.559] [val/loss_avg_len_2048: 4.193] [val/perplexity_len_2048: 66.246] [val/loss_avg_len_1024: 4.224] [val/perplexity_len_1024: 68.309] [val/loss_avg_len_512: 4.279] [val/perplexity_len_512: 72.173]
307
+ [2025-10-26 02:15:19][utils:57][INFO] [P: 87.00%] [S: 1824522240/2097152000] [T: 2:45:14] [ETA: 0:24:41] [loss: 4.155] [tokens/s: 182484.674] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
308
+ [2025-10-26 02:16:28][utils:57][INFO] [P: 88.00%] [S: 1845493760/2097152000] [T: 2:46:23] [ETA: 0:22:41] [loss: 4.173] [tokens/s: 198233.899] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
309
+ [2025-10-26 02:16:28][train:194][INFO] Running validation...
310
+ [2025-10-26 02:17:58][logger:171][INFO] [step: 1845493760] [val/train_token_count: 1845493760] [val/train_batch_count: 880] [val/train_flop_count: 0] [val/train_total_time: 9983.788] [val/train_update_time: 6076.625] [val/loss: 4.189] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.679] [val/val_tokens_per_second: 451703.453] [val/loss_avg_len_2048: 4.189] [val/perplexity_len_2048: 65.985] [val/loss_avg_len_1024: 4.220] [val/perplexity_len_1024: 68.047] [val/loss_avg_len_512: 4.276] [val/perplexity_len_512: 71.917]
311
+ [2025-10-26 02:19:08][utils:57][INFO] [P: 89.00%] [S: 1866465280/2097152000] [T: 2:49:03] [ETA: 0:20:53] [loss: 4.228] [tokens/s: 182438.062] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
312
+ [2025-10-26 02:20:17][utils:57][INFO] [P: 90.00%] [S: 1887436800/2097152000] [T: 2:50:12] [ETA: 0:18:54] [loss: 4.143] [tokens/s: 198271.009] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
313
+ [2025-10-26 02:20:17][logger:171][INFO] [step: 1887436800] [train_eval/train_token_count: 1887436800] [train_eval/train_batch_count: 900] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 10212.780] [train_eval/train_update_time: 6214.660] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.189] [train_eval/perplexity_len_2048: 65.932] [train_eval/loss_avg_len_1024: 4.215] [train_eval/perplexity_len_1024: 67.699] [train_eval/loss_avg_len_512: 4.269] [train_eval/perplexity_len_512: 71.441]
314
+ [2025-10-26 02:20:17][train:194][INFO] Running validation...
315
+ [2025-10-26 02:21:48][logger:171][INFO] [step: 1887436800] [val/train_token_count: 1887436800] [val/train_batch_count: 900] [val/train_flop_count: 0] [val/train_total_time: 10212.780] [val/train_update_time: 6214.660] [val/loss: 4.186] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.996] [val/val_tokens_per_second: 450130.993] [val/loss_avg_len_2048: 4.186] [val/perplexity_len_2048: 65.792] [val/loss_avg_len_1024: 4.218] [val/perplexity_len_1024: 67.865] [val/loss_avg_len_512: 4.273] [val/perplexity_len_512: 71.752]
316
+ [2025-10-26 02:21:48][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001887436800.pt...
317
+ [2025-10-26 02:21:48][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001887436800.pt.
318
+ [2025-10-26 02:21:48][logger:171][INFO] [step: 1887436800] [checkpoint/checkpoint_time: 0.417]
319
+ [2025-10-26 02:22:57][utils:57][INFO] [P: 91.00%] [S: 1908408320/2097152000] [T: 2:52:53] [ETA: 0:17:05] [loss: 4.178] [tokens/s: 182349.471] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
320
+ [2025-10-26 02:24:07][utils:57][INFO] [P: 92.00%] [S: 1929379840/2097152000] [T: 2:54:02] [ETA: 0:15:08] [loss: 4.206] [tokens/s: 198067.474] [batches/s: 0.094] [MFU: 0.000] [TFLOPS: 0.000]
321
+ [2025-10-26 02:24:07][train:194][INFO] Running validation...
322
+ [2025-10-26 02:25:37][logger:171][INFO] [step: 1929379840] [val/train_token_count: 1929379840] [val/train_batch_count: 920] [val/train_flop_count: 0] [val/train_total_time: 10442.504] [val/train_update_time: 6352.693] [val/loss: 4.184] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.351] [val/val_tokens_per_second: 453340.782] [val/loss_avg_len_2048: 4.184] [val/perplexity_len_2048: 65.651] [val/loss_avg_len_1024: 4.216] [val/perplexity_len_1024: 67.731] [val/loss_avg_len_512: 4.272] [val/perplexity_len_512: 71.633]
323
+ [2025-10-26 02:26:46][utils:57][INFO] [P: 93.00%] [S: 1950351360/2097152000] [T: 2:56:42] [ETA: 0:13:18] [loss: 4.191] [tokens/s: 182351.845] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
324
+ [2025-10-26 02:27:55][utils:57][INFO] [P: 94.00%] [S: 1971322880/2097152000] [T: 2:57:51] [ETA: 0:11:21] [loss: 4.148] [tokens/s: 198138.717] [batches/s: 0.094] [MFU: 0.000] [TFLOPS: 0.000]
325
+ [2025-10-26 02:27:55][train:194][INFO] Running validation...
326
+ [2025-10-26 02:29:26][logger:171][INFO] [step: 1971322880] [val/train_token_count: 1971322880] [val/train_batch_count: 940] [val/train_flop_count: 0] [val/train_total_time: 10671.172] [val/train_update_time: 6490.743] [val/loss: 4.183] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.350] [val/val_tokens_per_second: 453345.845] [val/loss_avg_len_2048: 4.183] [val/perplexity_len_2048: 65.564] [val/loss_avg_len_1024: 4.214] [val/perplexity_len_1024: 67.646] [val/loss_avg_len_512: 4.270] [val/perplexity_len_512: 71.551]
327
+ [2025-10-26 02:30:35][utils:57][INFO] [P: 95.00%] [S: 1992294400/2097152000] [T: 3:00:30] [ETA: 0:09:30] [loss: 4.170] [tokens/s: 182410.554] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
328
+ [2025-10-26 02:30:35][logger:171][INFO] [step: 1992294400] [train_eval/train_token_count: 1992294400] [train_eval/train_batch_count: 950] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 10830.689] [train_eval/train_update_time: 6559.780] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.176] [train_eval/perplexity_len_2048: 65.117] [train_eval/loss_avg_len_1024: 4.206] [train_eval/perplexity_len_1024: 67.086] [train_eval/loss_avg_len_512: 4.259] [train_eval/perplexity_len_512: 70.759]
329
+ [2025-10-26 02:31:44][utils:57][INFO] [P: 96.00%] [S: 2013265920/2097152000] [T: 3:01:39] [ETA: 0:07:34] [loss: 4.169] [tokens/s: 198210.554] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
330
+ [2025-10-26 02:31:44][train:194][INFO] Running validation...
331
+ [2025-10-26 02:33:14][logger:171][INFO] [step: 2013265920] [val/train_token_count: 2013265920] [val/train_batch_count: 960] [val/train_flop_count: 0] [val/train_total_time: 10899.878] [val/train_update_time: 6628.827] [val/loss: 4.182] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.304] [val/val_tokens_per_second: 453579.534] [val/loss_avg_len_2048: 4.182] [val/perplexity_len_2048: 65.512] [val/loss_avg_len_1024: 4.214] [val/perplexity_len_1024: 67.593] [val/loss_avg_len_512: 4.270] [val/perplexity_len_512: 71.498]
332
+ [2025-10-26 02:34:23][utils:57][INFO] [P: 97.00%] [S: 2034237440/2097152000] [T: 3:04:19] [ETA: 0:05:42] [loss: 4.193] [tokens/s: 182477.451] [batches/s: 0.087] [MFU: 0.000] [TFLOPS: 0.000]
333
+ [2025-10-26 02:35:33][utils:57][INFO] [P: 98.00%] [S: 2055208960/2097152000] [T: 3:05:28] [ETA: 0:03:47] [loss: 4.168] [tokens/s: 198277.075] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000]
334
+ [2025-10-26 02:35:33][train:194][INFO] Running validation...
335
+ [2025-10-26 02:37:03][logger:171][INFO] [step: 2055208960] [val/train_token_count: 2055208960] [val/train_batch_count: 980] [val/train_flop_count: 0] [val/train_total_time: 11128.508] [val/train_update_time: 6766.881] [val/loss: 4.182] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.112] [val/val_tokens_per_second: 454544.122] [val/loss_avg_len_2048: 4.182] [val/perplexity_len_2048: 65.491] [val/loss_avg_len_1024: 4.213] [val/perplexity_len_1024: 67.574] [val/loss_avg_len_512: 4.269] [val/perplexity_len_512: 71.481]
336
+ [2025-10-26 02:37:03][train:854][INFO] Training finished with 2055208960 tokens!
metrics/jsonlines/checkpoint.jsonl CHANGED
@@ -1,9 +1,9 @@
1
- {"step": 209715200, "checkpoint/checkpoint_time": 0.4660885689663701}
2
- {"step": 419430400, "checkpoint/checkpoint_time": 0.4466627230285667}
3
- {"step": 629145600, "checkpoint/checkpoint_time": 0.4551908180001192}
4
- {"step": 838860800, "checkpoint/checkpoint_time": 0.4531451639486477}
5
- {"step": 1048576000, "checkpoint/checkpoint_time": 0.43850367702543736}
6
- {"step": 1258291200, "checkpoint/checkpoint_time": 0.4580780279939063}
7
- {"step": 1468006400, "checkpoint/checkpoint_time": 0.4501485239597969}
8
- {"step": 1677721600, "checkpoint/checkpoint_time": 0.4503479599952698}
9
- {"step": 1887436800, "checkpoint/checkpoint_time": 0.4574753579800017}
 
1
+ {"step": 209715200, "checkpoint/checkpoint_time": 0.42509283899562433}
2
+ {"step": 419430400, "checkpoint/checkpoint_time": 0.4211389650008641}
3
+ {"step": 629145600, "checkpoint/checkpoint_time": 0.41736824897816405}
4
+ {"step": 838860800, "checkpoint/checkpoint_time": 0.42778749897843227}
5
+ {"step": 1048576000, "checkpoint/checkpoint_time": 0.41318527003750205}
6
+ {"step": 1258291200, "checkpoint/checkpoint_time": 0.42481468996265903}
7
+ {"step": 1468006400, "checkpoint/checkpoint_time": 0.4177210059715435}
8
+ {"step": 1677721600, "checkpoint/checkpoint_time": 0.4209941530134529}
9
+ {"step": 1887436800, "checkpoint/checkpoint_time": 0.41716638999059796}
metrics/jsonlines/throughput.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
metrics/jsonlines/train.jsonl CHANGED
@@ -1,98 +1,98 @@
1
- {"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 72.92804989696015, "train/update_time": 72.72728051099693, "train/lr": 0.0009000000000000001, "train/loss": 9.761818885803223, "train/global_grad_norm": 1.2346482276916504}
2
- {"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 142.06357847998152, "train/update_time": 141.72754302585963, "train/lr": 0.0009997960964140947, "train/loss": 8.126625061035156, "train/global_grad_norm": 0.962837278842926}
3
- {"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 302.1006362909684, "train/update_time": 210.70588018780109, "train/lr": 0.0009990914580222257, "train/loss": 7.519778728485107, "train/global_grad_norm": 0.5695855021476746}
4
- {"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 371.19216373196105, "train/update_time": 279.66876631672494, "train/lr": 0.0009978842768382998, "train/loss": 7.193304061889648, "train/global_grad_norm": 0.4217643439769745}
5
- {"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 530.599405142013, "train/update_time": 348.61063645477407, "train/lr": 0.0009961757683914405, "train/loss": 6.9472150802612305, "train/global_grad_norm": 0.26760002970695496}
6
- {"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 599.6671329609817, "train/update_time": 417.53961712069577, "train/lr": 0.00099396765300483, "train/loss": 6.68041467666626, "train/global_grad_norm": 0.31579363346099854}
7
- {"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 835.2338834310067, "train/update_time": 562.1400479745935, "train/lr": 0.0009912621540634887, "train/loss": 6.480125904083252, "train/global_grad_norm": 0.26012396812438965}
8
- {"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 1008.031190382957, "train/update_time": 734.6574540784932, "train/lr": 0.000988061995775515, "train/loss": 6.281551837921143, "train/global_grad_norm": 0.39679110050201416}
9
- {"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 1376.200831551978, "train/update_time": 864.1369397986564, "train/lr": 0.0009843704004290394, "train/loss": 6.122912406921387, "train/global_grad_norm": 1.23171067237854}
10
- {"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 1549.172318366007, "train/update_time": 1036.8226832836517, "train/lr": 0.0009801910851476522, "train/loss": 5.9722723960876465, "train/global_grad_norm": 0.3574962913990021}
11
- {"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 1931.2548154849792, "train/update_time": 1164.8704603727674, "train/lr": 0.0009755282581475768, "train/loss": 5.849911212921143, "train/global_grad_norm": 0.38126564025878906}
12
- {"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 2070.106143262994, "train/update_time": 1303.5278403796838, "train/lr": 0.0009703866145003512, "train/loss": 5.7178874015808105, "train/global_grad_norm": 0.6952179670333862}
13
- {"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 2470.7233741829987, "train/update_time": 1431.661200570641, "train/lr": 0.0009647713314052896, "train/loss": 5.644232749938965, "train/global_grad_norm": 0.34717857837677}
14
- {"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 2598.984462487977, "train/update_time": 1559.7403547617723, "train/lr": 0.0009586880629764817, "train/loss": 5.570384502410889, "train/global_grad_norm": 0.6765910983085632}
15
- {"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 3022.92114909098, "train/update_time": 1725.9861086776946, "train/lr": 0.0009521429345495787, "train/loss": 5.444611072540283, "train/global_grad_norm": 0.4169935882091522}
16
- {"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 3151.211080375011, "train/update_time": 1854.0981342886225, "train/lr": 0.0009451425365140996, "train/loss": 5.40510368347168, "train/global_grad_norm": 0.709697961807251}
17
- {"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 3565.2874158979976, "train/update_time": 2026.7114865826443, "train/lr": 0.000937693917677468, "train/loss": 5.298379421234131, "train/global_grad_norm": 0.35993462800979614}
18
- {"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 3717.7666637959774, "train/update_time": 2178.911191943742, "train/lr": 0.0009298045781674596, "train/loss": 5.267183303833008, "train/global_grad_norm": 0.45855849981307983}
19
- {"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 4118.412796522956, "train/update_time": 2351.5286739096628, "train/lr": 0.0009214824618802108, "train/loss": 5.240725994110107, "train/global_grad_norm": 0.45877301692962646}
20
- {"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 4291.382857583987, "train/update_time": 2524.2801111027366, "train/lr": 0.000912735948481387, "train/loss": 5.148595809936523, "train/global_grad_norm": 0.5232999920845032}
21
- {"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 4682.385532672983, "train/update_time": 2691.767115161754, "train/lr": 0.0009035738449685707, "train/loss": 5.102267742156982, "train/global_grad_norm": 0.40673965215682983}
22
- {"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 4855.23253793997, "train/update_time": 2864.3428287997376, "train/lr": 0.0008940053768033609, "train/loss": 5.072765827178955, "train/global_grad_norm": 0.540256679058075}
23
- {"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 5229.919883315975, "train/update_time": 3004.8879390547518, "train/lr": 0.0008840401786221159, "train/loss": 5.013406276702881, "train/global_grad_norm": 0.4202441871166229}
24
- {"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 5402.695571646967, "train/update_time": 3177.3816385426326, "train/lr": 0.0008736882845346905, "train/loss": 4.965211868286133, "train/global_grad_norm": 0.5850781798362732}
25
- {"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 5778.858022939006, "train/update_time": 3305.4192761277664, "train/lr": 0.0008629601180209381, "train/loss": 4.961833477020264, "train/global_grad_norm": 0.6340895295143127}
26
- {"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 5930.809683200961, "train/update_time": 3457.191259445739, "train/lr": 0.0008518664814351503, "train/loss": 4.912302017211914, "train/global_grad_norm": 0.5044277310371399}
27
- {"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 6323.337480602961, "train/update_time": 3585.3503955338383, "train/lr": 0.0008404185451290017, "train/loss": 4.897612571716309, "train/global_grad_norm": 0.4688912034034729}
28
- {"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 6451.577037203999, "train/update_time": 3713.4232930167927, "train/lr": 0.0008286278362039527, "train/loss": 4.848834037780762, "train/global_grad_norm": 0.6365319490432739}
29
- {"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 6868.8899707439705, "train/update_time": 3866.6987140927813, "train/lr": 0.0008165062269044352, "train/loss": 4.8169732093811035, "train/global_grad_norm": 0.4134746789932251}
30
- {"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 6997.080480206001, "train/update_time": 3994.723552759795, "train/lr": 0.0008040659226635089, "train/loss": 4.79654598236084, "train/global_grad_norm": 0.5643511414527893}
31
- {"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 7415.715453048004, "train/update_time": 4167.268672269885, "train/lr": 0.0007913194498130252, "train/loss": 4.810868740081787, "train/global_grad_norm": 0.47013285756111145}
32
- {"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 7558.353761503007, "train/update_time": 4309.650610209792, "train/lr": 0.000778279642970672, "train/loss": 4.74250602722168, "train/global_grad_norm": 0.5142323970794678}
33
- {"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 7964.323172722012, "train/update_time": 4482.200250261696, "train/lr": 0.0007649596321166025, "train/loss": 4.759753704071045, "train/global_grad_norm": 0.5028547644615173}
34
- {"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 8133.470018550986, "train/update_time": 4651.074509458733, "train/lr": 0.0007513728293726579, "train/loss": 4.724730491638184, "train/global_grad_norm": 0.5188063383102417}
35
- {"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 8528.677464636974, "train/update_time": 4823.7173610687605, "train/lr": 0.0007375329154974975, "train/loss": 4.704092502593994, "train/global_grad_norm": 0.4179239571094513}
36
- {"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 8701.49334405997, "train/update_time": 4996.244725472818, "train/lr": 0.0007234538261112341, "train/loss": 4.630825042724609, "train/global_grad_norm": 0.4399227201938629}
37
- {"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 9082.585357424978, "train/update_time": 5147.487420658756, "train/lr": 0.0007091497376634464, "train/loss": 4.655548095703125, "train/global_grad_norm": 0.45650508999824524}
38
- {"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 9255.459071734978, "train/update_time": 5320.0803467377555, "train/lr": 0.0006946350531586958, "train/loss": 4.63443660736084, "train/global_grad_norm": 0.4673406481742859}
39
- {"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 9625.762184636958, "train/update_time": 5448.1719141078065, "train/lr": 0.0006799243876539214, "train/loss": 4.639521598815918, "train/global_grad_norm": 0.5377744436264038}
40
- {"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 9790.984413733007, "train/update_time": 5613.200303242775, "train/lr": 0.0006650325535423166, "train/loss": 4.547835826873779, "train/global_grad_norm": 0.5047109127044678}
41
- {"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 10177.449093343981, "train/update_time": 5741.199696058873, "train/lr": 0.0006499745456385053, "train/loss": 4.572357654571533, "train/global_grad_norm": 0.6879011392593384}
42
- {"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 10305.814423859003, "train/update_time": 5869.396386382927, "train/lr": 0.0006347655260800339, "train/loss": 4.565418720245361, "train/global_grad_norm": 0.428315132856369}
43
- {"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 10716.236178891966, "train/update_time": 6009.601114596939, "train/lr": 0.0006194208090603844, "train/loss": 4.560233116149902, "train/global_grad_norm": 0.45447441935539246}
44
- {"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 10844.433852619957, "train/update_time": 6137.63313861005, "train/lr": 0.0006039558454088796, "train/loss": 4.5870771408081055, "train/global_grad_norm": 0.7089611887931824}
45
- {"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 11269.249533146969, "train/update_time": 6310.408450323041, "train/lr": 0.0005883862070330078, "train/loss": 4.5283427238464355, "train/global_grad_norm": 0.4208521842956543}
46
- {"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 11400.883047727984, "train/update_time": 6441.803211065067, "train/lr": 0.0005727275712388317, "train/loss": 4.496908187866211, "train/global_grad_norm": 0.6397818922996521}
47
- {"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 11810.873353753996, "train/update_time": 6614.468229389226, "train/lr": 0.0005569957049452703, "train/loss": 4.518903732299805, "train/global_grad_norm": 0.5339348316192627}
48
- {"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 11971.117434685002, "train/update_time": 6774.459673628269, "train/lr": 0.0005412064488081482, "train/loss": 4.495401382446289, "train/global_grad_norm": 0.47157326340675354}
49
- {"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 12369.724257155962, "train/update_time": 6947.382218524348, "train/lr": 0.0005253757012699972, "train/loss": 4.490736484527588, "train/global_grad_norm": 0.5239655375480652}
50
- {"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 12542.405912315007, "train/update_time": 7119.801959850243, "train/lr": 0.0005095194025516734, "train/loss": 4.4643659591674805, "train/global_grad_norm": 0.5247243642807007}
51
- {"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 12929.475144713011, "train/update_time": 7280.14286618022, "train/lr": 0.0004936535186019053, "train/loss": 4.463287353515625, "train/global_grad_norm": 0.4336317479610443}
52
- {"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 13102.122854663, "train/update_time": 7452.531334599189, "train/lr": 0.00047779402502093696, "train/loss": 4.457107067108154, "train/global_grad_norm": 0.6947441101074219}
53
- {"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 13470.766621035, "train/update_time": 7583.479152722284, "train/lr": 0.0004619568909744525, "train/loss": 4.4143757820129395, "train/global_grad_norm": 0.45258453488349915}
54
- {"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 13643.746936798969, "train/update_time": 7756.201354784251, "train/lr": 0.00044615806311398067, "train/loss": 4.424180030822754, "train/global_grad_norm": 0.4154273271560669}
55
- {"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 14024.221395402972, "train/update_time": 7884.335721797193, "train/lr": 0.0004304134495199673, "train/loss": 4.3700270652771, "train/global_grad_norm": 0.3898273706436157}
56
- {"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 14164.158704472007, "train/update_time": 8024.111574862094, "train/lr": 0.0004147389036836882, "train/loss": 4.413632869720459, "train/global_grad_norm": 0.5425747036933899}
57
- {"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 14561.449876861996, "train/update_time": 8151.957448216155, "train/lr": 0.0003991502085441259, "train/loss": 4.3622026443481445, "train/global_grad_norm": 0.45439326763153076}
58
- {"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 14689.471108104975, "train/update_time": 8279.822117615142, "train/lr": 0.0003836630605958888, "train/loss": 4.410221576690674, "train/global_grad_norm": 0.4280547499656677}
59
- {"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 15112.176922021958, "train/update_time": 8444.86228252214, "train/lr": 0.00036829305408417155, "train/loss": 4.391324520111084, "train/global_grad_norm": 0.42996275424957275}
60
- {"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 15240.21573638398, "train/update_time": 8572.7469220013, "train/lr": 0.000353055665302672, "train/loss": 4.390552997589111, "train/global_grad_norm": 0.6177342534065247}
61
- {"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 15655.03589946701, "train/update_time": 8745.21884978417, "train/lr": 0.0003379662370102746, "train/loss": 4.355296611785889, "train/global_grad_norm": 0.445901095867157}
62
- {"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 15806.320744203986, "train/update_time": 8896.237034060294, "train/lr": 0.00032303996298219405, "train/loss": 4.329927444458008, "train/global_grad_norm": 0.4848615527153015}
63
- {"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 16208.468227757956, "train/update_time": 9068.704648693325, "train/lr": 0.00030829187271113034, "train/loss": 4.3402838706970215, "train/global_grad_norm": 0.42915236949920654}
64
- {"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 16381.60338349198, "train/update_time": 9241.559773176385, "train/lr": 0.0002937368162738445, "train/loss": 4.330328464508057, "train/global_grad_norm": 0.44172123074531555}
65
- {"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 16772.29729532299, "train/update_time": 9409.638164493314, "train/lr": 0.0002793894493783894, "train/loss": 4.3035969734191895, "train/global_grad_norm": 0.4424532651901245}
66
- {"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 16945.237456377014, "train/update_time": 9582.302119482367, "train/lr": 0.00026526421860705474, "train/loss": 4.325634956359863, "train/global_grad_norm": 0.4446793496608734}
67
- {"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 17320.076387455978, "train/update_time": 9723.91070531332, "train/lr": 0.0002513753468698824, "train/loss": 4.269580841064453, "train/global_grad_norm": 0.4529637098312378}
68
- {"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 17493.055093375966, "train/update_time": 9896.617073222296, "train/lr": 0.00023773681908340283, "train/loss": 4.283663749694824, "train/global_grad_norm": 0.445527583360672}
69
- {"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 17867.821773795004, "train/update_time": 10024.58119243232, "train/lr": 0.00022436236808900823, "train/loss": 4.284794807434082, "train/global_grad_norm": 0.37836042046546936}
70
- {"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 18021.479664082, "train/update_time": 10178.065654869366, "train/lr": 0.00021126546082514682, "train/loss": 4.279749870300293, "train/global_grad_norm": 0.3362836241722107}
71
- {"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 18413.219313726993, "train/update_time": 10306.263159629423, "train/lr": 0.00019845928476725522, "train/loss": 4.276471138000488, "train/global_grad_norm": 0.3601376414299011}
72
- {"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 18541.505700192996, "train/update_time": 10434.371385331382, "train/lr": 0.0001859567346490913, "train/loss": 4.2520365715026855, "train/global_grad_norm": 0.3764491081237793}
73
- {"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 18957.16054979898, "train/update_time": 10585.471213102457, "train/lr": 0.00017377039947882782, "train/loss": 4.269729137420654, "train/global_grad_norm": 0.3962520360946655}
74
- {"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 19085.297822847962, "train/update_time": 10713.447329278395, "train/lr": 0.00016191254986299043, "train/loss": 4.254550933837891, "train/global_grad_norm": 0.357697457075119}
75
- {"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 19505.552313831984, "train/update_time": 10886.11498639133, "train/lr": 0.00015039512565099468, "train/loss": 4.237186431884766, "train/global_grad_norm": 0.34904253482818604}
76
- {"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 19645.975325819978, "train/update_time": 11026.27975769632, "train/lr": 0.00013922972391273224, "train/loss": 4.198566436767578, "train/global_grad_norm": 0.3618724048137665}
77
- {"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 20051.277047046984, "train/update_time": 11198.93009360932, "train/lr": 0.00012842758726130281, "train/loss": 4.263113975524902, "train/global_grad_norm": 0.3145442306995392}
78
- {"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 20219.25819616299, "train/update_time": 11366.63659675623, "train/lr": 0.00011799959253265679, "train/loss": 4.1848530769348145, "train/global_grad_norm": 0.3598962128162384}
79
- {"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 20614.83181869099, "train/update_time": 11539.3036280073, "train/lr": 0.00010795623983354214, "train/loss": 4.2140374183654785, "train/global_grad_norm": 0.3123509883880615}
80
- {"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 20787.76660131797, "train/update_time": 11711.969741588284, "train/lr": 9.830764196878872e-05, "train/loss": 4.1917405128479, "train/global_grad_norm": 0.31881648302078247}
81
- {"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 21170.047469891957, "train/update_time": 11864.524636753194, "train/lr": 8.906351425856951e-05, "train/loss": 4.167685508728027, "train/global_grad_norm": 0.29552316665649414}
82
- {"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 21342.972027021984, "train/update_time": 12037.18170108716, "train/lr": 8.02331647558977e-05, "train/loss": 4.179322242736816, "train/global_grad_norm": 0.281093567609787}
83
- {"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 21712.53867700696, "train/update_time": 12165.285238003125, "train/lr": 7.182548487420554e-05, "train/loss": 4.211834907531738, "train/global_grad_norm": 0.29659828543663025}
84
- {"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 21879.736517031968, "train/update_time": 12332.311124075088, "train/lr": 6.384894043444556e-05, "train/loss": 4.1608757972717285, "train/global_grad_norm": 0.29815351963043213}
85
- {"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 22264.311315331957, "train/update_time": 12460.530496544088, "train/lr": 5.6311563140726166e-05, "train/loss": 4.230018138885498, "train/global_grad_norm": 0.2653578221797943}
86
- {"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 22393.132838854974, "train/update_time": 12589.186431614158, "train/lr": 4.922094249306547e-05, "train/loss": 4.209297180175781, "train/global_grad_norm": 0.2605638802051544}
87
- {"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 22800.889922398957, "train/update_time": 12726.980374863197, "train/lr": 4.2584218145409916e-05, "train/loss": 4.1548752784729, "train/global_grad_norm": 0.2570478022098541}
88
- {"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 22929.256941216998, "train/update_time": 12855.194684728282, "train/lr": 3.6408072716606236e-05, "train/loss": 4.172904968261719, "train/global_grad_norm": 0.2740459740161896}
89
- {"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 23353.442129588977, "train/update_time": 13028.02226472419, "train/lr": 3.069872506157217e-05, "train/loss": 4.228043079376221, "train/global_grad_norm": 0.25757673382759094}
90
- {"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 23483.878039386997, "train/update_time": 13158.19501615403, "train/lr": 2.5461924009435368e-05, "train/loss": 4.143199920654297, "train/global_grad_norm": 0.2552241086959839}
91
- {"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 23894.50188044901, "train/update_time": 13330.955791636079, "train/lr": 2.0702942574950812e-05, "train/loss": 4.177771091461182, "train/global_grad_norm": 0.24890665709972382}
92
- {"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 24053.666120332957, "train/update_time": 13489.859810477996, "train/lr": 1.642657264902142e-05, "train/loss": 4.206305027008057, "train/global_grad_norm": 0.23305842280387878}
93
- {"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 24452.978586556972, "train/update_time": 13662.515431846085, "train/lr": 1.2637120173670358e-05, "train/loss": 4.190739154815674, "train/global_grad_norm": 0.22044338285923004}
94
- {"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 24626.054740517982, "train/update_time": 13835.318780667149, "train/lr": 9.338400806321978e-06, "train/loss": 4.147926330566406, "train/global_grad_norm": 0.22512836754322052}
95
- {"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 25012.61971083196, "train/update_time": 13996.215564900136, "train/lr": 6.533736077758867e-06, "train/loss": 4.170260429382324, "train/global_grad_norm": 0.22401364147663116}
96
- {"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 25185.487074052973, "train/update_time": 14168.81472938013, "train/lr": 4.2259500476214406e-06, "train/loss": 4.168946266174316, "train/global_grad_norm": 0.215094193816185}
97
- {"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 25555.08116199699, "train/update_time": 14301.457320146146, "train/lr": 2.417366460819359e-06, "train/loss": 4.192867755889893, "train/global_grad_norm": 0.21194864809513092}
98
- {"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 25728.225285313965, "train/update_time": 14474.342566145177, "train/lr": 1.1098064077174619e-06, "train/loss": 4.168134689331055, "train/global_grad_norm": 0.20849043130874634}
 
1
+ {"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 73.10528613603674, "train/update_time": 72.90000317717204, "train/lr": 0.0009000000000000001, "train/loss": 9.761818885803223, "train/global_grad_norm": 1.2346482276916504}
2
+ {"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 142.2743778140284, "train/update_time": 141.94980711926473, "train/lr": 0.0009997960964140947, "train/loss": 8.126625061035156, "train/global_grad_norm": 0.962837278842926}
3
+ {"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 301.7193861359847, "train/update_time": 210.97775525326142, "train/lr": 0.0009990914580222257, "train/loss": 7.519778728485107, "train/global_grad_norm": 0.5695855021476746}
4
+ {"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 370.84139602299547, "train/update_time": 279.98310620122356, "train/lr": 0.0009978842768382998, "train/loss": 7.193304061889648, "train/global_grad_norm": 0.4217643439769745}
5
+ {"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 530.2946584849851, "train/update_time": 348.9749472962576, "train/lr": 0.0009961757683914405, "train/loss": 6.9472150802612305, "train/global_grad_norm": 0.26760002970695496}
6
+ {"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 599.4013552149991, "train/update_time": 417.94741392938886, "train/lr": 0.00099396765300483, "train/loss": 6.68041467666626, "train/global_grad_norm": 0.31579363346099854}
7
+ {"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 758.8880543989944, "train/update_time": 486.9278290383518, "train/lr": 0.0009912621540634887, "train/loss": 6.480125904083252, "train/global_grad_norm": 0.26012396812438965}
8
+ {"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 827.9917874370003, "train/update_time": 555.9067850944703, "train/lr": 0.000988061995775515, "train/loss": 6.281551837921143, "train/global_grad_norm": 0.39679110050201416}
9
+ {"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 987.436578762019, "train/update_time": 624.905722066469, "train/lr": 0.0009843704004290394, "train/loss": 6.122912406921387, "train/global_grad_norm": 1.23171067237854}
10
+ {"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 1056.5339453950291, "train/update_time": 693.8810286904918, "train/lr": 0.0009801910851476522, "train/loss": 5.9722723960876465, "train/global_grad_norm": 0.3574962913990021}
11
+ {"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 1216.3542258110247, "train/update_time": 762.8519944375148, "train/lr": 0.0009755282581475768, "train/loss": 5.849911212921143, "train/global_grad_norm": 0.38126564025878906}
12
+ {"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 1285.4777986719855, "train/update_time": 831.8405811304692, "train/lr": 0.0009703866145003512, "train/loss": 5.7178874015808105, "train/global_grad_norm": 0.6952179670333862}
13
+ {"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 1444.8941518919892, "train/update_time": 900.8191652273526, "train/lr": 0.0009647713314052896, "train/loss": 5.644232749938965, "train/global_grad_norm": 0.34717857837677}
14
+ {"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 1514.0213319549803, "train/update_time": 969.8139603384188, "train/lr": 0.0009586880629764817, "train/loss": 5.570384502410889, "train/global_grad_norm": 0.6765910983085632}
15
+ {"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 1673.719632464985, "train/update_time": 1038.808761139284, "train/lr": 0.0009521429345495787, "train/loss": 5.444611072540283, "train/global_grad_norm": 0.4169935882091522}
16
+ {"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 1742.8283430220326, "train/update_time": 1107.798082921363, "train/lr": 0.0009451425365140996, "train/loss": 5.40510368347168, "train/global_grad_norm": 0.709697961807251}
17
+ {"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 1902.3611605030019, "train/update_time": 1176.7872661272995, "train/lr": 0.000937693917677468, "train/loss": 5.298379421234131, "train/global_grad_norm": 0.35993462800979614}
18
+ {"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 1971.4918935780297, "train/update_time": 1245.7879514921806, "train/lr": 0.0009298045781674596, "train/loss": 5.267183303833008, "train/global_grad_norm": 0.45855849981307983}
19
+ {"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 2130.959315415006, "train/update_time": 1314.7911676210933, "train/lr": 0.0009214824618802108, "train/loss": 5.240725994110107, "train/global_grad_norm": 0.45877301692962646}
20
+ {"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 2200.08279568702, "train/update_time": 1383.788816100161, "train/lr": 0.000912735948481387, "train/loss": 5.148595809936523, "train/global_grad_norm": 0.5232999920845032}
21
+ {"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 2360.0452178320265, "train/update_time": 1452.785164519155, "train/lr": 0.0009035738449685707, "train/loss": 5.102267742156982, "train/global_grad_norm": 0.40673965215682983}
22
+ {"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 2429.163681267004, "train/update_time": 1521.7770175782498, "train/lr": 0.0008940053768033609, "train/loss": 5.072765827178955, "train/global_grad_norm": 0.540256679058075}
23
+ {"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 2589.090652415005, "train/update_time": 1590.7705863612937, "train/lr": 0.0008840401786221159, "train/loss": 5.013406276702881, "train/global_grad_norm": 0.4202441871166229}
24
+ {"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 2658.232374906016, "train/update_time": 1659.7850940762437, "train/lr": 0.0008736882845346905, "train/loss": 4.965211868286133, "train/global_grad_norm": 0.5850781798362732}
25
+ {"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 2818.422068757005, "train/update_time": 1728.7950774162891, "train/lr": 0.0008629601180209381, "train/loss": 4.961833477020264, "train/global_grad_norm": 0.6340895295143127}
26
+ {"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 2887.563738073979, "train/update_time": 1797.8180341873667, "train/lr": 0.0008518664814351503, "train/loss": 4.912302017211914, "train/global_grad_norm": 0.5044277310371399}
27
+ {"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 3047.4022633209825, "train/update_time": 1866.8200742353802, "train/lr": 0.0008404185451290017, "train/loss": 4.897612571716309, "train/global_grad_norm": 0.4688912034034729}
28
+ {"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 3116.5153146539815, "train/update_time": 1935.8127364134416, "train/lr": 0.0008286278362039527, "train/loss": 4.848834037780762, "train/global_grad_norm": 0.6365319490432739}
29
+ {"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 3276.0374568690313, "train/update_time": 2004.8135134153417, "train/lr": 0.0008165062269044352, "train/loss": 4.8169732093811035, "train/global_grad_norm": 0.4134746789932251}
30
+ {"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 3345.161585650989, "train/update_time": 2073.822410382272, "train/lr": 0.0008040659226635089, "train/loss": 4.79654598236084, "train/global_grad_norm": 0.5643511414527893}
31
+ {"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 3504.929979883018, "train/update_time": 2142.822337330319, "train/lr": 0.0007913194498130252, "train/loss": 4.810868740081787, "train/global_grad_norm": 0.47013285756111145}
32
+ {"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 3574.040140778001, "train/update_time": 2211.8163213434746, "train/lr": 0.000778279642970672, "train/loss": 4.74250602722168, "train/global_grad_norm": 0.5142323970794678}
33
+ {"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 3733.7000970160007, "train/update_time": 2280.8237289965036, "train/lr": 0.0007649596321166025, "train/loss": 4.759753704071045, "train/global_grad_norm": 0.5028547644615173}
34
+ {"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 3802.836212512979, "train/update_time": 2349.8270930235158, "train/lr": 0.0007513728293726579, "train/loss": 4.724730491638184, "train/global_grad_norm": 0.5188063383102417}
35
+ {"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 3962.514591771993, "train/update_time": 2418.8274328135885, "train/lr": 0.0007375329154974975, "train/loss": 4.704092502593994, "train/global_grad_norm": 0.4179239571094513}
36
+ {"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 4031.6405439740047, "train/update_time": 2487.827474080492, "train/lr": 0.0007234538261112341, "train/loss": 4.630825042724609, "train/global_grad_norm": 0.4399227201938629}
37
+ {"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 4191.252879804, "train/update_time": 2556.8249263644684, "train/lr": 0.0007091497376634464, "train/loss": 4.655548095703125, "train/global_grad_norm": 0.45650508999824524}
38
+ {"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 4260.37410283502, "train/update_time": 2625.818530491437, "train/lr": 0.0006946350531586958, "train/loss": 4.63443660736084, "train/global_grad_norm": 0.4673406481742859}
39
+ {"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 4420.292360802006, "train/update_time": 2694.823069378559, "train/lr": 0.0006799243876539214, "train/loss": 4.639521598815918, "train/global_grad_norm": 0.5377744436264038}
40
+ {"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 4489.443470049009, "train/update_time": 2763.8374379616, "train/lr": 0.0006650325535423166, "train/loss": 4.547835826873779, "train/global_grad_norm": 0.5047109127044678}
41
+ {"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 4649.7579026630265, "train/update_time": 2832.8449664485524, "train/lr": 0.0006499745456385053, "train/loss": 4.572357654571533, "train/global_grad_norm": 0.6879011392593384}
42
+ {"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 4718.872143707995, "train/update_time": 2901.8397621414624, "train/lr": 0.0006347655260800339, "train/loss": 4.565418720245361, "train/global_grad_norm": 0.428315132856369}
43
+ {"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 4878.879235523986, "train/update_time": 2970.8549301693565, "train/lr": 0.0006194208090603844, "train/loss": 4.560233116149902, "train/global_grad_norm": 0.45447441935539246}
44
+ {"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 4947.998132903012, "train/update_time": 3039.859961154347, "train/lr": 0.0006039558454088796, "train/loss": 4.5870771408081055, "train/global_grad_norm": 0.7089611887931824}
45
+ {"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 5107.660124632996, "train/update_time": 3108.876222961524, "train/lr": 0.0005883862070330078, "train/loss": 4.5283427238464355, "train/global_grad_norm": 0.4208521842956543}
46
+ {"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 5176.88071361999, "train/update_time": 3177.899590641551, "train/lr": 0.0005727275712388317, "train/loss": 4.496908187866211, "train/global_grad_norm": 0.6397818922996521}
47
+ {"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 5336.778706843033, "train/update_time": 3246.908200990525, "train/lr": 0.0005569957049452703, "train/loss": 4.518903732299805, "train/global_grad_norm": 0.5339348316192627}
48
+ {"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 5405.913057841011, "train/update_time": 3315.925435980549, "train/lr": 0.0005412064488081482, "train/loss": 4.495401382446289, "train/global_grad_norm": 0.47157326340675354}
49
+ {"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 5565.781587979989, "train/update_time": 3384.9316106255865, "train/lr": 0.0005253757012699972, "train/loss": 4.490736484527588, "train/global_grad_norm": 0.5239655375480652}
50
+ {"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 5634.922143466014, "train/update_time": 3453.9653959476273, "train/lr": 0.0005095194025516734, "train/loss": 4.4643659591674805, "train/global_grad_norm": 0.5247243642807007}
51
+ {"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 5794.91904747003, "train/update_time": 3522.983512793493, "train/lr": 0.0004936535186019053, "train/loss": 4.463287353515625, "train/global_grad_norm": 0.4336317479610443}
52
+ {"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 5864.05088400899, "train/update_time": 3591.9979424396297, "train/lr": 0.00047779402502093696, "train/loss": 4.457107067108154, "train/global_grad_norm": 0.6947441101074219}
53
+ {"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 6023.732085202006, "train/update_time": 3661.0160827066866, "train/lr": 0.0004619568909744525, "train/loss": 4.4143757820129395, "train/global_grad_norm": 0.45258453488349915}
54
+ {"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 6092.851395011006, "train/update_time": 3730.020544492756, "train/lr": 0.00044615806311398067, "train/loss": 4.424180030822754, "train/global_grad_norm": 0.4154273271560669}
55
+ {"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 6252.330847234989, "train/update_time": 3799.045364828722, "train/lr": 0.0004304134495199673, "train/loss": 4.3700270652771, "train/global_grad_norm": 0.3898273706436157}
56
+ {"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 6321.474026971031, "train/update_time": 3868.0566598027945, "train/lr": 0.0004147389036836882, "train/loss": 4.413632869720459, "train/global_grad_norm": 0.5425747036933899}
57
+ {"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 6480.964678196993, "train/update_time": 3937.06809708284, "train/lr": 0.0003991502085441259, "train/loss": 4.3622026443481445, "train/global_grad_norm": 0.45439326763153076}
58
+ {"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 6550.097681444022, "train/update_time": 4006.072604118788, "train/lr": 0.0003836630605958888, "train/loss": 4.410221576690674, "train/global_grad_norm": 0.4280547499656677}
59
+ {"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 6709.588647256023, "train/update_time": 4075.0957091488526, "train/lr": 0.00036829305408417155, "train/loss": 4.391324520111084, "train/global_grad_norm": 0.42996275424957275}
60
+ {"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 6778.730279813986, "train/update_time": 4144.1211769738, "train/lr": 0.000353055665302672, "train/loss": 4.390552997589111, "train/global_grad_norm": 0.6177342534065247}
61
+ {"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 6938.661827062024, "train/update_time": 4213.1262717307545, "train/lr": 0.0003379662370102746, "train/loss": 4.355296611785889, "train/global_grad_norm": 0.445901095867157}
62
+ {"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 7007.788445023994, "train/update_time": 4282.135617908789, "train/lr": 0.00032303996298219405, "train/loss": 4.329927444458008, "train/global_grad_norm": 0.4848615527153015}
63
+ {"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 7167.26421629102, "train/update_time": 4351.144582907902, "train/lr": 0.00030829187271113034, "train/loss": 4.3402838706970215, "train/global_grad_norm": 0.42915236949920654}
64
+ {"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 7236.399417778011, "train/update_time": 4420.153171113925, "train/lr": 0.0002937368162738445, "train/loss": 4.330328464508057, "train/global_grad_norm": 0.44172123074531555}
65
+ {"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 7395.902606271033, "train/update_time": 4489.238088100974, "train/lr": 0.0002793894493783894, "train/loss": 4.3035969734191895, "train/global_grad_norm": 0.4424532651901245}
66
+ {"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 7465.047058130032, "train/update_time": 4558.262438459904, "train/lr": 0.00026526421860705474, "train/loss": 4.325634956359863, "train/global_grad_norm": 0.4446793496608734}
67
+ {"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 7624.619702999014, "train/update_time": 4627.277077015955, "train/lr": 0.0002513753468698824, "train/loss": 4.269580841064453, "train/global_grad_norm": 0.4529637098312378}
68
+ {"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 7693.75585325103, "train/update_time": 4696.292382065032, "train/lr": 0.00023773681908340283, "train/loss": 4.283663749694824, "train/global_grad_norm": 0.445527583360672}
69
+ {"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 7853.381023978989, "train/update_time": 4765.2889736949, "train/lr": 0.00022436236808900823, "train/loss": 4.284794807434082, "train/global_grad_norm": 0.37836042046546936}
70
+ {"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 7922.504303377005, "train/update_time": 4834.2961148990435, "train/lr": 0.00021126546082514682, "train/loss": 4.279749870300293, "train/global_grad_norm": 0.3362836241722107}
71
+ {"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 8082.5694638509885, "train/update_time": 4903.308512775111, "train/lr": 0.00019845928476725522, "train/loss": 4.276471138000488, "train/global_grad_norm": 0.3601376414299011}
72
+ {"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 8151.724266513018, "train/update_time": 4972.329616118164, "train/lr": 0.0001859567346490913, "train/loss": 4.2520365715026855, "train/global_grad_norm": 0.3764491081237793}
73
+ {"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 8311.420260019018, "train/update_time": 5041.351476673037, "train/lr": 0.00017377039947882782, "train/loss": 4.269729137420654, "train/global_grad_norm": 0.3962520360946655}
74
+ {"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 8380.818926771986, "train/update_time": 5110.374592272972, "train/lr": 0.00016191254986299043, "train/loss": 4.254550933837891, "train/global_grad_norm": 0.357697457075119}
75
+ {"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 8540.947155133996, "train/update_time": 5179.397799044964, "train/lr": 0.00015039512565099468, "train/loss": 4.237186431884766, "train/global_grad_norm": 0.34904253482818604}
76
+ {"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 8610.333553528006, "train/update_time": 5248.407137244998, "train/lr": 0.00013922972391273224, "train/loss": 4.198566436767578, "train/global_grad_norm": 0.3618724048137665}
77
+ {"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 8769.990720756003, "train/update_time": 5317.427211401926, "train/lr": 0.00012842758726130281, "train/loss": 4.263113975524902, "train/global_grad_norm": 0.3145442306995392}
78
+ {"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 8839.131734684983, "train/update_time": 5386.445536848798, "train/lr": 0.00011799959253265679, "train/loss": 4.1848530769348145, "train/global_grad_norm": 0.3598962128162384}
79
+ {"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 8998.676407739986, "train/update_time": 5455.463913362706, "train/lr": 0.00010795623983354214, "train/loss": 4.2140374183654785, "train/global_grad_norm": 0.3123509883880615}
80
+ {"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 9067.837796505017, "train/update_time": 5524.4823192786425, "train/lr": 9.830764196878872e-05, "train/loss": 4.1917405128479, "train/global_grad_norm": 0.31881648302078247}
81
+ {"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 9227.859726689, "train/update_time": 5593.497986814589, "train/lr": 8.906351425856951e-05, "train/loss": 4.167685508728027, "train/global_grad_norm": 0.29552316665649414}
82
+ {"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 9296.993160478014, "train/update_time": 5662.513914024632, "train/lr": 8.02331647558977e-05, "train/loss": 4.179322242736816, "train/global_grad_norm": 0.281093567609787}
83
+ {"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 9456.529325363983, "train/update_time": 5731.5381157496595, "train/lr": 7.182548487420554e-05, "train/loss": 4.211834907531738, "train/global_grad_norm": 0.29659828543663025}
84
+ {"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 9525.670015413023, "train/update_time": 5800.553074025724, "train/lr": 6.384894043444556e-05, "train/loss": 4.1608757972717285, "train/global_grad_norm": 0.29815351963043213}
85
+ {"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 9685.574564223003, "train/update_time": 5869.583213592763, "train/lr": 5.6311563140726166e-05, "train/loss": 4.230018138885498, "train/global_grad_norm": 0.2653578221797943}
86
+ {"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 9754.735574804014, "train/update_time": 5938.594816011784, "train/lr": 4.922094249306547e-05, "train/loss": 4.209297180175781, "train/global_grad_norm": 0.2605638802051544}
87
+ {"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 9914.648321971006, "train/update_time": 6007.609873749723, "train/lr": 4.2584218145409916e-05, "train/loss": 4.1548752784729, "train/global_grad_norm": 0.2570478022098541}
88
+ {"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 9983.788254795014, "train/update_time": 6076.624669670709, "train/lr": 3.6408072716606236e-05, "train/loss": 4.172904968261719, "train/global_grad_norm": 0.2740459740161896}
89
+ {"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 10143.620604291034, "train/update_time": 6145.640486821707, "train/lr": 3.069872506157217e-05, "train/loss": 4.228043079376221, "train/global_grad_norm": 0.25757673382759094}
90
+ {"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 10212.780399380019, "train/update_time": 6214.659892122727, "train/lr": 2.5461924009435368e-05, "train/loss": 4.143199920654297, "train/global_grad_norm": 0.2552241086959839}
91
+ {"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 10373.353445779998, "train/update_time": 6283.676360294805, "train/lr": 2.0702942574950812e-05, "train/loss": 4.177771091461182, "train/global_grad_norm": 0.24890665709972382}
92
+ {"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 10442.503988512035, "train/update_time": 6352.692734938697, "train/lr": 1.642657264902142e-05, "train/loss": 4.206305027008057, "train/global_grad_norm": 0.23305842280387878}
93
+ {"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 10602.015328517009, "train/update_time": 6421.7104710137355, "train/lr": 1.2637120173670358e-05, "train/loss": 4.190739154815674, "train/global_grad_norm": 0.22044338285923004}
94
+ {"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 10671.172117165988, "train/update_time": 6490.743322437804, "train/lr": 9.338400806321978e-06, "train/loss": 4.147926330566406, "train/global_grad_norm": 0.22512836754322052}
95
+ {"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 10830.689001108985, "train/update_time": 6559.779601458809, "train/lr": 6.533736077758867e-06, "train/loss": 4.170260429382324, "train/global_grad_norm": 0.22401364147663116}
96
+ {"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 10899.877774502034, "train/update_time": 6628.827098570764, "train/lr": 4.2259500476214406e-06, "train/loss": 4.168946266174316, "train/global_grad_norm": 0.215094193816185}
97
+ {"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 11059.34338616603, "train/update_time": 6697.84850771277, "train/lr": 2.417366460819359e-06, "train/loss": 4.192867755889893, "train/global_grad_norm": 0.21194864809513092}
98
+ {"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 11128.507790005999, "train/update_time": 6766.880580478697, "train/lr": 1.1098064077174619e-06, "train/loss": 4.168134689331055, "train/global_grad_norm": 0.20849043130874634}
metrics/jsonlines/train_eval.jsonl CHANGED
@@ -1,19 +1,19 @@
1
- {"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 530.599405142013, "train_eval/train_update_time": 348.61063645477407, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.262765104495848, "train_eval/perplexity_len_2048": 3876.7990479882474, "train_eval/loss_avg_len_1024": 8.26361274068222, "train_eval/perplexity_len_1024": 3880.086556257262, "train_eval/loss_avg_len_512": 8.264419558200608, "train_eval/perplexity_len_512": 3883.218341283336}
2
- {"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1549.172318366007, "train_eval/train_update_time": 1036.8226832836517, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.399099997472659, "train_eval/perplexity_len_2048": 601.3036194924304, "train_eval/loss_avg_len_1024": 6.403366397288846, "train_eval/perplexity_len_1024": 603.8745014496265, "train_eval/loss_avg_len_512": 6.409683007578133, "train_eval/perplexity_len_512": 607.7010139099035}
3
- {"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3022.92114909098, "train_eval/train_update_time": 1725.9861086776946, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.693106125889135, "train_eval/perplexity_len_2048": 296.8141323485163, "train_eval/loss_avg_len_1024": 5.698990526291018, "train_eval/perplexity_len_1024": 298.5658544105799, "train_eval/loss_avg_len_512": 5.710699294427177, "train_eval/perplexity_len_512": 302.08223886516663}
4
- {"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4291.382857583987, "train_eval/train_update_time": 2524.2801111027366, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.296922603823786, "train_eval/perplexity_len_2048": 199.7212419010431, "train_eval/loss_avg_len_1024": 5.305337436088958, "train_eval/perplexity_len_1024": 201.40895359804367, "train_eval/loss_avg_len_512": 5.320490509328956, "train_eval/perplexity_len_512": 204.48415878511435}
5
- {"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5778.858022939006, "train_eval/train_update_time": 3305.4192761277664, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.045304426316234, "train_eval/perplexity_len_2048": 155.29156684287375, "train_eval/loss_avg_len_1024": 5.053415232166262, "train_eval/perplexity_len_1024": 156.55622837075202, "train_eval/loss_avg_len_512": 5.070610678311423, "train_eval/perplexity_len_512": 159.27156133906544}
6
- {"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6997.080480206001, "train_eval/train_update_time": 3994.723552759795, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.881278812076035, "train_eval/perplexity_len_2048": 131.79910244606154, "train_eval/loss_avg_len_1024": 4.889370008184379, "train_eval/perplexity_len_1024": 132.86984076618447, "train_eval/loss_avg_len_512": 4.908291251527554, "train_eval/perplexity_len_512": 135.40783867497828}
7
- {"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8528.677464636974, "train_eval/train_update_time": 4823.7173610687605, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.752543167285239, "train_eval/perplexity_len_2048": 115.87860879757943, "train_eval/loss_avg_len_1024": 4.763826194274043, "train_eval/perplexity_len_1024": 117.19347414945925, "train_eval/loss_avg_len_512": 4.785651780011176, "train_eval/perplexity_len_512": 119.77940747075029}
8
- {"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9790.984413733007, "train_eval/train_update_time": 5613.200303242775, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.653148743151705, "train_eval/perplexity_len_2048": 104.91481583709675, "train_eval/loss_avg_len_1024": 4.6641259991965125, "train_eval/perplexity_len_1024": 106.07283695212364, "train_eval/loss_avg_len_512": 4.687856853806879, "train_eval/perplexity_len_512": 108.62014133645553}
9
- {"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 11269.249533146969, "train_eval/train_update_time": 6310.408450323041, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.56949279251452, "train_eval/perplexity_len_2048": 96.49515429403105, "train_eval/loss_avg_len_1024": 4.584133220926888, "train_eval/perplexity_len_1024": 97.91827683495046, "train_eval/loss_avg_len_512": 4.612269650588205, "train_eval/perplexity_len_512": 100.7124725543258}
10
- {"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 12542.405912315007, "train_eval/train_update_time": 7119.801959850243, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.500111393837432, "train_eval/perplexity_len_2048": 90.02715921272548, "train_eval/loss_avg_len_1024": 4.5146006559921075, "train_eval/perplexity_len_1024": 91.34108222421936, "train_eval/loss_avg_len_512": 4.545015140839531, "train_eval/perplexity_len_512": 94.16185288811836}
11
- {"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 14024.221395402972, "train_eval/train_update_time": 7884.335721797193, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.434400985772954, "train_eval/perplexity_len_2048": 84.30161189591196, "train_eval/loss_avg_len_1024": 4.448710203694063, "train_eval/perplexity_len_1024": 85.51657387892722, "train_eval/loss_avg_len_512": 4.479653784418624, "train_eval/perplexity_len_512": 88.20412974467796}
12
- {"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 15240.21573638398, "train_eval/train_update_time": 8572.7469220013, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.3785154093765595, "train_eval/perplexity_len_2048": 79.71959454765936, "train_eval/loss_avg_len_1024": 4.3939165947328, "train_eval/perplexity_len_1024": 80.95687412946403, "train_eval/loss_avg_len_512": 4.429078622167507, "train_eval/perplexity_len_512": 83.85411997858606}
13
- {"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 16772.29729532299, "train_eval/train_update_time": 9409.638164493314, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.332845308472933, "train_eval/perplexity_len_2048": 76.16067919713348, "train_eval/loss_avg_len_1024": 4.354710251906472, "train_eval/perplexity_len_1024": 77.84426684093609, "train_eval/loss_avg_len_512": 4.39427122400477, "train_eval/perplexity_len_512": 80.98558889804535}
14
- {"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 18021.479664082, "train_eval/train_update_time": 10178.065654869366, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.286469663051848, "train_eval/perplexity_len_2048": 72.70932644602816, "train_eval/loss_avg_len_1024": 4.309569447513205, "train_eval/perplexity_len_1024": 74.40844530144317, "train_eval/loss_avg_len_512": 4.354222116721867, "train_eval/perplexity_len_512": 77.80627758807101}
15
- {"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 19505.552313831984, "train_eval/train_update_time": 10886.11498639133, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.249375205130373, "train_eval/perplexity_len_2048": 70.06162452534807, "train_eval/loss_avg_len_1024": 4.273874875287056, "train_eval/perplexity_len_1024": 71.7993106682297, "train_eval/loss_avg_len_512": 4.321713214736082, "train_eval/perplexity_len_512": 75.31755296428902}
16
- {"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 20787.76660131797, "train_eval/train_update_time": 11711.969741588284, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.2222635463871026, "train_eval/perplexity_len_2048": 68.18765565818079, "train_eval/loss_avg_len_1024": 4.249677161750205, "train_eval/perplexity_len_1024": 70.08278329102363, "train_eval/loss_avg_len_512": 4.299936973010299, "train_eval/perplexity_len_512": 73.69514876983682}
17
- {"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 22264.311315331957, "train_eval/train_update_time": 12460.530496544088, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.195244031412512, "train_eval/perplexity_len_2048": 66.36992594802435, "train_eval/loss_avg_len_1024": 4.218284104051746, "train_eval/perplexity_len_1024": 67.9168460075776, "train_eval/loss_avg_len_512": 4.2707039155407625, "train_eval/perplexity_len_512": 71.57199853357183}
18
- {"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 23483.878039386997, "train_eval/train_update_time": 13158.19501615403, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.188622776587208, "train_eval/perplexity_len_2048": 65.93192541236388, "train_eval/loss_avg_len_1024": 4.215066284977402, "train_eval/perplexity_len_1024": 67.69865312590402, "train_eval/loss_avg_len_512": 4.268878927308142, "train_eval/perplexity_len_512": 71.4414995941971}
19
- {"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 25012.61971083196, "train_eval/train_update_time": 13996.215564900136, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.176184563983006, "train_eval/perplexity_len_2048": 65.11692916224894, "train_eval/loss_avg_len_1024": 4.205971465967996, "train_eval/perplexity_len_1024": 67.08573753155041, "train_eval/loss_avg_len_512": 4.259276238732308, "train_eval/perplexity_len_512": 70.75875247262296}
 
1
+ {"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 530.2946584849851, "train_eval/train_update_time": 348.9749472962576, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.262765104495848, "train_eval/perplexity_len_2048": 3876.7990479882474, "train_eval/loss_avg_len_1024": 8.26361274068222, "train_eval/perplexity_len_1024": 3880.086556257262, "train_eval/loss_avg_len_512": 8.264419558200608, "train_eval/perplexity_len_512": 3883.218341283336}
2
+ {"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1056.5339453950291, "train_eval/train_update_time": 693.8810286904918, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.399099997472659, "train_eval/perplexity_len_2048": 601.3036194924304, "train_eval/loss_avg_len_1024": 6.403366397288846, "train_eval/perplexity_len_1024": 603.8745014496265, "train_eval/loss_avg_len_512": 6.409683007578133, "train_eval/perplexity_len_512": 607.7010139099035}
3
+ {"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1673.719632464985, "train_eval/train_update_time": 1038.808761139284, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.693106125889135, "train_eval/perplexity_len_2048": 296.8141323485163, "train_eval/loss_avg_len_1024": 5.698990526291018, "train_eval/perplexity_len_1024": 298.5658544105799, "train_eval/loss_avg_len_512": 5.710699294427177, "train_eval/perplexity_len_512": 302.08223886516663}
4
+ {"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2200.08279568702, "train_eval/train_update_time": 1383.788816100161, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.296922603823786, "train_eval/perplexity_len_2048": 199.7212419010431, "train_eval/loss_avg_len_1024": 5.305337436088958, "train_eval/perplexity_len_1024": 201.40895359804367, "train_eval/loss_avg_len_512": 5.320490509328956, "train_eval/perplexity_len_512": 204.48415878511435}
5
+ {"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2818.422068757005, "train_eval/train_update_time": 1728.7950774162891, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.045304426316234, "train_eval/perplexity_len_2048": 155.29156684287375, "train_eval/loss_avg_len_1024": 5.053415232166262, "train_eval/perplexity_len_1024": 156.55622837075202, "train_eval/loss_avg_len_512": 5.070610678311423, "train_eval/perplexity_len_512": 159.27156133906544}
6
+ {"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3345.161585650989, "train_eval/train_update_time": 2073.822410382272, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.881278812076035, "train_eval/perplexity_len_2048": 131.79910244606154, "train_eval/loss_avg_len_1024": 4.889370008184379, "train_eval/perplexity_len_1024": 132.86984076618447, "train_eval/loss_avg_len_512": 4.908291251527554, "train_eval/perplexity_len_512": 135.40783867497828}
7
+ {"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3962.514591771993, "train_eval/train_update_time": 2418.8274328135885, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.752543167285239, "train_eval/perplexity_len_2048": 115.87860879757943, "train_eval/loss_avg_len_1024": 4.763826194274043, "train_eval/perplexity_len_1024": 117.19347414945925, "train_eval/loss_avg_len_512": 4.785651780011176, "train_eval/perplexity_len_512": 119.77940747075029}
8
+ {"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4489.443470049009, "train_eval/train_update_time": 2763.8374379616, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.653148743151705, "train_eval/perplexity_len_2048": 104.91481583709675, "train_eval/loss_avg_len_1024": 4.6641259991965125, "train_eval/perplexity_len_1024": 106.07283695212364, "train_eval/loss_avg_len_512": 4.687856853806879, "train_eval/perplexity_len_512": 108.62014133645553}
9
+ {"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5107.660124632996, "train_eval/train_update_time": 3108.876222961524, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.56949279251452, "train_eval/perplexity_len_2048": 96.49515429403105, "train_eval/loss_avg_len_1024": 4.584133220926888, "train_eval/perplexity_len_1024": 97.91827683495046, "train_eval/loss_avg_len_512": 4.612269650588205, "train_eval/perplexity_len_512": 100.7124725543258}
10
+ {"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5634.922143466014, "train_eval/train_update_time": 3453.9653959476273, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.500111393837432, "train_eval/perplexity_len_2048": 90.02715921272548, "train_eval/loss_avg_len_1024": 4.5146006559921075, "train_eval/perplexity_len_1024": 91.34108222421936, "train_eval/loss_avg_len_512": 4.545015140839531, "train_eval/perplexity_len_512": 94.16185288811836}
11
+ {"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6252.330847234989, "train_eval/train_update_time": 3799.045364828722, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.434400985772954, "train_eval/perplexity_len_2048": 84.30161189591196, "train_eval/loss_avg_len_1024": 4.448710203694063, "train_eval/perplexity_len_1024": 85.51657387892722, "train_eval/loss_avg_len_512": 4.479653784418624, "train_eval/perplexity_len_512": 88.20412974467796}
12
+ {"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6778.730279813986, "train_eval/train_update_time": 4144.1211769738, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.3785154093765595, "train_eval/perplexity_len_2048": 79.71959454765936, "train_eval/loss_avg_len_1024": 4.3939165947328, "train_eval/perplexity_len_1024": 80.95687412946403, "train_eval/loss_avg_len_512": 4.429078622167507, "train_eval/perplexity_len_512": 83.85411997858606}
13
+ {"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7395.902606271033, "train_eval/train_update_time": 4489.238088100974, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.332845308472933, "train_eval/perplexity_len_2048": 76.16067919713348, "train_eval/loss_avg_len_1024": 4.354710251906472, "train_eval/perplexity_len_1024": 77.84426684093609, "train_eval/loss_avg_len_512": 4.39427122400477, "train_eval/perplexity_len_512": 80.98558889804535}
14
+ {"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7922.504303377005, "train_eval/train_update_time": 4834.2961148990435, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.286469663051848, "train_eval/perplexity_len_2048": 72.70932644602816, "train_eval/loss_avg_len_1024": 4.309569447513205, "train_eval/perplexity_len_1024": 74.40844530144317, "train_eval/loss_avg_len_512": 4.354222116721867, "train_eval/perplexity_len_512": 77.80627758807101}
15
+ {"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8540.947155133996, "train_eval/train_update_time": 5179.397799044964, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.249375205130373, "train_eval/perplexity_len_2048": 70.06162452534807, "train_eval/loss_avg_len_1024": 4.273874875287056, "train_eval/perplexity_len_1024": 71.7993106682297, "train_eval/loss_avg_len_512": 4.321713214736082, "train_eval/perplexity_len_512": 75.31755296428902}
16
+ {"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9067.837796505017, "train_eval/train_update_time": 5524.4823192786425, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.2222635463871026, "train_eval/perplexity_len_2048": 68.18765565818079, "train_eval/loss_avg_len_1024": 4.249677161750205, "train_eval/perplexity_len_1024": 70.08278329102363, "train_eval/loss_avg_len_512": 4.299936973010299, "train_eval/perplexity_len_512": 73.69514876983682}
17
+ {"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9685.574564223003, "train_eval/train_update_time": 5869.583213592763, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.195244031412512, "train_eval/perplexity_len_2048": 66.36992594802435, "train_eval/loss_avg_len_1024": 4.218284104051746, "train_eval/perplexity_len_1024": 67.9168460075776, "train_eval/loss_avg_len_512": 4.2707039155407625, "train_eval/perplexity_len_512": 71.57199853357183}
18
+ {"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 10212.780399380019, "train_eval/train_update_time": 6214.659892122727, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.188622776587208, "train_eval/perplexity_len_2048": 65.93192541236388, "train_eval/loss_avg_len_1024": 4.215066284977402, "train_eval/perplexity_len_1024": 67.69865312590402, "train_eval/loss_avg_len_512": 4.268878927308142, "train_eval/perplexity_len_512": 71.4414995941971}
19
+ {"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 10830.689001108985, "train_eval/train_update_time": 6559.779601458809, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.176184563983006, "train_eval/perplexity_len_2048": 65.11692916224894, "train_eval/loss_avg_len_1024": 4.205971465967996, "train_eval/perplexity_len_1024": 67.08573753155041, "train_eval/loss_avg_len_512": 4.259276238732308, "train_eval/perplexity_len_512": 70.75875247262296}
metrics/jsonlines/val.jsonl CHANGED
@@ -1,49 +1,49 @@
1
- {"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 142.06357847998152, "val/train_update_time": 141.72754302585963, "val/loss": 8.017322944736389, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.9271335270023, "val/val_tokens_per_second": 450470.5956427875, "val/loss_avg_len_2048": 8.017322944736389, "val/perplexity_len_2048": 3033.046820927388, "val/loss_avg_len_1024": 8.016116743054521, "val/perplexity_len_1024": 3029.3905602879668, "val/loss_avg_len_512": 8.016581874255465, "val/perplexity_len_512": 3030.799952108046}
2
- {"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 371.19216373196105, "val/train_update_time": 279.66876631672494, "val/loss": 7.168872293418506, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.3155298740021, "val/val_tokens_per_second": 453521.1171007103, "val/loss_avg_len_2048": 7.168872293418506, "val/perplexity_len_2048": 1298.379585700498, "val/loss_avg_len_1024": 7.169298829473462, "val/perplexity_len_1024": 1298.933509532663, "val/loss_avg_len_512": 7.17260874950029, "val/perplexity_len_512": 1303.2399987050917}
3
- {"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 599.6671329609817, "val/train_update_time": 417.53961712069577, "val/loss": 6.680456670384901, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.79419281100854, "val/val_tokens_per_second": 451130.17398876767, "val/loss_avg_len_2048": 6.680456670384901, "val/perplexity_len_2048": 796.6828504192507, "val/loss_avg_len_1024": 6.681968356456887, "val/perplexity_len_1024": 797.8880955346282, "val/loss_avg_len_512": 6.6880630861138926, "val/perplexity_len_512": 802.7658569931743}
4
- {"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 1008.031190382957, "val/train_update_time": 734.6574540784932, "val/loss": 6.256492450360163, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 238.41674322698964, "val/val_tokens_per_second": 171800.01473723334, "val/loss_avg_len_2048": 6.256492450360163, "val/perplexity_len_2048": 521.3869384996046, "val/loss_avg_len_1024": 6.25937858268139, "val/perplexity_len_1024": 522.8939037992483, "val/loss_avg_len_512": 6.268213871597686, "val/perplexity_len_512": 527.5342919101196}
5
- {"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 1549.172318366007, "val/train_update_time": 1036.8226832836517, "val/loss": 5.9596897887737725, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 253.29563638399122, "val/val_tokens_per_second": 161708.27332337238, "val/loss_avg_len_2048": 5.9596897887737725, "val/perplexity_len_2048": 387.48990187397294, "val/loss_avg_len_1024": 5.963750460020918, "val/perplexity_len_1024": 389.0665699760066, "val/loss_avg_len_512": 5.9747771193729715, "val/perplexity_len_512": 393.38041444619915}
6
- {"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 2070.106143262994, "val/train_update_time": 1303.5278403796838, "val/loss": 5.729621500730747, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 272.19370538101066, "val/val_tokens_per_second": 150481.0698787656, "val/loss_avg_len_2048": 5.729621500730747, "val/perplexity_len_2048": 307.8527242916948, "val/loss_avg_len_1024": 5.73466736189276, "val/perplexity_len_1024": 309.4100320720618, "val/loss_avg_len_512": 5.747293829907757, "val/perplexity_len_512": 313.34155634560165}
7
- {"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 2598.984462487977, "val/train_update_time": 1559.7403547617723, "val/loss": 5.54191019657671, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 257.4766585089965, "val/val_tokens_per_second": 159082.3814367966, "val/loss_avg_len_2048": 5.54191019657671, "val/perplexity_len_2048": 255.1649494383086, "val/loss_avg_len_1024": 5.5479404277496975, "val/perplexity_len_1024": 256.70830177953565, "val/loss_avg_len_512": 5.5618576472472405, "val/perplexity_len_512": 260.3059440825885}
8
- {"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 3151.211080375011, "val/train_update_time": 1854.0981342886225, "val/loss": 5.395747513790498, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 241.25792559498223, "val/val_tokens_per_second": 169776.80587688804, "val/loss_avg_len_2048": 5.395747513790498, "val/perplexity_len_2048": 220.46688755473716, "val/loss_avg_len_1024": 5.40283216586914, "val/perplexity_len_1024": 222.03436470678773, "val/loss_avg_len_512": 5.417992734318786, "val/perplexity_len_512": 225.42617783355703}
9
- {"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 3717.7666637959774, "val/train_update_time": 2178.911191943742, "val/loss": 5.257520105597726, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 227.81780075002462, "val/val_tokens_per_second": 179792.79874158636, "val/loss_avg_len_2048": 5.257520105597726, "val/perplexity_len_2048": 192.0047489041577, "val/loss_avg_len_1024": 5.265500482419599, "val/perplexity_len_1024": 193.54314949562067, "val/loss_avg_len_512": 5.282038998350409, "val/perplexity_len_512": 196.77068168657516}
10
- {"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 4291.382857583987, "val/train_update_time": 2524.2801111027366, "val/loss": 5.150704617314763, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 222.7833782809903, "val/val_tokens_per_second": 183855.72710158981, "val/loss_avg_len_2048": 5.150704617314763, "val/perplexity_len_2048": 172.55303134546992, "val/loss_avg_len_1024": 5.1593652144801805, "val/perplexity_len_1024": 174.0539336132167, "val/loss_avg_len_512": 5.177391785788723, "val/perplexity_len_512": 177.21998000419174}
11
- {"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 4855.23253793997, "val/train_update_time": 2864.3428287997376, "val/loss": 5.0635993114376445, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 233.8825490900199, "val/val_tokens_per_second": 175130.63783238808, "val/loss_avg_len_2048": 5.0635993114376445, "val/perplexity_len_2048": 158.15875569300152, "val/loss_avg_len_1024": 5.0730407805304045, "val/perplexity_len_1024": 159.6590781757551, "val/loss_avg_len_512": 5.092240632939898, "val/perplexity_len_512": 162.75412606608745}
12
- {"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 5402.695571646967, "val/train_update_time": 3177.3816385426326, "val/loss": 4.98549556239089, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 247.86481602699496, "val/val_tokens_per_second": 165251.36829238018, "val/loss_avg_len_2048": 4.98549556239089, "val/perplexity_len_2048": 146.2760459748089, "val/loss_avg_len_1024": 4.995756369349081, "val/perplexity_len_1024": 147.78468292514813, "val/loss_avg_len_512": 5.016161771441624, "val/perplexity_len_512": 150.8312664737655}
13
- {"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 5930.809683200961, "val/train_update_time": 3457.191259445739, "val/loss": 4.916477123672562, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 264.10963063000236, "val/val_tokens_per_second": 155087.11250814577, "val/loss_avg_len_2048": 4.916477123672562, "val/perplexity_len_2048": 136.5208190724984, "val/loss_avg_len_1024": 4.927128413101426, "val/perplexity_len_1024": 137.98271353908035, "val/loss_avg_len_512": 4.948208645739966, "val/perplexity_len_512": 140.92229592495846}
14
- {"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 6451.577037203999, "val/train_update_time": 3713.4232930167927, "val/loss": 4.863091215804801, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 263.84062808501767, "val/val_tokens_per_second": 155245.23382654096, "val/loss_avg_len_2048": 4.863091215804801, "val/perplexity_len_2048": 129.42366084863215, "val/loss_avg_len_1024": 4.874493102245079, "val/perplexity_len_1024": 130.9077795303594, "val/loss_avg_len_512": 4.896728463353682, "val/perplexity_len_512": 133.85116361495074}
15
- {"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 6997.080480206001, "val/train_update_time": 3994.723552759795, "val/loss": 4.811523659892753, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 245.4330290229991, "val/val_tokens_per_second": 166888.7034603713, "val/loss_avg_len_2048": 4.811523659892753, "val/perplexity_len_2048": 122.91876129597873, "val/loss_avg_len_1024": 4.8232065941833895, "val/perplexity_len_1024": 124.36323452041226, "val/loss_avg_len_512": 4.846166890252475, "val/perplexity_len_512": 127.2516841422241}
16
- {"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 7558.353761503007, "val/train_update_time": 4309.650610209792, "val/loss": 4.760587245357363, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 233.23207824397832, "val/val_tokens_per_second": 175619.06710428037, "val/loss_avg_len_2048": 4.760587245357363, "val/perplexity_len_2048": 116.8145045362375, "val/loss_avg_len_1024": 4.77283736684951, "val/perplexity_len_1024": 118.25429722128945, "val/loss_avg_len_512": 4.796683278769628, "val/perplexity_len_512": 121.10806894510807}
17
- {"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 8133.470018550986, "val/train_update_time": 4651.074509458733, "val/loss": 4.719228506370658, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 222.3557770110201, "val/val_tokens_per_second": 184209.29085179555, "val/loss_avg_len_2048": 4.719228506370658, "val/perplexity_len_2048": 112.08174894828639, "val/loss_avg_len_1024": 4.73204894817751, "val/perplexity_len_1024": 113.52793706523403, "val/loss_avg_len_512": 4.756577379063424, "val/perplexity_len_512": 116.3470318696743}
18
- {"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 8701.49334405997, "val/train_update_time": 4996.244725472818, "val/loss": 4.676367494543736, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 229.5753246319946, "val/val_tokens_per_second": 178416.38715161657, "val/loss_avg_len_2048": 4.676367494543736, "val/perplexity_len_2048": 107.37930735283909, "val/loss_avg_len_1024": 4.689829182334012, "val/perplexity_len_1024": 108.8345873493113, "val/loss_avg_len_512": 4.7154578478252525, "val/perplexity_len_512": 111.65992272494637}
19
- {"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 9255.459071734978, "val/train_update_time": 5320.0803467377555, "val/loss": 4.640193889026716, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 241.94053427799372, "val/val_tokens_per_second": 169297.799239115, "val/loss_avg_len_2048": 4.640193889026716, "val/perplexity_len_2048": 103.56442564245071, "val/loss_avg_len_1024": 4.654089609145093, "val/perplexity_len_1024": 105.01357307089665, "val/loss_avg_len_512": 4.680419558078237, "val/perplexity_len_512": 107.81529786259468}
20
- {"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 9790.984413733007, "val/train_update_time": 5613.200303242775, "val/loss": 4.608071265847772, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 257.74714238900924, "val/val_tokens_per_second": 158915.4378991346, "val/loss_avg_len_2048": 4.608071265847772, "val/perplexity_len_2048": 100.29052920641857, "val/loss_avg_len_1024": 4.622682944629249, "val/perplexity_len_1024": 101.76670061160816, "val/loss_avg_len_512": 4.650229472655617, "val/perplexity_len_512": 104.60898772530528}
21
- {"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 10305.814423859003, "val/train_update_time": 5869.396386382927, "val/loss": 4.577349349257373, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 270.03506823600037, "val/val_tokens_per_second": 151684.00262814204, "val/loss_avg_len_2048": 4.577349349257373, "val/perplexity_len_2048": 97.25625986875657, "val/loss_avg_len_1024": 4.592617217212357, "val/perplexity_len_1024": 98.75254910922418, "val/loss_avg_len_512": 4.621059415361099, "val/perplexity_len_512": 101.60161344282939}
22
- {"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 10844.433852619957, "val/train_update_time": 6137.63313861005, "val/loss": 4.549797477854183, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 251.83948333299486, "val/val_tokens_per_second": 162643.28157726015, "val/loss_avg_len_2048": 4.549797477854183, "val/perplexity_len_2048": 94.61324509708179, "val/loss_avg_len_1024": 4.565505847024965, "val/perplexity_len_1024": 96.11119928630141, "val/loss_avg_len_512": 4.594841379802302, "val/perplexity_len_512": 98.97243527526068}
23
- {"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 11400.883047727984, "val/train_update_time": 6441.803211065067, "val/loss": 4.5204342533537885, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 237.13705036602914, "val/val_tokens_per_second": 172727.12103307704, "val/loss_avg_len_2048": 4.5204342533537885, "val/perplexity_len_2048": 91.87548655482323, "val/loss_avg_len_1024": 4.536794685186399, "val/perplexity_len_1024": 93.39097238779135, "val/loss_avg_len_512": 4.567255290885735, "val/perplexity_len_512": 96.27948759639831}
24
- {"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 11971.117434685002, "val/train_update_time": 6774.459673628269, "val/loss": 4.492667575135734, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 225.4844003280159, "val/val_tokens_per_second": 181653.36466919578, "val/loss_avg_len_2048": 4.492667575135734, "val/perplexity_len_2048": 89.35950140608207, "val/loss_avg_len_1024": 4.509617001681402, "val/perplexity_len_1024": 90.88700227462165, "val/loss_avg_len_512": 4.5411422349753785, "val/perplexity_len_512": 93.7978781707472}
25
- {"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 12542.405912315007, "val/train_update_time": 7119.801959850243, "val/loss": 4.4686831552135295, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 226.0238120619906, "val/val_tokens_per_second": 181219.84416741927, "val/loss_avg_len_2048": 4.4686831552135295, "val/perplexity_len_2048": 87.24176347672332, "val/loss_avg_len_1024": 4.486252108311607, "val/perplexity_len_1024": 88.78805350190629, "val/loss_avg_len_512": 4.51881691169506, "val/perplexity_len_512": 91.72701260192157}
26
- {"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 13102.122854663, "val/train_update_time": 7452.531334599189, "val/loss": 4.4460520937834875, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 237.48496917897137, "val/val_tokens_per_second": 172474.07337654318, "val/loss_avg_len_2048": 4.4460520937834875, "val/perplexity_len_2048": 85.28956326961926, "val/loss_avg_len_1024": 4.464425405966584, "val/perplexity_len_1024": 86.87109958090244, "val/loss_avg_len_512": 4.498216118935217, "val/perplexity_len_512": 89.85669458704189}
27
- {"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 13643.746936798969, "val/train_update_time": 7756.201354784251, "val/loss": 4.420050854198402, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 252.08845614700112, "val/val_tokens_per_second": 162482.64845620247, "val/loss_avg_len_2048": 4.420050854198402, "val/perplexity_len_2048": 83.10051126077735, "val/loss_avg_len_1024": 4.439237378784268, "val/perplexity_len_1024": 84.71031515065614, "val/loss_avg_len_512": 4.474183511526417, "val/perplexity_len_512": 87.7229463868}
28
- {"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 14164.158704472007, "val/train_update_time": 8024.111574862094, "val/loss": 4.398202258219151, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 269.1987454550108, "val/val_tokens_per_second": 152155.2410311858, "val/loss_avg_len_2048": 4.398202258219151, "val/perplexity_len_2048": 81.30457257597935, "val/loss_avg_len_1024": 4.418109852228035, "val/perplexity_len_1024": 82.93936944356574, "val/loss_avg_len_512": 4.454481553460378, "val/perplexity_len_512": 86.01154689490046}
29
- {"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 14689.471108104975, "val/train_update_time": 8279.822117615142, "val/loss": 4.376139390771115, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 257.45918424101546, "val/val_tokens_per_second": 159093.1786750947, "val/loss_avg_len_2048": 4.376139390771115, "val/perplexity_len_2048": 79.53040415674566, "val/loss_avg_len_1024": 4.396944615813718, "val/perplexity_len_1024": 81.2023847690807, "val/loss_avg_len_512": 4.435031853418239, "val/perplexity_len_512": 84.35481183459801}
30
- {"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 15240.21573638398, "val/train_update_time": 8572.7469220013, "val/loss": 4.355563867319981, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 241.68896219797898, "val/val_tokens_per_second": 169474.01994488976, "val/loss_avg_len_2048": 4.355563867319981, "val/perplexity_len_2048": 77.9107442760098, "val/loss_avg_len_1024": 4.377231672070688, "val/perplexity_len_1024": 79.61732119023712, "val/loss_avg_len_512": 4.4167310255174534, "val/perplexity_len_512": 82.82508923002905}
31
- {"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 15806.320744203986, "val/train_update_time": 8896.237034060294, "val/loss": 4.335396474500792, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 229.482882693992, "val/val_tokens_per_second": 178488.25811822677, "val/loss_avg_len_2048": 4.335396474500792, "val/perplexity_len_2048": 76.35522578937196, "val/loss_avg_len_1024": 4.358097750052391, "val/perplexity_len_1024": 78.10841129235861, "val/loss_avg_len_512": 4.399293633644097, "val/perplexity_len_512": 81.3933548269956}
32
- {"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 16381.60338349198, "val/train_update_time": 9241.559773176385, "val/loss": 4.31625511668981, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 222.342165366048, "val/val_tokens_per_second": 184220.56802661082, "val/loss_avg_len_2048": 4.31625511668981, "val/perplexity_len_2048": 74.90758222363428, "val/loss_avg_len_1024": 4.339928405715012, "val/perplexity_len_1024": 76.70204771345033, "val/loss_avg_len_512": 4.382818944332655, "val/perplexity_len_512": 80.06340988951868}
33
- {"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 16945.237456377014, "val/train_update_time": 9582.302119482367, "val/loss": 4.2996819401991555, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 232.9728370330413, "val/val_tokens_per_second": 175814.48773871807, "val/loss_avg_len_2048": 4.2996819401991555, "val/perplexity_len_2048": 73.67635648530485, "val/loss_avg_len_1024": 4.324086923091021, "val/perplexity_len_1024": 75.49654722507492, "val/loss_avg_len_512": 4.368286226595659, "val/perplexity_len_512": 78.90828483582929}
34
- {"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 17493.055093375966, "val/train_update_time": 9896.617073222296, "val/loss": 4.283236857734924, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 246.54381514398847, "val/val_tokens_per_second": 166136.79794026964, "val/loss_avg_len_2048": 4.283236857734924, "val/perplexity_len_2048": 72.47465088349, "val/loss_avg_len_1024": 4.308458720062673, "val/perplexity_len_1024": 74.32584368113129, "val/loss_avg_len_512": 4.3540999811033725, "val/perplexity_len_512": 77.7967752505338}
35
- {"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 18021.479664082, "val/train_update_time": 10178.065654869366, "val/loss": 4.267898958559893, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 262.80700952303596, "val/val_tokens_per_second": 155855.81250035003, "val/loss_avg_len_2048": 4.267898958559893, "val/perplexity_len_2048": 71.371523450084, "val/loss_avg_len_1024": 4.29405301307696, "val/perplexity_len_1024": 73.26280266811683, "val/loss_avg_len_512": 4.3412228272167965, "val/perplexity_len_512": 76.80139677915308}
36
- {"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 18541.505700192996, "val/train_update_time": 10434.371385331382, "val/loss": 4.25385695036254, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 264.3623202079907, "val/val_tokens_per_second": 154938.87316382362, "val/loss_avg_len_2048": 4.25385695036254, "val/perplexity_len_2048": 70.3763275596729, "val/loss_avg_len_1024": 4.280959435730102, "val/perplexity_len_1024": 72.30978332653642, "val/loss_avg_len_512": 4.3297005797375, "val/perplexity_len_512": 75.92155071492496}
37
- {"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 19085.297822847962, "val/train_update_time": 10713.447329278395, "val/loss": 4.241492192271679, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 247.3863678930211, "val/val_tokens_per_second": 165570.96637480284, "val/loss_avg_len_2048": 4.241492192271679, "val/perplexity_len_2048": 69.51149901038498, "val/loss_avg_len_1024": 4.26916799461185, "val/perplexity_len_1024": 71.46215398096777, "val/loss_avg_len_512": 4.318984106020722, "val/perplexity_len_512": 75.11228340295357}
38
- {"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 19645.975325819978, "val/train_update_time": 11026.27975769632, "val/loss": 4.230574601543066, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 232.45659935899312, "val/val_tokens_per_second": 176204.93508443545, "val/loss_avg_len_2048": 4.230574601543066, "val/perplexity_len_2048": 68.75672854774076, "val/loss_avg_len_1024": 4.259278581639892, "val/perplexity_len_1024": 70.75891825403492, "val/loss_avg_len_512": 4.310739070640505, "val/perplexity_len_512": 74.49552605583897}
39
- {"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 20219.25819616299, "val/train_update_time": 11366.63659675623, "val/loss": 4.220437906114012, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 222.7093477669987, "val/val_tokens_per_second": 183916.84233592596, "val/loss_avg_len_2048": 4.220437906114012, "val/perplexity_len_2048": 68.06328309221031, "val/loss_avg_len_1024": 4.2494930887183635, "val/perplexity_len_1024": 70.0698841278538, "val/loss_avg_len_512": 4.301655115112848, "val/perplexity_len_512": 73.82187634450761}
40
- {"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 20787.76660131797, "val/train_update_time": 11711.969741588284, "val/loss": 4.212082446529414, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 229.00648188899504, "val/val_tokens_per_second": 178859.5661665782, "val/loss_avg_len_2048": 4.212082446529414, "val/perplexity_len_2048": 67.49695235274098, "val/loss_avg_len_1024": 4.241754451114219, "val/perplexity_len_1024": 69.52973140635773, "val/loss_avg_len_512": 4.295026430321672, "val/perplexity_len_512": 73.33415266465448}
41
- {"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 21342.972027021984, "val/train_update_time": 12037.18170108716, "val/loss": 4.20452369724242, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 241.201147957996, "val/val_tokens_per_second": 169816.7705534013, "val/loss_avg_len_2048": 4.20452369724242, "val/perplexity_len_2048": 66.9886831719058, "val/loss_avg_len_1024": 4.23449458040651, "val/perplexity_len_1024": 69.02678242730825, "val/loss_avg_len_512": 4.288312720157765, "val/perplexity_len_512": 72.84345745438628}
42
- {"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 21879.736517031968, "val/train_update_time": 12332.311124075088, "val/loss": 4.198343608177501, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 256.09363065700745, "val/val_tokens_per_second": 159941.50223462115, "val/loss_avg_len_2048": 4.198343608177501, "val/perplexity_len_2048": 66.57596377846592, "val/loss_avg_len_1024": 4.228689314186201, "val/perplexity_len_1024": 68.62722447126744, "val/loss_avg_len_512": 4.283119718784839, "val/perplexity_len_512": 72.46616177619002}
43
- {"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 22393.132838854974, "val/train_update_time": 12589.186431614158, "val/loss": 4.193367760937754, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 269.71012579498347, "val/val_tokens_per_second": 151866.7490857766, "val/loss_avg_len_2048": 4.193367760937754, "val/perplexity_len_2048": 66.24551476656741, "val/loss_avg_len_1024": 4.224034787101439, "val/perplexity_len_1024": 68.30853943562786, "val/loss_avg_len_512": 4.2790641182546505, "val/perplexity_len_512": 72.17286312516411}
44
- {"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 22929.256941216998, "val/train_update_time": 12855.194684728282, "val/loss": 4.189421963141696, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 251.1456275290111, "val/val_tokens_per_second": 163092.62638971687, "val/loss_avg_len_2048": 4.189421963141696, "val/perplexity_len_2048": 65.98463838160738, "val/loss_avg_len_1024": 4.22020412462051, "val/perplexity_len_1024": 68.04737301604953, "val/loss_avg_len_512": 4.2755115066579545, "val/perplexity_len_512": 71.9169158844202}
45
- {"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 23483.878039386997, "val/train_update_time": 13158.19501615403, "val/loss": 4.186491362652555, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 237.201484196994, "val/val_tokens_per_second": 172680.20113222834, "val/loss_avg_len_2048": 4.186491362652555, "val/perplexity_len_2048": 65.79154684336461, "val/loss_avg_len_1024": 4.217514740810637, "val/perplexity_len_1024": 67.86461337831629, "val/loss_avg_len_512": 4.273214724269416, "val/perplexity_len_512": 71.75192792183111}
46
- {"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 24053.666120332957, "val/train_update_time": 13489.859810477996, "val/loss": 4.184350719433324, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 226.46256143203937, "val/val_tokens_per_second": 180868.7481983284, "val/loss_avg_len_2048": 4.184350719433324, "val/perplexity_len_2048": 65.65086124728785, "val/loss_avg_len_1024": 4.215543735674023, "val/perplexity_len_1024": 67.73098361249181, "val/loss_avg_len_512": 4.271550920667592, "val/perplexity_len_512": 71.63264606402603}
47
- {"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 24626.054740517982, "val/train_update_time": 13835.318780667149, "val/loss": 4.183023557277815, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 225.38604856299935, "val/val_tokens_per_second": 181732.6327922687, "val/loss_avg_len_2048": 4.183023557277815, "val/perplexity_len_2048": 65.56378970057509, "val/loss_avg_len_1024": 4.214286808201578, "val/perplexity_len_1024": 67.64590415900594, "val/loss_avg_len_512": 4.270415190260951, "val/perplexity_len_512": 71.5513368711842}
48
- {"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 25185.487074052973, "val/train_update_time": 14168.81472938013, "val/loss": 4.18224084139755, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 236.68519899004605, "val/val_tokens_per_second": 173056.8712145055, "val/loss_avg_len_2048": 4.18224084139755, "val/perplexity_len_2048": 65.51249195960324, "val/loss_avg_len_1024": 4.213500716115302, "val/perplexity_len_1024": 67.59274914418302, "val/loss_avg_len_512": 4.269666781060863, "val/perplexity_len_512": 71.4978072259293}
49
- {"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 25728.225285313965, "val/train_update_time": 14474.342566145177, "val/loss": 4.1819093990348515, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 250.69932467199396, "val/val_tokens_per_second": 163382.96903508058, "val/loss_avg_len_2048": 4.1819093990348515, "val/perplexity_len_2048": 65.49078194249032, "val/loss_avg_len_1024": 4.213224817466876, "val/perplexity_len_1024": 67.57410296839636, "val/loss_avg_len_512": 4.269427753666788, "val/perplexity_len_512": 71.48071933370453}
 
1
+ {"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 142.2743778140284, "val/train_update_time": 141.94980711926473, "val/loss": 8.017322944736389, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.2959264080273, "val/val_tokens_per_second": 453619.5776419728, "val/loss_avg_len_2048": 8.017322944736389, "val/perplexity_len_2048": 3033.046820927388, "val/loss_avg_len_1024": 8.016116743054521, "val/perplexity_len_1024": 3029.3905602879668, "val/loss_avg_len_512": 8.016581874255465, "val/perplexity_len_512": 3030.799952108046}
2
+ {"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 370.84139602299547, "val/train_update_time": 279.98310620122356, "val/loss": 7.168872293418506, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.32955138100078, "val/val_tokens_per_second": 453450.71877125703, "val/loss_avg_len_2048": 7.168872293418506, "val/perplexity_len_2048": 1298.379585700498, "val/loss_avg_len_1024": 7.169298829473462, "val/perplexity_len_1024": 1298.933509532663, "val/loss_avg_len_512": 7.17260874950029, "val/perplexity_len_512": 1303.2399987050917}
3
+ {"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 599.4013552149991, "val/train_update_time": 417.94741392938886, "val/loss": 6.680456670384901, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.34830521600088, "val/val_tokens_per_second": 453356.5948146407, "val/loss_avg_len_2048": 6.680456670384901, "val/perplexity_len_2048": 796.6828504192507, "val/loss_avg_len_1024": 6.681968356456887, "val/perplexity_len_1024": 797.8880955346282, "val/loss_avg_len_512": 6.6880630861138926, "val/perplexity_len_512": 802.7658569931743}
4
+ {"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 827.9917874370003, "val/train_update_time": 555.9067850944703, "val/loss": 6.256492450360163, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.29714147699997, "val/val_tokens_per_second": 453613.47358302725, "val/loss_avg_len_2048": 6.256492450360163, "val/perplexity_len_2048": 521.3869384996046, "val/loss_avg_len_1024": 6.25937858268139, "val/perplexity_len_1024": 522.8939037992483, "val/loss_avg_len_512": 6.268213871597686, "val/perplexity_len_512": 527.5342919101196}
5
+ {"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 1056.5339453950291, "val/train_update_time": 693.8810286904918, "val/loss": 5.9596897887737725, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.28463066503173, "val/val_tokens_per_second": 453676.33115726174, "val/loss_avg_len_2048": 5.9596897887737725, "val/perplexity_len_2048": 387.48990187397294, "val/loss_avg_len_1024": 5.963750460020918, "val/perplexity_len_1024": 389.0665699760066, "val/loss_avg_len_512": 5.9747771193729715, "val/perplexity_len_512": 393.38041444619915}
6
+ {"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 1285.4777986719855, "val/train_update_time": 831.8405811304692, "val/loss": 5.729621500730747, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.29542479297379, "val/val_tokens_per_second": 453622.09761913924, "val/loss_avg_len_2048": 5.729621500730747, "val/perplexity_len_2048": 307.8527242916948, "val/loss_avg_len_1024": 5.73466736189276, "val/perplexity_len_1024": 309.4100320720618, "val/loss_avg_len_512": 5.747293829907757, "val/perplexity_len_512": 313.34155634560165}
7
+ {"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 1514.0213319549803, "val/train_update_time": 969.8139603384188, "val/loss": 5.54191019657671, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.56256327196024, "val/val_tokens_per_second": 452284.01803288993, "val/loss_avg_len_2048": 5.54191019657671, "val/perplexity_len_2048": 255.1649494383086, "val/loss_avg_len_1024": 5.5479404277496975, "val/perplexity_len_1024": 256.70830177953565, "val/loss_avg_len_512": 5.5618576472472405, "val/perplexity_len_512": 260.3059440825885}
8
+ {"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 1742.8283430220326, "val/train_update_time": 1107.798082921363, "val/loss": 5.395747513790498, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.40836303203832, "val/val_tokens_per_second": 453055.43233301176, "val/loss_avg_len_2048": 5.395747513790498, "val/perplexity_len_2048": 220.46688755473716, "val/loss_avg_len_1024": 5.40283216586914, "val/perplexity_len_1024": 222.03436470678773, "val/loss_avg_len_512": 5.417992734318786, "val/perplexity_len_512": 225.42617783355703}
9
+ {"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 1971.4918935780297, "val/train_update_time": 1245.7879514921806, "val/loss": 5.257520105597726, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.32240011100657, "val/val_tokens_per_second": 453486.62070162, "val/loss_avg_len_2048": 5.257520105597726, "val/perplexity_len_2048": 192.0047489041577, "val/loss_avg_len_1024": 5.265500482419599, "val/perplexity_len_1024": 193.54314949562067, "val/loss_avg_len_512": 5.282038998350409, "val/perplexity_len_512": 196.77068168657516}
10
+ {"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 2200.08279568702, "val/train_update_time": 1383.788816100161, "val/loss": 5.150704617314763, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.40961887600133, "val/val_tokens_per_second": 453049.1391206669, "val/loss_avg_len_2048": 5.150704617314763, "val/perplexity_len_2048": 172.55303134546992, "val/loss_avg_len_1024": 5.1593652144801805, "val/perplexity_len_1024": 174.0539336132167, "val/loss_avg_len_512": 5.177391785788723, "val/perplexity_len_512": 177.21998000419174}
11
+ {"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 2429.163681267004, "val/train_update_time": 1521.7770175782498, "val/loss": 5.0635993114376445, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.79174483998213, "val/val_tokens_per_second": 451142.3375791581, "val/loss_avg_len_2048": 5.0635993114376445, "val/perplexity_len_2048": 158.15875569300152, "val/loss_avg_len_1024": 5.0730407805304045, "val/perplexity_len_1024": 159.6590781757551, "val/loss_avg_len_512": 5.092240632939898, "val/perplexity_len_512": 162.75412606608745}
12
+ {"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 2658.232374906016, "val/train_update_time": 1659.7850940762437, "val/loss": 4.98549556239089, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 91.04536901297979, "val/val_tokens_per_second": 449885.5948857825, "val/loss_avg_len_2048": 4.98549556239089, "val/perplexity_len_2048": 146.2760459748089, "val/loss_avg_len_1024": 4.995756369349081, "val/perplexity_len_1024": 147.78468292514813, "val/loss_avg_len_512": 5.016161771441624, "val/perplexity_len_512": 150.8312664737655}
13
+ {"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 2887.563738073979, "val/train_update_time": 1797.8180341873667, "val/loss": 4.916477123672562, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.70920936204493, "val/val_tokens_per_second": 451552.82785585296, "val/loss_avg_len_2048": 4.916477123672562, "val/perplexity_len_2048": 136.5208190724984, "val/loss_avg_len_1024": 4.927128413101426, "val/perplexity_len_1024": 137.98271353908035, "val/loss_avg_len_512": 4.948208645739966, "val/perplexity_len_512": 140.92229592495846}
14
+ {"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 3116.5153146539815, "val/train_update_time": 1935.8127364134416, "val/loss": 4.863091215804801, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.4004945270135, "val/val_tokens_per_second": 453094.86650828354, "val/loss_avg_len_2048": 4.863091215804801, "val/perplexity_len_2048": 129.42366084863215, "val/loss_avg_len_1024": 4.874493102245079, "val/perplexity_len_1024": 130.9077795303594, "val/loss_avg_len_512": 4.896728463353682, "val/perplexity_len_512": 133.85116361495074}
15
+ {"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 3345.161585650989, "val/train_update_time": 2073.822410382272, "val/loss": 4.811523659892753, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.22876474499935, "val/val_tokens_per_second": 453957.2287813026, "val/loss_avg_len_2048": 4.811523659892753, "val/perplexity_len_2048": 122.91876129597873, "val/loss_avg_len_1024": 4.8232065941833895, "val/perplexity_len_1024": 124.36323452041226, "val/loss_avg_len_512": 4.846166890252475, "val/perplexity_len_512": 127.2516841422241}
16
+ {"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 3574.040140778001, "val/train_update_time": 2211.8163213434746, "val/loss": 4.760587245357363, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.51972289703554, "val/val_tokens_per_second": 452498.07101808325, "val/loss_avg_len_2048": 4.760587245357363, "val/perplexity_len_2048": 116.8145045362375, "val/loss_avg_len_1024": 4.77283736684951, "val/perplexity_len_1024": 118.25429722128945, "val/loss_avg_len_512": 4.796683278769628, "val/perplexity_len_512": 121.10806894510807}
17
+ {"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 3802.836212512979, "val/train_update_time": 2349.8270930235158, "val/loss": 4.719228506370658, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.55211566400249, "val/val_tokens_per_second": 452336.2010887061, "val/loss_avg_len_2048": 4.719228506370658, "val/perplexity_len_2048": 112.08174894828639, "val/loss_avg_len_1024": 4.73204894817751, "val/perplexity_len_1024": 113.52793706523403, "val/loss_avg_len_512": 4.756577379063424, "val/perplexity_len_512": 116.3470318696743}
18
+ {"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 4031.6405439740047, "val/train_update_time": 2487.827474080492, "val/loss": 4.676367494543736, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.48990405898076, "val/val_tokens_per_second": 452647.18120711594, "val/loss_avg_len_2048": 4.676367494543736, "val/perplexity_len_2048": 107.37930735283909, "val/loss_avg_len_1024": 4.689829182334012, "val/perplexity_len_1024": 108.8345873493113, "val/loss_avg_len_512": 4.7154578478252525, "val/perplexity_len_512": 111.65992272494637}
19
+ {"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 4260.37410283502, "val/train_update_time": 2625.818530491437, "val/loss": 4.640193889026716, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.78033580299234, "val/val_tokens_per_second": 451199.03597723704, "val/loss_avg_len_2048": 4.640193889026716, "val/perplexity_len_2048": 103.56442564245071, "val/loss_avg_len_1024": 4.654089609145093, "val/perplexity_len_1024": 105.01357307089665, "val/loss_avg_len_512": 4.680419558078237, "val/perplexity_len_512": 107.81529786259468}
20
+ {"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 4489.443470049009, "val/train_update_time": 2763.8374379616, "val/loss": 4.608071265847772, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.74466375104384, "val/val_tokens_per_second": 451376.40393238916, "val/loss_avg_len_2048": 4.608071265847772, "val/perplexity_len_2048": 100.29052920641857, "val/loss_avg_len_1024": 4.622682944629249, "val/perplexity_len_1024": 101.76670061160816, "val/loss_avg_len_512": 4.650229472655617, "val/perplexity_len_512": 104.60898772530528}
21
+ {"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 4718.872143707995, "val/train_update_time": 2901.8397621414624, "val/loss": 4.577349349257373, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.85865825600922, "val/val_tokens_per_second": 450810.09103819757, "val/loss_avg_len_2048": 4.577349349257373, "val/perplexity_len_2048": 97.25625986875657, "val/loss_avg_len_1024": 4.592617217212357, "val/perplexity_len_1024": 98.75254910922418, "val/loss_avg_len_512": 4.621059415361099, "val/perplexity_len_512": 101.60161344282939}
22
+ {"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 4947.998132903012, "val/train_update_time": 3039.859961154347, "val/loss": 4.549797477854183, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.51020548597444, "val/val_tokens_per_second": 452545.65250486817, "val/loss_avg_len_2048": 4.549797477854183, "val/perplexity_len_2048": 94.61324509708179, "val/loss_avg_len_1024": 4.565505847024965, "val/perplexity_len_1024": 96.11119928630141, "val/loss_avg_len_512": 4.594841379802302, "val/perplexity_len_512": 98.97243527526068}
23
+ {"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 5176.88071361999, "val/train_update_time": 3177.899590641551, "val/loss": 4.5204342533537885, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.75762628903612, "val/val_tokens_per_second": 451311.9356995361, "val/loss_avg_len_2048": 4.5204342533537885, "val/perplexity_len_2048": 91.87548655482323, "val/loss_avg_len_1024": 4.536794685186399, "val/perplexity_len_1024": 93.39097238779135, "val/loss_avg_len_512": 4.567255290885735, "val/perplexity_len_512": 96.27948759639831}
24
+ {"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 5405.913057841011, "val/train_update_time": 3315.925435980549, "val/loss": 4.492667575135734, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.73625983402599, "val/val_tokens_per_second": 451418.2100399961, "val/loss_avg_len_2048": 4.492667575135734, "val/perplexity_len_2048": 89.35950140608207, "val/loss_avg_len_1024": 4.509617001681402, "val/perplexity_len_1024": 90.88700227462165, "val/loss_avg_len_512": 4.5411422349753785, "val/perplexity_len_512": 93.7978781707472}
25
+ {"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 5634.922143466014, "val/train_update_time": 3453.9653959476273, "val/loss": 4.4686831552135295, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.4348212070181, "val/val_tokens_per_second": 452922.88361179776, "val/loss_avg_len_2048": 4.4686831552135295, "val/perplexity_len_2048": 87.24176347672332, "val/loss_avg_len_1024": 4.486252108311607, "val/perplexity_len_1024": 88.78805350190629, "val/loss_avg_len_512": 4.51881691169506, "val/perplexity_len_512": 91.72701260192157}
26
+ {"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 5864.05088400899, "val/train_update_time": 3591.9979424396297, "val/loss": 4.4460520937834875, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.54608423600439, "val/val_tokens_per_second": 452366.33196902875, "val/loss_avg_len_2048": 4.4460520937834875, "val/perplexity_len_2048": 85.28956326961926, "val/loss_avg_len_1024": 4.464425405966584, "val/perplexity_len_1024": 86.87109958090244, "val/loss_avg_len_512": 4.498216118935217, "val/perplexity_len_512": 89.85669458704189}
27
+ {"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 6092.851395011006, "val/train_update_time": 3730.020544492756, "val/loss": 4.420050854198402, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.32853831601096, "val/val_tokens_per_second": 453455.8043738402, "val/loss_avg_len_2048": 4.420050854198402, "val/perplexity_len_2048": 83.10051126077735, "val/loss_avg_len_1024": 4.439237378784268, "val/perplexity_len_1024": 84.71031515065614, "val/loss_avg_len_512": 4.474183511526417, "val/perplexity_len_512": 87.7229463868}
28
+ {"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 6321.474026971031, "val/train_update_time": 3868.0566598027945, "val/loss": 4.398202258219151, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.34568081604084, "val/val_tokens_per_second": 453369.7641108214, "val/loss_avg_len_2048": 4.398202258219151, "val/perplexity_len_2048": 81.30457257597935, "val/loss_avg_len_1024": 4.418109852228035, "val/perplexity_len_1024": 82.93936944356574, "val/loss_avg_len_512": 4.454481553460378, "val/perplexity_len_512": 86.01154689490046}
29
+ {"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 6550.097681444022, "val/train_update_time": 4006.072604118788, "val/loss": 4.376139390771115, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.32946746400557, "val/val_tokens_per_second": 453451.1400315928, "val/loss_avg_len_2048": 4.376139390771115, "val/perplexity_len_2048": 79.53040415674566, "val/loss_avg_len_1024": 4.396944615813718, "val/perplexity_len_1024": 81.2023847690807, "val/loss_avg_len_512": 4.435031853418239, "val/perplexity_len_512": 84.35481183459801}
30
+ {"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 6778.730279813986, "val/train_update_time": 4144.1211769738, "val/loss": 4.355563867319981, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.37096956197638, "val/val_tokens_per_second": 453242.89645813353, "val/loss_avg_len_2048": 4.355563867319981, "val/perplexity_len_2048": 77.9107442760098, "val/loss_avg_len_1024": 4.377231672070688, "val/perplexity_len_1024": 79.61732119023712, "val/loss_avg_len_512": 4.4167310255174534, "val/perplexity_len_512": 82.82508923002905}
31
+ {"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 7007.788445023994, "val/train_update_time": 4282.135617908789, "val/loss": 4.335396474500792, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.33323211199604, "val/val_tokens_per_second": 453432.2424024127, "val/loss_avg_len_2048": 4.335396474500792, "val/perplexity_len_2048": 76.35522578937196, "val/loss_avg_len_1024": 4.358097750052391, "val/perplexity_len_1024": 78.10841129235861, "val/loss_avg_len_512": 4.399293633644097, "val/perplexity_len_512": 81.3933548269956}
32
+ {"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 7236.399417778011, "val/train_update_time": 4420.153171113925, "val/loss": 4.31625511668981, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.29227029101457, "val/val_tokens_per_second": 453637.94561798865, "val/loss_avg_len_2048": 4.31625511668981, "val/perplexity_len_2048": 74.90758222363428, "val/loss_avg_len_1024": 4.339928405715012, "val/perplexity_len_1024": 76.70204771345033, "val/loss_avg_len_512": 4.382818944332655, "val/perplexity_len_512": 80.06340988951868}
33
+ {"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 7465.047058130032, "val/train_update_time": 4558.262438459904, "val/loss": 4.2996819401991555, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.42316870903596, "val/val_tokens_per_second": 452981.25010196504, "val/loss_avg_len_2048": 4.2996819401991555, "val/perplexity_len_2048": 73.67635648530485, "val/loss_avg_len_1024": 4.324086923091021, "val/perplexity_len_1024": 75.49654722507492, "val/loss_avg_len_512": 4.368286226595659, "val/perplexity_len_512": 78.90828483582929}
34
+ {"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 7693.75585325103, "val/train_update_time": 4696.292382065032, "val/loss": 4.283236857734924, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.48378002701793, "val/val_tokens_per_second": 452677.8168172194, "val/loss_avg_len_2048": 4.283236857734924, "val/perplexity_len_2048": 72.47465088349, "val/loss_avg_len_1024": 4.308458720062673, "val/perplexity_len_1024": 74.32584368113129, "val/loss_avg_len_512": 4.3540999811033725, "val/perplexity_len_512": 77.7967752505338}
35
+ {"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 7922.504303377005, "val/train_update_time": 4834.2961148990435, "val/loss": 4.267898958559893, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.49312097602524, "val/val_tokens_per_second": 452631.0901670827, "val/loss_avg_len_2048": 4.267898958559893, "val/perplexity_len_2048": 71.371523450084, "val/loss_avg_len_1024": 4.29405301307696, "val/perplexity_len_1024": 73.26280266811683, "val/loss_avg_len_512": 4.3412228272167965, "val/perplexity_len_512": 76.80139677915308}
36
+ {"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 8151.724266513018, "val/train_update_time": 4972.329616118164, "val/loss": 4.25385695036254, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.50966985698324, "val/val_tokens_per_second": 452548.33063386485, "val/loss_avg_len_2048": 4.25385695036254, "val/perplexity_len_2048": 70.3763275596729, "val/loss_avg_len_1024": 4.280959435730102, "val/perplexity_len_1024": 72.30978332653642, "val/loss_avg_len_512": 4.3297005797375, "val/perplexity_len_512": 75.92155071492496}
37
+ {"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 8380.818926771986, "val/train_update_time": 5110.374592272972, "val/loss": 4.241492192271679, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.75462737603812, "val/val_tokens_per_second": 451326.8489361309, "val/loss_avg_len_2048": 4.241492192271679, "val/perplexity_len_2048": 69.51149901038498, "val/loss_avg_len_1024": 4.26916799461185, "val/perplexity_len_1024": 71.46215398096777, "val/loss_avg_len_512": 4.318984106020722, "val/perplexity_len_512": 75.11228340295357}
38
+ {"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 8610.333553528006, "val/train_update_time": 5248.407137244998, "val/loss": 4.230574601543066, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.34580848604674, "val/val_tokens_per_second": 453369.1234422456, "val/loss_avg_len_2048": 4.230574601543066, "val/perplexity_len_2048": 68.75672854774076, "val/loss_avg_len_1024": 4.259278581639892, "val/perplexity_len_1024": 70.75891825403492, "val/loss_avg_len_512": 4.310739070640505, "val/perplexity_len_512": 74.49552605583897}
39
+ {"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 8839.131734684983, "val/train_update_time": 5386.445536848798, "val/loss": 4.220437906114012, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.38471865397878, "val/val_tokens_per_second": 453173.9503090982, "val/loss_avg_len_2048": 4.220437906114012, "val/perplexity_len_2048": 68.06328309221031, "val/loss_avg_len_1024": 4.2494930887183635, "val/perplexity_len_1024": 70.0698841278538, "val/loss_avg_len_512": 4.301655115112848, "val/perplexity_len_512": 73.82187634450761}
40
+ {"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 9067.837796505017, "val/train_update_time": 5524.4823192786425, "val/loss": 4.212082446529414, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.44228035700507, "val/val_tokens_per_second": 452885.5291830056, "val/loss_avg_len_2048": 4.212082446529414, "val/perplexity_len_2048": 67.49695235274098, "val/loss_avg_len_1024": 4.241754451114219, "val/perplexity_len_1024": 69.52973140635773, "val/loss_avg_len_512": 4.295026430321672, "val/perplexity_len_512": 73.33415266465448}
41
+ {"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 9296.993160478014, "val/train_update_time": 5662.513914024632, "val/loss": 4.20452369724242, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.35833089798689, "val/val_tokens_per_second": 453306.29276721796, "val/loss_avg_len_2048": 4.20452369724242, "val/perplexity_len_2048": 66.9886831719058, "val/loss_avg_len_1024": 4.23449458040651, "val/perplexity_len_1024": 69.02678242730825, "val/loss_avg_len_512": 4.288312720157765, "val/perplexity_len_512": 72.84345745438628}
42
+ {"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 9525.670015413023, "val/train_update_time": 5800.553074025724, "val/loss": 4.198343608177501, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.72886659303913, "val/val_tokens_per_second": 451454.99484441394, "val/loss_avg_len_2048": 4.198343608177501, "val/perplexity_len_2048": 66.57596377846592, "val/loss_avg_len_1024": 4.228689314186201, "val/perplexity_len_1024": 68.62722447126744, "val/loss_avg_len_512": 4.283119718784839, "val/perplexity_len_512": 72.46616177619002}
43
+ {"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 9754.735574804014, "val/train_update_time": 5938.594816011784, "val/loss": 4.193367760937754, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.7492567790323, "val/val_tokens_per_second": 451353.5587375064, "val/loss_avg_len_2048": 4.193367760937754, "val/perplexity_len_2048": 66.24551476656741, "val/loss_avg_len_1024": 4.224034787101439, "val/perplexity_len_1024": 68.30853943562786, "val/loss_avg_len_512": 4.2790641182546505, "val/perplexity_len_512": 72.17286312516411}
44
+ {"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 9983.788254795014, "val/train_update_time": 6076.624669670709, "val/loss": 4.189421963141696, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.6789614429581, "val/val_tokens_per_second": 451703.4530194308, "val/loss_avg_len_2048": 4.189421963141696, "val/perplexity_len_2048": 65.98463838160738, "val/loss_avg_len_1024": 4.22020412462051, "val/perplexity_len_1024": 68.04737301604953, "val/loss_avg_len_512": 4.2755115066579545, "val/perplexity_len_512": 71.9169158844202}
45
+ {"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 10212.780399380019, "val/train_update_time": 6214.659892122727, "val/loss": 4.186491362652555, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.99573383899406, "val/val_tokens_per_second": 450130.99265152105, "val/loss_avg_len_2048": 4.186491362652555, "val/perplexity_len_2048": 65.79154684336461, "val/loss_avg_len_1024": 4.217514740810637, "val/perplexity_len_1024": 67.86461337831629, "val/loss_avg_len_512": 4.273214724269416, "val/perplexity_len_512": 71.75192792183111}
46
+ {"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 10442.503988512035, "val/train_update_time": 6352.692734938697, "val/loss": 4.184350719433324, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.35145655297674, "val/val_tokens_per_second": 453340.7823479136, "val/loss_avg_len_2048": 4.184350719433324, "val/perplexity_len_2048": 65.65086124728785, "val/loss_avg_len_1024": 4.215543735674023, "val/perplexity_len_1024": 67.73098361249181, "val/loss_avg_len_512": 4.271550920667592, "val/perplexity_len_512": 71.63264606402603}
47
+ {"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 10671.172117165988, "val/train_update_time": 6490.743322437804, "val/loss": 4.183023557277815, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.35044766304782, "val/val_tokens_per_second": 453345.84453588846, "val/loss_avg_len_2048": 4.183023557277815, "val/perplexity_len_2048": 65.56378970057509, "val/loss_avg_len_1024": 4.214286808201578, "val/perplexity_len_1024": 67.64590415900594, "val/loss_avg_len_512": 4.270415190260951, "val/perplexity_len_512": 71.5513368711842}
48
+ {"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 10899.877774502034, "val/train_update_time": 6628.827098570764, "val/loss": 4.18224084139755, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.3038979700068, "val/val_tokens_per_second": 453579.5344471653, "val/loss_avg_len_2048": 4.18224084139755, "val/perplexity_len_2048": 65.51249195960324, "val/loss_avg_len_1024": 4.213500716115302, "val/perplexity_len_1024": 67.59274914418302, "val/loss_avg_len_512": 4.269666781060863, "val/perplexity_len_512": 71.4978072259293}
49
+ {"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 11128.507790005999, "val/train_update_time": 6766.880580478697, "val/loss": 4.1819093990348515, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.11226414598059, "val/val_tokens_per_second": 454544.1221368645, "val/loss_avg_len_2048": 4.1819093990348515, "val/perplexity_len_2048": 65.49078194249032, "val/loss_avg_len_1024": 4.213224817466876, "val/perplexity_len_1024": 67.57410296839636, "val/loss_avg_len_512": 4.269427753666788, "val/perplexity_len_512": 71.48071933370453}
metrics/npz/train_eval/step-000000104857600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b0411bfacac5fd7ddc541112213c8a6406326c78d14ee0ab7286ce25f70005a
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9689eac5628c24835fc067a08c12aec111f39bde034cf3fd21c7668771f4d7db
3
  size 20540
metrics/npz/train_eval/step-000000209715200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:818c639b2f14c5f6adbc3d6da64c5cf0b3238ed0d2521a265dfe59f628b76145
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ec2247a03177429e55dd2b9ceda3d231cc2e0960b9395ad6b32fc0d519d6741
3
  size 20540
metrics/npz/train_eval/step-000000314572800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:08fbd679b6aef141a8dfab41fd0574e0832b2a1d1dccf1e007b60cbd5da1718c
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91485872829a42b446fc33636ca81aaf924457b2225ad8734e1a014600c82608
3
  size 20540
metrics/npz/train_eval/step-000000419430400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf9ca3539d451112910ac61b54f796b007d7fa0cf648d3e460d2806c590f90da
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:084a127f64a422e079a1e1cb823e16982b71cfde6efe327926ff8443a7d1103f
3
  size 20540
metrics/npz/train_eval/step-000000524288000.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2190fe91c1097b60aa4a844f04e94fe9262376dbef9400231362b8d7e117f4b1
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:093682b0a72ed9596ee5ad8f1d9c072dc8bfd1109b841178fffc0079f6da747d
3
  size 20540
metrics/npz/train_eval/step-000000629145600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a72d6d74f41c49d393e329546b3f029f9e716dece995ff5cb6d6cb1e255b2d57
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55cc048b994dd7b6f840628704ea5078a41b109fc5b316dd01e86db1354d68c1
3
  size 20540
metrics/npz/train_eval/step-000000734003200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:db040ef998c8d4dec18925c3fefd45301c5ef5af7ac8968b844ce0ebde51a0ed
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb0bd6cd6832f911c8cb316614d97a5a78e3ccf57f81bb8661eedae16a00d3a5
3
  size 20540
metrics/npz/train_eval/step-000000838860800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e95dd778dbf9c2fe759be7766d728b1dd2d9eee4e9fdba2002497f0e0eebbc5
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3beb5c9d3934a6b0b781e5f1379dde43a64d949bb7c2190f5e290f31560510ee
3
  size 20540
metrics/npz/train_eval/step-000000943718400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:84ac5ae2cd66fac8d310e43493094056cdfaed647092e7c92e4db764f24e3b2f
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86053e422f63b1997d274b16a3f83ac7569fa79aa471be938b2586c36625db40
3
  size 20540
metrics/npz/train_eval/step-000001048576000.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a5a1d5580f26f423ab74b06d9d413f63cc0be2088366ee9aef2440d14ee70dc7
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af2d29d0306bec224d18de5a45f0464691fd99709d9f57efcb60b83827a2a070
3
  size 20540
metrics/npz/train_eval/step-000001153433600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c46dca58e16e3003f2bdec52117adbff020fa08ca56aa1178e3b5bcf00dddba1
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32694215f459813a770d1887234917ad61c859451ab2759652a5239be3709c55
3
  size 20540
metrics/npz/train_eval/step-000001258291200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:22987225fdce2bc77dd4032d46a581cfa0c34a1802c4f4618c88b71dd37edc3b
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43801118158369cc7393c60089d61f2bfcebf85ed25ad1584c6f61643f972761
3
  size 20540
metrics/npz/train_eval/step-000001363148800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:169d1d55bbdd47aa1e11030c56bfeb0d7cba9014c91533e95a7dbd2952450008
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09815e89a3add650dfbb04ce74b556c9a72f6fa9f26a0a8ee7c66e5f57f16ab3
3
  size 20540
metrics/npz/train_eval/step-000001468006400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:69850547fa0886a0a8b6266de91fc50f20647ee4d7b2f7b33ba6e3b50ba9310d
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b329cd54e4297337c8d58c891196e499b4dd37f393f24917c51974a20ddeda4c
3
  size 20540
metrics/npz/train_eval/step-000001572864000.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e0287eeb6284e0acc28e88bf55f564b44532418098077578f6e68fb39e941cb2
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d8780b119443b7964d4fbea7e1aeb86d8b6e3042dbd5d9897f0aeed4a3904db
3
  size 20540
metrics/npz/train_eval/step-000001677721600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:96e4ed572a0befdee05e636d7246599108af4e61cace986daf371778556726bf
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93492038097cdd944462fc5be8ff53d759259b491d011c3186149fb4733afcc5
3
  size 20540
metrics/npz/train_eval/step-000001782579200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0f5ab63584eb93d8f8f8532442ce94271da31d6856e7ce102c28f5642e4d332f
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85b31700063c6174b8eba53f2a2f4092996bed090528645ca7f551df1d469b5d
3
  size 20540
metrics/npz/train_eval/step-000001887436800.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f8553373160927c66c07b8aab6d901e46a4f66e8f813838a2200151e506155c9
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c088500cbce0377fa6b3e8ae84045fad4d537c103ad6b9f68f22caec46d05b50
3
  size 20540
metrics/npz/train_eval/step-000001992294400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3f7231b68c0b2317414523daf7f4263e4d2a0e0d026399f735e4c4756b54d21b
3
  size 20540
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e567eae3a2690aad2c2995b320aa57745942585bb3dfffd8b8b12597b5d42990
3
  size 20540
metrics/npz/val/step-000000041943040.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c51b646717ef92f454aec29bd952e3ddb0ff82fbee6a76065bd894f1981c6e0
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:265ebd6b9bde03fad19b9aae2da0ee92c37b0e2b3dbc1950a6c290ebb70dfbe6
3
  size 21142
metrics/npz/val/step-000000083886080.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:14af7ec8115b0a0ecea98713115058c22897d1d514760f5635dfdacaeb5b2582
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9039a3eed82c460d492611be8fcc27793c9fbe43961bbb8927c70bd1cba92fcd
3
  size 21142
metrics/npz/val/step-000000125829120.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7395994e6d20b101de8d3704061d19ec9f8370dca3eb921026e60e7976f8ba30
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f70311d41a03c97e86ff8121f0cc6cd7718d0f5d2bf971e20c9c948788c426a
3
  size 21142
metrics/npz/val/step-000000167772160.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bed7f7befde116cee3ab6189fbde0bf4095cbad7e2b6dc84cbdd225379051af5
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45e91b34732c55786979df2ad3607a299a35b28500234a086cb9872a03256017
3
  size 21142
metrics/npz/val/step-000000209715200.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:69524026ec7c15467a1afbab51989a97f08acae504c19f8dc9015662a643d274
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97d504910bf1a7e842b7815c694e883f77922067729b8db97ce8c12fa8eb1df3
3
  size 21142
metrics/npz/val/step-000000251658240.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e05da9235305c3b58188a60b3a54be0badf5c9bc2d0d6f1022c13cb5f0bb8fde
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:099974346431907422f6336139eae6e7c28fe7d2a6d239fac706344d308f83da
3
  size 21142
metrics/npz/val/step-000000293601280.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3fac5a45dab6949134d90f984062376c2e82ca3a182ab00405d4b3c92230e5ce
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c443f47445e603996baef79dc33f7a8ba7ddc97ef5768399e25434f444d38842
3
  size 21142
metrics/npz/val/step-000000335544320.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fbc136acb87b980ec8ea56b44c57fc05f5daef8ea8c92508dd32e2dc6d2fef37
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efb50ab44c317467b2371c4780ae8dd68fbebecee528f2962ef91d0a00261816
3
  size 21142
metrics/npz/val/step-000000377487360.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d748befb8db8aa12be1c2178799a7b4b10f3f577afb1c589f45c67c117176606
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb0bb42606b7b156ace7a718b7e9f7e33c9dd0325c36727a0cc691668c841e34
3
  size 21142
metrics/npz/val/step-000000419430400.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:baf269266a21e1d0df49a77bf22c6baac7ee1f3eb8687e62594418749e63f7ae
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc6d937e8cb1de87260bdd38940718e7a857f78649c680ec4c5c5f6ec33a611e
3
  size 21142
metrics/npz/val/step-000000461373440.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:61d6f3bf8a48c6ae13ceb908decc179c5fa66cf12c06a44072c335a848c988d5
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61a56135666378ed61b59377e19614bf2f2be90cd92ab879fc59ef1293db99b6
3
  size 21142
metrics/npz/val/step-000000503316480.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e23144390aaa5d647f838b9abdd807897ea26140191aff999595c471d038b202
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:015ceb1f82664e7252f6aba9eea58abae89d0e18a3e89d6f3f397eb093f65078
3
  size 21142
metrics/npz/val/step-000000545259520.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:68c80ec0e7634f240900a46686eb9c4d5c13c8fdae4b955fb4ef3c338fda87e8
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29fe2be6d0a8e489c3dce27cfdd8321de5f7ec273b58ebc6a01e99c06b5b593c
3
  size 21142
metrics/npz/val/step-000000587202560.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cb35538645b4627d3701429b0dd54c92adbacd0c7038c8ac02dcfc721b8e6f5c
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f03ae34480069799b507808a22e85ca3fb0927d25e49edcf2f3ea7547f200c0e
3
  size 21142
metrics/npz/val/step-000000629145600.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16fe72afba4903609c1619a7178029c2290d4f256034c26d1ae4c9b4c57320d1
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91b74904fd335c0978272b1f7c08464c1ce99cd005f7fac9f67794f1406de39c
3
  size 21142
metrics/npz/val/step-000000671088640.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4daf48ce2898d93b3d606bbcdc3f2a70b01f349c497367c4d338e8fb61f8b4ab
3
  size 21142
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25e23b258f6f1656e3498984ef6a47393958cc5cc5f2b28bc1930464dd1fea41
3
  size 21142