Lanni-ni committed on
Commit
0fe45e5
·
verified ·
1 Parent(s): a2f239e

add remote code + model files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .hydra/config.yaml +102 -0
  2. .hydra/hydra.yaml +146 -0
  3. .hydra/overrides.yaml +16 -0
  4. __init__.py +1 -0
  5. __pycache__/__init__.cpython-310.pyc +0 -0
  6. __pycache__/configuration_transformer.cpython-310.pyc +0 -0
  7. __pycache__/modeling_transformer.cpython-310.pyc +0 -0
  8. checkpoints/step-000000209715200.pt +3 -0
  9. checkpoints/step-000000209715200.pt.done +0 -0
  10. checkpoints/step-000000209715200.pt.keep +0 -0
  11. checkpoints/step-000000419430400.pt +3 -0
  12. checkpoints/step-000000419430400.pt.done +0 -0
  13. checkpoints/step-000000419430400.pt.keep +0 -0
  14. checkpoints/step-000000629145600.pt +3 -0
  15. checkpoints/step-000000629145600.pt.done +0 -0
  16. checkpoints/step-000000629145600.pt.keep +0 -0
  17. checkpoints/step-000000838860800.pt +3 -0
  18. checkpoints/step-000000838860800.pt.done +0 -0
  19. checkpoints/step-000000838860800.pt.keep +0 -0
  20. checkpoints/step-000001048576000.pt +3 -0
  21. checkpoints/step-000001048576000.pt.done +0 -0
  22. checkpoints/step-000001048576000.pt.keep +0 -0
  23. checkpoints/step-000001258291200.pt +3 -0
  24. checkpoints/step-000001258291200.pt.done +0 -0
  25. checkpoints/step-000001258291200.pt.keep +0 -0
  26. checkpoints/step-000001468006400.pt +3 -0
  27. checkpoints/step-000001468006400.pt.done +0 -0
  28. checkpoints/step-000001468006400.pt.keep +0 -0
  29. checkpoints/step-000001677721600.pt +3 -0
  30. checkpoints/step-000001677721600.pt.done +0 -0
  31. checkpoints/step-000001677721600.pt.keep +0 -0
  32. checkpoints/step-000001887436800.pt +3 -0
  33. checkpoints/step-000001887436800.pt.done +0 -0
  34. checkpoints/step-000001887436800.pt.keep +0 -0
  35. config.yaml +102 -0
  36. configuration_transformer.py +67 -0
  37. decay_params.txt +23 -0
  38. logs/2025-10-17_04-14-45.log +258 -0
  39. metrics/jsonlines/checkpoint.jsonl +9 -0
  40. metrics/jsonlines/model_info.jsonl +1 -0
  41. metrics/jsonlines/norm.jsonl +0 -0
  42. metrics/jsonlines/resume.jsonl +1 -0
  43. metrics/jsonlines/throughput.jsonl +0 -0
  44. metrics/jsonlines/train.jsonl +98 -0
  45. metrics/jsonlines/train_data_info.jsonl +1 -0
  46. metrics/jsonlines/train_eval.jsonl +19 -0
  47. metrics/jsonlines/val.jsonl +49 -0
  48. metrics/jsonlines/val_data_info.jsonl +1 -0
  49. metrics/npz/train_eval/step-000000104857600.npz +3 -0
  50. metrics/npz/train_eval/step-000000209715200.npz +3 -0
.hydra/config.yaml ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ _target_: forgetting_transformer.model.forgetting_transformer.modeling_forgetting_transformer.ForgettingTransformerForCausalLM
3
+ config:
4
+ _target_: forgetting_transformer.model.forgetting_transformer.configuration_forgetting_transformer.ForgettingTransformerConfig
5
+ vocab_size: ???
6
+ hidden_size: 256
7
+ hidden_ratio: 4.0
8
+ intermediate_size: null
9
+ num_hidden_layers: 3
10
+ num_heads: 4
11
+ num_kv_heads: null
12
+ hidden_act: swish
13
+ window_size: null
14
+ max_position_embeddings: null
15
+ initializer_range: 0.02
16
+ elementwise_affine: true
17
+ norm_eps: 1.0e-06
18
+ use_cache: true
19
+ pad_token_id: null
20
+ bos_token_id: null
21
+ eos_token_id: null
22
+ tie_word_embeddings: false
23
+ attention_bias: false
24
+ fuse_norm: true
25
+ fuse_cross_entropy: true
26
+ rope_base: 500000.0
27
+ use_rope: false
28
+ use_output_gate: false
29
+ ogate_act: sigmoid
30
+ fgate_type: full
31
+ fgate_bias_init: false
32
+ decay_time_min: null
33
+ decay_time_max: null
34
+ use_output_norm: false
35
+ qk_norm: false
36
+ qk_norm_share_param_across_head: false
37
+ use_k_shift: false
38
+ use_v_shift: false
39
+ optimizer:
40
+ _target_: torch.optim.AdamW
41
+ lr: 0.001
42
+ betas:
43
+ - 0.9
44
+ - 0.95
45
+ weight_decay: 0.1
46
+ schedule:
47
+ _target_: forgetting_transformer.schedule.warmup_cosine_decay_schedule
48
+ init_value: 0.0
49
+ peak_value: ${optimizer.lr}
50
+ warmup_steps: 20971520
51
+ decay_steps: ${train.max_tokens}
52
+ end_value: 0.0
53
+ datamodule:
54
+ _target_: forgetting_transformer.datamodule.npy.NpyDataModule
55
+ data_path: ${data_dir}
56
+ rank: ???
57
+ world_size: ???
58
+ train_batch_len: 2048
59
+ train_batch_size: 1024
60
+ train_num_workers: 0
61
+ eval_tokens: 2147483648
62
+ eval_batch_len: 2048
63
+ eval_local_batch_size: 1
64
+ eval_num_workers: 0
65
+ strategy:
66
+ _target_: lightning.fabric.strategies.SingleDeviceStrategy
67
+ device: cuda:0
68
+ exp: forgetting_gate_3_4_256
69
+ tag: forgetting_gate_3_4_256
70
+ seed: 42
71
+ hf_load_dir: null
72
+ hf_save_dir: null
73
+ hf_load_step: null
74
+ output_dir: ./forgetting_gate_3_4_256/
75
+ data_dir: /workspace/forgetting-transformer/data
76
+ resume: false
77
+ fork_dir: null
78
+ fork_step: null
79
+ log_interval: 20971520
80
+ eval_interval: 41943040
81
+ final_eval: true
82
+ skip_eval: false
83
+ checkpoint_interval: 209715200
84
+ train_eval_interval: 104857600
85
+ checkpoint_keep_interval: 209715200
86
+ fabric:
87
+ devices: 1
88
+ precision: 16-mixed
89
+ train:
90
+ max_tokens: 2097152000
91
+ grad_acc_tokens: 32768
92
+ max_grad_norm: 1.0
93
+ gradient_checkpointing: true
94
+ bias_weight_decay: false
95
+ normalization_weight_decay: false
96
+ conv_weight_decay: true
97
+ eval:
98
+ min_val_length: 512
99
+ wandb:
100
+ project: forgetting-transformer
101
+ mode: online
102
+ log_dir: ./output/wandb
.hydra/hydra.yaml ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ${output_dir}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ root: null
71
+ disable_existing_loggers: false
72
+ job_logging:
73
+ version: 1
74
+ root: null
75
+ disable_existing_loggers: false
76
+ env: {}
77
+ mode: RUN
78
+ searchpath: []
79
+ callbacks: {}
80
+ output_subdir: .hydra
81
+ overrides:
82
+ hydra:
83
+ - hydra.mode=RUN
84
+ task:
85
+ - +experiment/pile/forgetting_transformer=forgetting_gate_3_4_256
86
+ - strategy=single_device
87
+ - datamodule=npy
88
+ - schedule=warmup_cosine
89
+ - optimizer=adamw
90
+ - model=forgetting_transformer
91
+ - data_dir=/workspace/forgetting-transformer/data
92
+ - fabric.devices=1
93
+ - fabric.precision=16-mixed
94
+ - seed=42
95
+ - exp=forgetting_gate_3_4_256
96
+ - tag=forgetting_gate_3_4_256
97
+ - output_dir=./forgetting_gate_3_4_256/
98
+ - wandb.log_dir=./output/wandb
99
+ - wandb.mode=online
100
+ - resume=false
101
+ job:
102
+ name: train
103
+ chdir: null
104
+ override_dirname: +experiment/pile/forgetting_transformer=forgetting_gate_3_4_256,data_dir=/workspace/forgetting-transformer/data,datamodule=npy,exp=forgetting_gate_3_4_256,fabric.devices=1,fabric.precision=16-mixed,model=forgetting_transformer,optimizer=adamw,output_dir=./forgetting_gate_3_4_256/,resume=false,schedule=warmup_cosine,seed=42,strategy=single_device,tag=forgetting_gate_3_4_256,wandb.log_dir=./output/wandb,wandb.mode=online
105
+ id: ???
106
+ num: ???
107
+ config_name: config
108
+ env_set: {}
109
+ env_copy: []
110
+ config:
111
+ override_dirname:
112
+ kv_sep: '='
113
+ item_sep: ','
114
+ exclude_keys: []
115
+ runtime:
116
+ version: 1.3.2
117
+ version_base: '1.3'
118
+ cwd: /workspace/forgetting-transformer
119
+ config_sources:
120
+ - path: hydra.conf
121
+ schema: pkg
122
+ provider: hydra
123
+ - path: /workspace/forgetting-transformer/configs
124
+ schema: file
125
+ provider: main
126
+ - path: ''
127
+ schema: structured
128
+ provider: schema
129
+ output_dir: /workspace/forgetting-transformer/forgetting_gate_3_4_256
130
+ choices:
131
+ experiment/pile/forgetting_transformer: forgetting_gate_3_4_256
132
+ strategy: single_device
133
+ datamodule: npy
134
+ schedule: warmup_cosine
135
+ optimizer: adamw
136
+ model: forgetting_transformer
137
+ hydra/env: default
138
+ hydra/callbacks: null
139
+ hydra/job_logging: none
140
+ hydra/hydra_logging: none
141
+ hydra/hydra_help: default
142
+ hydra/help: default
143
+ hydra/sweeper: basic
144
+ hydra/launcher: basic
145
+ hydra/output: default
146
+ verbose: false
.hydra/overrides.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - +experiment/pile/forgetting_transformer=forgetting_gate_3_4_256
2
+ - strategy=single_device
3
+ - datamodule=npy
4
+ - schedule=warmup_cosine
5
+ - optimizer=adamw
6
+ - model=forgetting_transformer
7
+ - data_dir=/workspace/forgetting-transformer/data
8
+ - fabric.devices=1
9
+ - fabric.precision=16-mixed
10
+ - seed=42
11
+ - exp=forgetting_gate_3_4_256
12
+ - tag=forgetting_gate_3_4_256
13
+ - output_dir=./forgetting_gate_3_4_256/
14
+ - wandb.log_dir=./output/wandb
15
+ - wandb.mode=online
16
+ - resume=false
__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # for HF remote code
__pycache__/__init__.cpython-310.pyc ADDED
Binary file (549 Bytes). View file
 
__pycache__/configuration_transformer.cpython-310.pyc ADDED
Binary file (1.99 kB). View file
 
__pycache__/modeling_transformer.cpython-310.pyc ADDED
Binary file (15.2 kB). View file
 
checkpoints/step-000000209715200.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1274f2e1df87707a9b724935abd53c5efca86ffa3b150421bffbec330569053
3
+ size 339689578
checkpoints/step-000000209715200.pt.done ADDED
File without changes
checkpoints/step-000000209715200.pt.keep ADDED
File without changes
checkpoints/step-000000419430400.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebd148ce1a2007d6c6e91c2b5c75e9b80c75f90f6ac812686b99efb145419ade
3
+ size 339689578
checkpoints/step-000000419430400.pt.done ADDED
File without changes
checkpoints/step-000000419430400.pt.keep ADDED
File without changes
checkpoints/step-000000629145600.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bb1c225d185f64f028c41566d892df8a989a4c3731e0d439b71af3f216f2b36
3
+ size 339689578
checkpoints/step-000000629145600.pt.done ADDED
File without changes
checkpoints/step-000000629145600.pt.keep ADDED
File without changes
checkpoints/step-000000838860800.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e64c675b89a0495f021baca1720c2992857a1a64ebdd71492e9c07926508c53d
3
+ size 339689578
checkpoints/step-000000838860800.pt.done ADDED
File without changes
checkpoints/step-000000838860800.pt.keep ADDED
File without changes
checkpoints/step-000001048576000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c425b8a14caa8aea4ab488b3e113a6bc580906bd80d4d6f6e049fa625b05f22e
3
+ size 339689578
checkpoints/step-000001048576000.pt.done ADDED
File without changes
checkpoints/step-000001048576000.pt.keep ADDED
File without changes
checkpoints/step-000001258291200.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc245229fd6b46323707130a8ab7fcc6f03beebf1a0167808f34c6bf81601b8e
3
+ size 339689578
checkpoints/step-000001258291200.pt.done ADDED
File without changes
checkpoints/step-000001258291200.pt.keep ADDED
File without changes
checkpoints/step-000001468006400.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47023f6836a59ce0cef150fd3af7688ed1f8c0ff2e133e2992d4517588084cef
3
+ size 339689578
checkpoints/step-000001468006400.pt.done ADDED
File without changes
checkpoints/step-000001468006400.pt.keep ADDED
File without changes
checkpoints/step-000001677721600.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5704653c2cd96b6de81599126d702ff223024acbc2d6ce4f1a42b1573aa2e1ff
3
+ size 339689578
checkpoints/step-000001677721600.pt.done ADDED
File without changes
checkpoints/step-000001677721600.pt.keep ADDED
File without changes
checkpoints/step-000001887436800.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:565c27ce73fbefef4d89083102e930307de2a242fcf95ab7bcd0b738d52fe3fc
3
+ size 339689578
checkpoints/step-000001887436800.pt.done ADDED
File without changes
checkpoints/step-000001887436800.pt.keep ADDED
File without changes
config.yaml ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ _target_: forgetting_transformer.model.forgetting_transformer.modeling_forgetting_transformer.ForgettingTransformerForCausalLM
3
+ config:
4
+ _target_: forgetting_transformer.model.forgetting_transformer.configuration_forgetting_transformer.ForgettingTransformerConfig
5
+ vocab_size: ???
6
+ hidden_size: 256
7
+ hidden_ratio: 4.0
8
+ intermediate_size: null
9
+ num_hidden_layers: 3
10
+ num_heads: 4
11
+ num_kv_heads: null
12
+ hidden_act: swish
13
+ window_size: null
14
+ max_position_embeddings: null
15
+ initializer_range: 0.02
16
+ elementwise_affine: true
17
+ norm_eps: 1.0e-06
18
+ use_cache: true
19
+ pad_token_id: null
20
+ bos_token_id: null
21
+ eos_token_id: null
22
+ tie_word_embeddings: false
23
+ attention_bias: false
24
+ fuse_norm: true
25
+ fuse_cross_entropy: true
26
+ rope_base: 500000.0
27
+ use_rope: false
28
+ use_output_gate: false
29
+ ogate_act: sigmoid
30
+ fgate_type: full
31
+ fgate_bias_init: false
32
+ decay_time_min: null
33
+ decay_time_max: null
34
+ use_output_norm: false
35
+ qk_norm: false
36
+ qk_norm_share_param_across_head: false
37
+ use_k_shift: false
38
+ use_v_shift: false
39
+ optimizer:
40
+ _target_: torch.optim.AdamW
41
+ lr: 0.001
42
+ betas:
43
+ - 0.9
44
+ - 0.95
45
+ weight_decay: 0.1
46
+ schedule:
47
+ _target_: forgetting_transformer.schedule.warmup_cosine_decay_schedule
48
+ init_value: 0.0
49
+ peak_value: 0.001
50
+ warmup_steps: 20971520
51
+ decay_steps: 2097152000
52
+ end_value: 0.0
53
+ datamodule:
54
+ _target_: forgetting_transformer.datamodule.npy.NpyDataModule
55
+ data_path: /workspace/forgetting-transformer/data
56
+ rank: ???
57
+ world_size: ???
58
+ train_batch_len: 2048
59
+ train_batch_size: 1024
60
+ train_num_workers: 0
61
+ eval_tokens: 2147483648
62
+ eval_batch_len: 2048
63
+ eval_local_batch_size: 1
64
+ eval_num_workers: 0
65
+ strategy:
66
+ _target_: lightning.fabric.strategies.SingleDeviceStrategy
67
+ device: cuda:0
68
+ exp: forgetting_gate_3_4_256
69
+ tag: forgetting_gate_3_4_256
70
+ seed: 42
71
+ hf_load_dir: null
72
+ hf_save_dir: null
73
+ hf_load_step: null
74
+ output_dir: /workspace/forgetting-transformer/forgetting_gate_3_4_256
75
+ data_dir: /workspace/forgetting-transformer/data
76
+ resume: false
77
+ fork_dir: null
78
+ fork_step: null
79
+ log_interval: 20971520
80
+ eval_interval: 41943040
81
+ final_eval: true
82
+ skip_eval: false
83
+ checkpoint_interval: 209715200
84
+ train_eval_interval: 104857600
85
+ checkpoint_keep_interval: 209715200
86
+ fabric:
87
+ devices: 1
88
+ precision: 16-mixed
89
+ train:
90
+ max_tokens: 2097152000
91
+ grad_acc_tokens: 32768
92
+ max_grad_norm: 1.0
93
+ gradient_checkpointing: true
94
+ bias_weight_decay: false
95
+ normalization_weight_decay: false
96
+ conv_weight_decay: true
97
+ eval:
98
+ min_val_length: 512
99
+ wandb:
100
+ project: forgetting-transformer
101
+ mode: online
102
+ log_dir: ./output/wandb
configuration_transformer.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from typing import Optional
4
+
5
+ from transformers.configuration_utils import PretrainedConfig
6
+
7
+
8
+ class TransformerConfig(PretrainedConfig):
9
+
10
+ model_type = 'transformer-project_fox'
11
+ keys_to_ignore_at_inference = ['past_key_values']
12
+
13
+ def __init__(
14
+ self,
15
+ vocab_size: int = 32000,
16
+ hidden_size: int = 2048,
17
+ hidden_ratio: Optional[int] = 4,
18
+ intermediate_size: Optional[int] = None,
19
+ num_hidden_layers: int = 24,
20
+ num_heads: int = 32,
21
+ num_kv_heads: int = None,
22
+ hidden_act: str = "swish",
23
+ window_size: Optional[int] = None,
24
+ max_position_embeddings: int = 2048,
25
+ initializer_range: float = 0.02,
26
+ elementwise_affine: Optional[bool] = True,
27
+ norm_eps: float = 1e-6,
28
+ use_cache: bool = True,
29
+ pad_token_id: int = None,
30
+ bos_token_id: int = 1,
31
+ eos_token_id: int = 2,
32
+ tie_word_embeddings: bool = False,
33
+ attention_bias: bool = False,
34
+ fuse_norm: bool = True,
35
+ fuse_cross_entropy: bool = True,
36
+ rope_base: float = 500000.0,
37
+ use_rope: bool = True,
38
+ **kwargs,
39
+ ):
40
+ self.vocab_size = vocab_size
41
+ self.hidden_size = hidden_size
42
+ self.hidden_ratio = hidden_ratio
43
+ self.intermediate_size = intermediate_size
44
+ self.num_hidden_layers = num_hidden_layers
45
+ self.num_heads = num_heads
46
+ self.num_kv_heads = num_kv_heads
47
+ self.window_size = window_size
48
+ self.max_position_embeddings = max_position_embeddings
49
+
50
+ self.hidden_act = hidden_act
51
+ self.initializer_range = initializer_range
52
+ self.elementwise_affine = elementwise_affine
53
+ self.norm_eps = norm_eps
54
+ self.use_cache = use_cache
55
+ self.attention_bias = attention_bias
56
+ self.fuse_cross_entropy = fuse_cross_entropy
57
+ self.fuse_norm = fuse_norm
58
+ self.rope_base = rope_base
59
+ self.use_rope = use_rope
60
+
61
+ super().__init__(
62
+ pad_token_id=pad_token_id,
63
+ bos_token_id=bos_token_id,
64
+ eos_token_id=eos_token_id,
65
+ tie_word_embeddings=tie_word_embeddings,
66
+ **kwargs,
67
+ )
decay_params.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _forward_module.model.embeddings.weight
2
+ _forward_module.model.layers.0.attn.q_proj.weight
3
+ _forward_module.model.layers.0.attn.k_proj.weight
4
+ _forward_module.model.layers.0.attn.v_proj.weight
5
+ _forward_module.model.layers.0.attn.o_proj.weight
6
+ _forward_module.model.layers.0.attn.fgate_proj.weight
7
+ _forward_module.model.layers.0.mlp.gate_proj.weight
8
+ _forward_module.model.layers.0.mlp.down_proj.weight
9
+ _forward_module.model.layers.1.attn.q_proj.weight
10
+ _forward_module.model.layers.1.attn.k_proj.weight
11
+ _forward_module.model.layers.1.attn.v_proj.weight
12
+ _forward_module.model.layers.1.attn.o_proj.weight
13
+ _forward_module.model.layers.1.attn.fgate_proj.weight
14
+ _forward_module.model.layers.1.mlp.gate_proj.weight
15
+ _forward_module.model.layers.1.mlp.down_proj.weight
16
+ _forward_module.model.layers.2.attn.q_proj.weight
17
+ _forward_module.model.layers.2.attn.k_proj.weight
18
+ _forward_module.model.layers.2.attn.v_proj.weight
19
+ _forward_module.model.layers.2.attn.o_proj.weight
20
+ _forward_module.model.layers.2.attn.fgate_proj.weight
21
+ _forward_module.model.layers.2.mlp.gate_proj.weight
22
+ _forward_module.model.layers.2.mlp.down_proj.weight
23
+ _forward_module.lm_head.weight
logs/2025-10-17_04-14-45.log ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2025-10-17 04:14:45][train:372][INFO] All outputs will be saved to `/workspace/forgetting-transformer/forgetting_gate_3_4_256`
2
+ [2025-10-17 04:14:45][train:375][INFO] Configuration:
3
+ [2025-10-17 04:14:45][train:380][INFO] Configuration saved to /workspace/forgetting-transformer/forgetting_gate_3_4_256/config.yaml.
4
+ [2025-10-17 04:14:45][train:387][INFO] creating datamodule
5
+ [2025-10-17 04:14:45][train:419][INFO] creating model
6
+ [2025-10-17 04:14:46][train:440][INFO] creating optimizer
7
+ [2025-10-17 04:14:46][checkpoint:39][INFO] Not resuming. Deleting existing checkpoints...
8
+ [2025-10-17 04:14:46][logger:256][INFO] Setting up wandb logger...
9
+ [2025-10-17 04:14:46][logger:272][INFO] Not resuming. Creating a new wandb run.
10
+ [2025-10-17 04:14:47][logger:288][INFO] wandb initialized. Run id: fjbe04st
11
+ [2025-10-17 04:14:47][logger:186][INFO] Setting up jsonlines logger...
12
+ [2025-10-17 04:14:47][logger:113][INFO] Setting up npz logger...
13
+ [2025-10-17 04:14:47][logger:171][INFO] [step: 0] [train_data_info/vocab_size: 50277] [train_data_info/global_tokens_per_batch: 2097152] [train_data_info/local_tokens_per_batch: 2097152] [train_data_info/batch_len: 2048] [train_data_info/seq_len: 2048] [train_data_info/total_tokens: 2055208960] [train_data_info/global_batch_size: 1024] [train_data_info/local_batch_size: 1024]
14
+ [2025-10-17 04:14:47][logger:171][INFO] [step: 0] [val_data_info/vocab_size: 50277] [val_data_info/global_tokens_per_batch: 2048] [val_data_info/local_tokens_per_batch: 2048] [val_data_info/batch_len: 2048] [val_data_info/seq_len: 2048] [val_data_info/total_tokens: 2147483648] [val_data_info/global_batch_size: 1] [val_data_info/local_batch_size: 1]
15
+ [2025-10-17 04:14:47][logger:171][INFO] [step: 0] [model_info/total_params: 28302604] [model_info/trainable_params: 28302604] [model_info/embedding_params: 12870912] [model_info/flops_per_token: 0] [model_info/non_embedding_params: 15431692]
16
+ [2025-10-17 04:15:30][utils:57][INFO] [P: 1.00%] [S: 20971520/2097152000] [T: 0:00:43] [ETA: 1:11:03] [loss: 9.743] [tokens/s: 534171.335] [batches/s: 0.255] [MFU: 0.000] [TFLOPS: 0.000]
17
+ [2025-10-17 04:16:09][utils:57][INFO] [P: 2.00%] [S: 41943040/2097152000] [T: 0:01:22] [ETA: 1:07:08] [loss: 8.094] [tokens/s: 534992.468] [batches/s: 0.255] [MFU: 0.000] [TFLOPS: 0.000]
18
+ [2025-10-17 04:16:09][train:194][INFO] Running validation...
19
+ [2025-10-17 04:17:45][logger:171][INFO] [step: 41943040] [val/train_token_count: 41943040] [val/train_batch_count: 20] [val/train_flop_count: 0] [val/train_total_time: 82.206] [val/train_update_time: 81.859] [val/loss: 7.980] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 95.938] [val/val_tokens_per_second: 426941.551] [val/loss_avg_len_2048: 7.980] [val/perplexity_len_2048: 2921.122] [val/loss_avg_len_1024: 7.978] [val/perplexity_len_1024: 2916.720] [val/loss_avg_len_512: 7.978] [val/perplexity_len_512: 2916.645]
20
+ [2025-10-17 04:18:24][utils:57][INFO] [P: 3.00%] [S: 62914560/2097152000] [T: 0:03:37] [ETA: 1:57:05] [loss: 7.451] [tokens/s: 290198.826] [batches/s: 0.138] [MFU: 0.000] [TFLOPS: 0.000]
21
+ [2025-10-17 04:19:03][utils:57][INFO] [P: 4.00%] [S: 83886080/2097152000] [T: 0:04:16] [ETA: 1:42:37] [loss: 7.117] [tokens/s: 328700.885] [batches/s: 0.157] [MFU: 0.000] [TFLOPS: 0.000]
22
+ [2025-10-17 04:19:03][train:194][INFO] Running validation...
23
+ [2025-10-17 04:20:38][logger:171][INFO] [step: 83886080] [val/train_token_count: 83886080] [val/train_batch_count: 40] [val/train_flop_count: 0] [val/train_total_time: 256.552] [val/train_update_time: 159.959] [val/loss: 7.102] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 94.824] [val/val_tokens_per_second: 431959.681] [val/loss_avg_len_2048: 7.102] [val/perplexity_len_2048: 1214.686] [val/loss_avg_len_1024: 7.101] [val/perplexity_len_1024: 1213.226] [val/loss_avg_len_512: 7.102] [val/perplexity_len_512: 1214.125]
24
+ [2025-10-17 04:21:17][utils:57][INFO] [P: 5.00%] [S: 104857600/2097152000] [T: 0:06:30] [ETA: 2:03:39] [loss: 6.883] [tokens/s: 268459.667] [batches/s: 0.128] [MFU: 0.000] [TFLOPS: 0.000]
25
+ [2025-10-17 04:21:17][logger:171][INFO] [step: 104857600] [train_eval/train_token_count: 104857600] [train_eval/train_batch_count: 50] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 390.505] [train_eval/train_update_time: 198.951] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 8.220] [train_eval/perplexity_len_2048: 3716.093] [train_eval/loss_avg_len_1024: 8.221] [train_eval/perplexity_len_1024: 3717.046] [train_eval/loss_avg_len_512: 8.220] [train_eval/perplexity_len_512: 3715.197]
26
+ [2025-10-17 04:21:56][utils:57][INFO] [P: 6.00%] [S: 125829120/2097152000] [T: 0:07:09] [ETA: 1:52:11] [loss: 6.625] [tokens/s: 293264.846] [batches/s: 0.140] [MFU: 0.000] [TFLOPS: 0.000]
27
+ [2025-10-17 04:21:56][train:194][INFO] Running validation...
28
+ [2025-10-17 04:23:31][logger:171][INFO] [step: 125829120] [val/train_token_count: 125829120] [val/train_batch_count: 60] [val/train_flop_count: 0] [val/train_total_time: 429.639] [val/train_update_time: 237.942] [val/loss: 6.616] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 95.019] [val/val_tokens_per_second: 431072.567] [val/loss_avg_len_2048: 6.616] [val/perplexity_len_2048: 747.289] [val/loss_avg_len_1024: 6.616] [val/perplexity_len_1024: 746.927] [val/loss_avg_len_512: 6.619] [val/perplexity_len_512: 748.892]
29
+ [2025-10-17 04:24:10][utils:57][INFO] [P: 7.00%] [S: 146800640/2097152000] [T: 0:09:23] [ETA: 2:04:51] [loss: 6.431] [tokens/s: 260182.953] [batches/s: 0.124] [MFU: 0.000] [TFLOPS: 0.000]
30
+ [2025-10-17 04:24:50][utils:57][INFO] [P: 8.00%] [S: 167772160/2097152000] [T: 0:10:03] [ETA: 1:55:34] [loss: 6.250] [tokens/s: 278307.011] [batches/s: 0.133] [MFU: 0.000] [TFLOPS: 0.000]
31
+ [2025-10-17 04:24:50][train:194][INFO] Running validation...
32
+ [2025-10-17 04:28:25][logger:171][INFO] [step: 167772160] [val/train_token_count: 167772160] [val/train_batch_count: 80] [val/train_flop_count: 0] [val/train_total_time: 603.023] [val/train_update_time: 316.033] [val/loss: 6.220] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 215.606] [val/val_tokens_per_second: 189975.865] [val/loss_avg_len_2048: 6.220] [val/perplexity_len_2048: 502.820] [val/loss_avg_len_1024: 6.221] [val/perplexity_len_1024: 503.087] [val/loss_avg_len_512: 6.225] [val/perplexity_len_512: 505.341]
33
+ [2025-10-17 04:29:30][utils:57][INFO] [P: 9.00%] [S: 188743680/2097152000] [T: 0:14:43] [ETA: 2:28:52] [loss: 6.052] [tokens/s: 213126.762] [batches/s: 0.102] [MFU: 0.000] [TFLOPS: 0.000]
34
+ [2025-10-17 04:30:35][utils:57][INFO] [P: 10.00%] [S: 209715200/2097152000] [T: 0:15:48] [ETA: 2:22:15] [loss: 5.914] [tokens/s: 220704.324] [batches/s: 0.105] [MFU: 0.000] [TFLOPS: 0.000]
35
+ [2025-10-17 04:30:35][logger:171][INFO] [step: 209715200] [train_eval/train_token_count: 209715200] [train_eval/train_batch_count: 100] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 948.433] [train_eval/train_update_time: 445.514] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.350] [train_eval/perplexity_len_2048: 572.373] [train_eval/loss_avg_len_1024: 6.352] [train_eval/perplexity_len_1024: 573.663] [train_eval/loss_avg_len_512: 6.355] [train_eval/perplexity_len_512: 575.392]
36
+ [2025-10-17 04:30:35][train:194][INFO] Running validation...
37
+ [2025-10-17 04:33:51][logger:171][INFO] [step: 209715200] [val/train_token_count: 209715200] [val/train_batch_count: 100] [val/train_flop_count: 0] [val/train_total_time: 948.433] [val/train_update_time: 445.514] [val/loss: 5.898] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 196.365] [val/val_tokens_per_second: 208590.805] [val/loss_avg_len_2048: 5.898] [val/perplexity_len_2048: 364.305] [val/loss_avg_len_1024: 5.900] [val/perplexity_len_1024: 365.105] [val/loss_avg_len_512: 5.908] [val/perplexity_len_512: 367.811]
38
+ [2025-10-17 04:33:51][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_3_4_256/checkpoints/step-000000209715200.pt...
39
+ [2025-10-17 04:33:52][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_3_4_256/checkpoints/step-000000209715200.pt.
40
+ [2025-10-17 04:33:52][logger:171][INFO] [step: 209715200] [checkpoint/checkpoint_time: 0.651]
41
+ [2025-10-17 04:35:27][utils:57][INFO] [P: 11.00%] [S: 230686720/2097152000] [T: 0:20:40] [ETA: 2:47:18] [loss: 5.785] [tokens/s: 173926.196] [batches/s: 0.083] [MFU: 0.000] [TFLOPS: 0.000]
42
+ [2025-10-17 04:36:34][utils:57][INFO] [P: 12.00%] [S: 251658240/2097152000] [T: 0:21:47] [ETA: 2:39:47] [loss: 5.628] [tokens/s: 184493.635] [batches/s: 0.088] [MFU: 0.000] [TFLOPS: 0.000]
43
+ [2025-10-17 04:36:34][train:194][INFO] Running validation...
44
+ [2025-10-17 04:39:25][logger:171][INFO] [step: 251658240] [val/train_token_count: 251658240] [val/train_batch_count: 120] [val/train_flop_count: 0] [val/train_total_time: 1307.415] [val/train_update_time: 606.937] [val/loss: 5.643] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 171.192] [val/val_tokens_per_second: 239263.047] [val/loss_avg_len_2048: 5.643] [val/perplexity_len_2048: 282.419] [val/loss_avg_len_1024: 5.647] [val/perplexity_len_1024: 283.423] [val/loss_avg_len_512: 5.657] [val/perplexity_len_512: 286.229]
45
+ [2025-10-17 04:41:00][utils:57][INFO] [P: 13.00%] [S: 272629760/2097152000] [T: 0:26:13] [ETA: 2:55:31] [loss: 5.574] [tokens/s: 153514.588] [batches/s: 0.073] [MFU: 0.000] [TFLOPS: 0.000]
46
+ [2025-10-17 04:42:36][utils:57][INFO] [P: 14.00%] [S: 293601280/2097152000] [T: 0:27:48] [ETA: 2:50:52] [loss: 5.482] [tokens/s: 158046.213] [batches/s: 0.075] [MFU: 0.000] [TFLOPS: 0.000]
47
+ [2025-10-17 04:42:36][train:194][INFO] Running validation...
48
+ [2025-10-17 04:45:13][logger:171][INFO] [step: 293601280] [val/train_token_count: 293601280] [val/train_batch_count: 140] [val/train_flop_count: 0] [val/train_total_time: 1668.954] [val/train_update_time: 796.746] [val/loss: 5.456] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 157.572] [val/val_tokens_per_second: 259944.243] [val/loss_avg_len_2048: 5.456] [val/perplexity_len_2048: 234.075] [val/loss_avg_len_1024: 5.460] [val/perplexity_len_1024: 235.163] [val/loss_avg_len_512: 5.471] [val/perplexity_len_512: 237.807]
49
+ [2025-10-17 04:46:40][utils:57][INFO] [P: 15.00%] [S: 314572800/2097152000] [T: 0:31:52] [ETA: 3:00:39] [loss: 5.357] [tokens/s: 136726.904] [batches/s: 0.065] [MFU: 0.000] [TFLOPS: 0.000]
50
+ [2025-10-17 04:46:40][logger:171][INFO] [step: 314572800] [train_eval/train_token_count: 314572800] [train_eval/train_batch_count: 150] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1912.917] [train_eval/train_update_time: 882.795] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.614] [train_eval/perplexity_len_2048: 274.306] [train_eval/loss_avg_len_1024: 5.619] [train_eval/perplexity_len_1024: 275.488] [train_eval/loss_avg_len_512: 5.628] [train_eval/perplexity_len_512: 278.005]
51
+ [2025-10-17 04:48:15][utils:57][INFO] [P: 16.00%] [S: 335544320/2097152000] [T: 0:33:28] [ETA: 2:55:42] [loss: 5.320] [tokens/s: 140323.225] [batches/s: 0.067] [MFU: 0.000] [TFLOPS: 0.000]
52
+ [2025-10-17 04:48:15][train:194][INFO] Running validation...
53
+ [2025-10-17 04:51:18][logger:171][INFO] [step: 335544320] [val/train_token_count: 335544320] [val/train_batch_count: 160] [val/train_flop_count: 0] [val/train_total_time: 2008.152] [val/train_update_time: 977.683] [val/loss: 5.302] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 183.399] [val/val_tokens_per_second: 223338.321] [val/loss_avg_len_2048: 5.302] [val/perplexity_len_2048: 200.642] [val/loss_avg_len_1024: 5.307] [val/perplexity_len_1024: 201.798] [val/loss_avg_len_512: 5.320] [val/perplexity_len_512: 204.331]
54
+ [2025-10-17 04:52:23][utils:57][INFO] [P: 17.00%] [S: 356515840/2097152000] [T: 0:37:36] [ETA: 3:03:37] [loss: 5.213] [tokens/s: 122940.052] [batches/s: 0.059] [MFU: 0.000] [TFLOPS: 0.000]
55
+ [2025-10-17 04:53:42][utils:57][INFO] [P: 18.00%] [S: 377487360/2097152000] [T: 0:38:55] [ETA: 2:57:18] [loss: 5.182] [tokens/s: 137487.932] [batches/s: 0.066] [MFU: 0.000] [TFLOPS: 0.000]
56
+ [2025-10-17 04:53:42][train:194][INFO] Running validation...
57
+ [2025-10-17 04:57:12][logger:171][INFO] [step: 377487360] [val/train_token_count: 377487360] [val/train_batch_count: 180] [val/train_flop_count: 0] [val/train_total_time: 2335.217] [val/train_update_time: 1120.859] [val/loss: 5.167] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 210.146] [val/val_tokens_per_second: 194911.813] [val/loss_avg_len_2048: 5.167] [val/perplexity_len_2048: 175.367] [val/loss_avg_len_1024: 5.174] [val/perplexity_len_1024: 176.619] [val/loss_avg_len_512: 5.189] [val/perplexity_len_512: 179.210]
58
+ [2025-10-17 04:58:17][utils:57][INFO] [P: 19.00%] [S: 398458880/2097152000] [T: 0:43:30] [ETA: 3:05:28] [loss: 5.153] [tokens/s: 120681.082] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
59
+ [2025-10-17 04:59:22][utils:57][INFO] [P: 20.00%] [S: 419430400/2097152000] [T: 0:44:35] [ETA: 2:58:20] [loss: 5.061] [tokens/s: 136576.579] [batches/s: 0.065] [MFU: 0.000] [TFLOPS: 0.000]
60
+ [2025-10-17 04:59:22][logger:171][INFO] [step: 419430400] [train_eval/train_token_count: 419430400] [train_eval/train_batch_count: 200] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2675.161] [train_eval/train_update_time: 1250.173] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.210] [train_eval/perplexity_len_2048: 183.138] [train_eval/loss_avg_len_1024: 5.218] [train_eval/perplexity_len_1024: 184.486] [train_eval/loss_avg_len_512: 5.231] [train_eval/perplexity_len_512: 186.902]
61
+ [2025-10-17 04:59:22][train:194][INFO] Running validation...
62
+ [2025-10-17 05:02:56][logger:171][INFO] [step: 419430400] [val/train_token_count: 419430400] [val/train_batch_count: 200] [val/train_flop_count: 0] [val/train_total_time: 2675.161] [val/train_update_time: 1250.173] [val/loss: 5.062] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 214.481] [val/val_tokens_per_second: 190972.246] [val/loss_avg_len_2048: 5.062] [val/perplexity_len_2048: 157.829] [val/loss_avg_len_1024: 5.070] [val/perplexity_len_1024: 159.110] [val/loss_avg_len_512: 5.086] [val/perplexity_len_512: 161.671]
63
+ [2025-10-17 05:02:56][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_3_4_256/checkpoints/step-000000419430400.pt...
64
+ [2025-10-17 05:02:57][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_3_4_256/checkpoints/step-000000419430400.pt.
65
+ [2025-10-17 05:02:57][logger:171][INFO] [step: 419430400] [checkpoint/checkpoint_time: 0.685]
66
+ [2025-10-17 05:04:08][utils:57][INFO] [P: 21.00%] [S: 440401920/2097152000] [T: 0:49:21] [ETA: 3:05:40] [loss: 5.009] [tokens/s: 121249.525] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
67
+ [2025-10-17 05:05:13][utils:57][INFO] [P: 22.00%] [S: 461373440/2097152000] [T: 0:50:26] [ETA: 2:58:49] [loss: 4.983] [tokens/s: 134979.528] [batches/s: 0.064] [MFU: 0.000] [TFLOPS: 0.000]
68
+ [2025-10-17 05:05:13][train:194][INFO] Running validation...
69
+ [2025-10-17 05:08:21][logger:171][INFO] [step: 461373440] [val/train_token_count: 461373440] [val/train_batch_count: 220] [val/train_flop_count: 0] [val/train_total_time: 3026.331] [val/train_update_time: 1385.822] [val/loss: 4.979] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 188.472] [val/val_tokens_per_second: 217326.213] [val/loss_avg_len_2048: 4.979] [val/perplexity_len_2048: 145.300] [val/loss_avg_len_1024: 4.988] [val/perplexity_len_1024: 146.608] [val/loss_avg_len_512: 5.005] [val/perplexity_len_512: 149.155]
70
+ [2025-10-17 05:09:57][utils:57][INFO] [P: 23.00%] [S: 482344960/2097152000] [T: 0:55:09] [ETA: 3:04:40] [loss: 4.917] [tokens/s: 120252.948] [batches/s: 0.057] [MFU: 0.000] [TFLOPS: 0.000]
71
+ [2025-10-17 05:11:16][utils:57][INFO] [P: 24.00%] [S: 503316480/2097152000] [T: 0:56:29] [ETA: 2:58:51] [loss: 4.865] [tokens/s: 133442.888] [batches/s: 0.064] [MFU: 0.000] [TFLOPS: 0.000]
72
+ [2025-10-17 05:11:16][train:194][INFO] Running validation...
73
+ [2025-10-17 05:13:58][logger:171][INFO] [step: 503316480] [val/train_token_count: 503316480] [val/train_batch_count: 240] [val/train_flop_count: 0] [val/train_total_time: 3389.031] [val/train_update_time: 1559.516] [val/loss: 4.888] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 162.879] [val/val_tokens_per_second: 251475.001] [val/loss_avg_len_2048: 4.888] [val/perplexity_len_2048: 132.639] [val/loss_avg_len_1024: 4.898] [val/perplexity_len_1024: 133.983] [val/loss_avg_len_512: 4.917] [val/perplexity_len_512: 136.597]
74
+ [2025-10-17 05:15:34][utils:57][INFO] [P: 25.00%] [S: 524288000/2097152000] [T: 1:00:47] [ETA: 3:02:21] [loss: 4.856] [tokens/s: 120389.931] [batches/s: 0.057] [MFU: 0.000] [TFLOPS: 0.000]
75
+ [2025-10-17 05:15:34][logger:171][INFO] [step: 524288000] [train_eval/train_token_count: 524288000] [train_eval/train_batch_count: 250] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3647.144] [train_eval/train_update_time: 1654.545] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.950] [train_eval/perplexity_len_2048: 141.183] [train_eval/loss_avg_len_1024: 4.958] [train_eval/perplexity_len_1024: 142.255] [train_eval/loss_avg_len_512: 4.974] [train_eval/perplexity_len_512: 144.566]
76
+ [2025-10-17 05:17:09][utils:57][INFO] [P: 26.00%] [S: 545259520/2097152000] [T: 1:02:22] [ETA: 2:57:31] [loss: 4.811] [tokens/s: 134445.886] [batches/s: 0.064] [MFU: 0.000] [TFLOPS: 0.000]
77
+ [2025-10-17 05:17:09][train:194][INFO] Running validation...
78
+ [2025-10-17 05:19:55][logger:171][INFO] [step: 545259520] [val/train_token_count: 545259520] [val/train_batch_count: 260] [val/train_flop_count: 0] [val/train_total_time: 3742.489] [val/train_update_time: 1749.553] [val/loss: 4.812] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 166.071] [val/val_tokens_per_second: 246641.285] [val/loss_avg_len_2048: 4.812] [val/perplexity_len_2048: 122.969] [val/loss_avg_len_1024: 4.823] [val/perplexity_len_1024: 124.318] [val/loss_avg_len_512: 4.844] [val/perplexity_len_512: 126.921]
79
+ [2025-10-17 05:21:09][utils:57][INFO] [P: 27.00%] [S: 566231040/2097152000] [T: 1:06:22] [ETA: 2:59:27] [loss: 4.787] [tokens/s: 120747.283] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
80
+ [2025-10-17 05:22:45][utils:57][INFO] [P: 28.00%] [S: 587202560/2097152000] [T: 1:07:57] [ETA: 2:54:46] [loss: 4.738] [tokens/s: 136059.167] [batches/s: 0.065] [MFU: 0.000] [TFLOPS: 0.000]
81
+ [2025-10-17 05:22:45][train:194][INFO] Running validation...
82
+ [2025-10-17 05:25:57][logger:171][INFO] [step: 587202560] [val/train_token_count: 587202560] [val/train_batch_count: 280] [val/train_flop_count: 0] [val/train_total_time: 4077.975] [val/train_update_time: 1918.300] [val/loss: 4.748] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 192.177] [val/val_tokens_per_second: 213136.563] [val/loss_avg_len_2048: 4.748] [val/perplexity_len_2048: 115.372] [val/loss_avg_len_1024: 4.760] [val/perplexity_len_1024: 116.789] [val/loss_avg_len_512: 4.783] [val/perplexity_len_512: 119.486]
83
+ [2025-10-17 05:27:02][utils:57][INFO] [P: 29.00%] [S: 608174080/2097152000] [T: 1:12:15] [ETA: 2:56:53] [loss: 4.703] [tokens/s: 120834.339] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
84
+ [2025-10-17 05:28:07][utils:57][INFO] [P: 30.00%] [S: 629145600/2097152000] [T: 1:13:20] [ETA: 2:51:07] [loss: 4.673] [tokens/s: 138352.163] [batches/s: 0.066] [MFU: 0.000] [TFLOPS: 0.000]
85
+ [2025-10-17 05:28:07][logger:171][INFO] [step: 629145600] [train_eval/train_token_count: 629145600] [train_eval/train_batch_count: 300] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4400.525] [train_eval/train_update_time: 2048.153] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.769] [train_eval/perplexity_len_2048: 117.778] [train_eval/loss_avg_len_1024: 4.777] [train_eval/perplexity_len_1024: 118.804] [train_eval/loss_avg_len_512: 4.797] [train_eval/perplexity_len_512: 121.197]
86
+ [2025-10-17 05:28:07][train:194][INFO] Running validation...
87
+ [2025-10-17 05:31:47][logger:171][INFO] [step: 629145600] [val/train_token_count: 629145600] [val/train_batch_count: 300] [val/train_flop_count: 0] [val/train_total_time: 4400.525] [val/train_update_time: 2048.153] [val/loss: 4.688] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 219.413] [val/val_tokens_per_second: 186679.681] [val/loss_avg_len_2048: 4.688] [val/perplexity_len_2048: 108.602] [val/loss_avg_len_1024: 4.701] [val/perplexity_len_1024: 110.063] [val/loss_avg_len_512: 4.726] [val/perplexity_len_512: 112.865]
88
+ [2025-10-17 05:31:47][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_3_4_256/checkpoints/step-000000629145600.pt...
89
+ [2025-10-17 05:31:47][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_3_4_256/checkpoints/step-000000629145600.pt.
90
+ [2025-10-17 05:31:47][logger:171][INFO] [step: 629145600] [checkpoint/checkpoint_time: 0.716]
91
+ [2025-10-17 05:32:52][utils:57][INFO] [P: 31.00%] [S: 650117120/2097152000] [T: 1:18:05] [ETA: 2:53:49] [loss: 4.687] [tokens/s: 120863.002] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
92
+ [2025-10-17 05:33:57][utils:57][INFO] [P: 32.00%] [S: 671088640/2097152000] [T: 1:19:10] [ETA: 2:48:14] [loss: 4.612] [tokens/s: 136033.638] [batches/s: 0.065] [MFU: 0.000] [TFLOPS: 0.000]
93
+ [2025-10-17 05:33:57][train:194][INFO] Running validation...
94
+ [2025-10-17 05:37:23][logger:171][INFO] [step: 671088640] [val/train_token_count: 671088640] [val/train_batch_count: 320] [val/train_flop_count: 0] [val/train_total_time: 4750.569] [val/train_update_time: 2177.568] [val/loss: 4.633] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 205.877] [val/val_tokens_per_second: 198953.763] [val/loss_avg_len_2048: 4.633] [val/perplexity_len_2048: 102.780] [val/loss_avg_len_1024: 4.648] [val/perplexity_len_1024: 104.342] [val/loss_avg_len_512: 4.676] [val/perplexity_len_512: 107.297]
95
+ [2025-10-17 05:38:47][utils:57][INFO] [P: 33.00%] [S: 692060160/2097152000] [T: 1:24:00] [ETA: 2:50:33] [loss: 4.627] [tokens/s: 120649.915] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
96
+ [2025-10-17 05:39:52][utils:57][INFO] [P: 34.00%] [S: 713031680/2097152000] [T: 1:25:05] [ETA: 2:45:10] [loss: 4.575] [tokens/s: 134481.896] [batches/s: 0.064] [MFU: 0.000] [TFLOPS: 0.000]
97
+ [2025-10-17 05:39:52][train:194][INFO] Running validation...
98
+ [2025-10-17 05:42:52][logger:171][INFO] [step: 713031680] [val/train_token_count: 713031680] [val/train_batch_count: 340] [val/train_flop_count: 0] [val/train_total_time: 5105.305] [val/train_update_time: 2326.036] [val/loss: 4.573] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 180.139] [val/val_tokens_per_second: 227379.567] [val/loss_avg_len_2048: 4.573] [val/perplexity_len_2048: 96.872] [val/loss_avg_len_1024: 4.590] [val/perplexity_len_1024: 98.513] [val/loss_avg_len_512: 4.621] [val/perplexity_len_512: 101.577]
99
+ [2025-10-17 05:44:27][utils:57][INFO] [P: 35.00%] [S: 734003200/2097152000] [T: 1:29:40] [ETA: 2:46:31] [loss: 4.555] [tokens/s: 120453.572] [batches/s: 0.057] [MFU: 0.000] [TFLOPS: 0.000]
100
+ [2025-10-17 05:44:27][logger:171][INFO] [step: 734003200] [train_eval/train_token_count: 734003200] [train_eval/train_batch_count: 350] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5380.307] [train_eval/train_update_time: 2420.688] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.617] [train_eval/perplexity_len_2048: 101.213] [train_eval/loss_avg_len_1024: 4.631] [train_eval/perplexity_len_1024: 102.649] [train_eval/loss_avg_len_512: 4.658] [train_eval/perplexity_len_512: 105.424]
101
+ [2025-10-17 05:45:58][utils:57][INFO] [P: 36.00%] [S: 754974720/2097152000] [T: 1:31:11] [ETA: 2:42:07] [loss: 4.479] [tokens/s: 133393.639] [batches/s: 0.064] [MFU: 0.000] [TFLOPS: 0.000]
102
+ [2025-10-17 05:45:58][train:194][INFO] Running validation...
103
+ [2025-10-17 05:48:33][logger:171][INFO] [step: 754974720] [val/train_token_count: 754974720] [val/train_batch_count: 360] [val/train_flop_count: 0] [val/train_total_time: 5471.658] [val/train_update_time: 2511.686] [val/loss: 4.521] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 154.917] [val/val_tokens_per_second: 264399.133] [val/loss_avg_len_2048: 4.521] [val/perplexity_len_2048: 91.928] [val/loss_avg_len_1024: 4.540] [val/perplexity_len_1024: 93.667] [val/loss_avg_len_512: 4.574] [val/perplexity_len_512: 96.890]
104
+ [2025-10-17 05:50:08][utils:57][INFO] [P: 37.00%] [S: 775946240/2097152000] [T: 1:35:21] [ETA: 2:42:22] [loss: 4.493] [tokens/s: 120043.262] [batches/s: 0.057] [MFU: 0.000] [TFLOPS: 0.000]
105
+ [2025-10-17 05:51:44][utils:57][INFO] [P: 38.00%] [S: 796917760/2097152000] [T: 1:36:56] [ETA: 2:38:10] [loss: 4.466] [tokens/s: 134802.794] [batches/s: 0.064] [MFU: 0.000] [TFLOPS: 0.000]
106
+ [2025-10-17 05:51:44][train:194][INFO] Running validation...
107
+ [2025-10-17 05:54:38][logger:171][INFO] [step: 796917760] [val/train_token_count: 796917760] [val/train_batch_count: 380] [val/train_flop_count: 0] [val/train_total_time: 5816.984] [val/train_update_time: 2701.530] [val/loss: 4.476] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 174.703] [val/val_tokens_per_second: 234455.018] [val/loss_avg_len_2048: 4.476] [val/perplexity_len_2048: 87.859] [val/loss_avg_len_1024: 4.497] [val/perplexity_len_1024: 89.777] [val/loss_avg_len_512: 4.536] [val/perplexity_len_512: 93.313]
108
+ [2025-10-17 05:55:43][utils:57][INFO] [P: 39.00%] [S: 817889280/2097152000] [T: 1:40:56] [ETA: 2:37:53] [loss: 4.453] [tokens/s: 121057.592] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
109
+ [2025-10-17 05:57:15][utils:57][INFO] [P: 40.00%] [S: 838860800/2097152000] [T: 1:42:28] [ETA: 2:33:43] [loss: 4.344] [tokens/s: 136457.973] [batches/s: 0.065] [MFU: 0.000] [TFLOPS: 0.000]
110
+ [2025-10-17 05:57:15][logger:171][INFO] [step: 838860800] [train_eval/train_token_count: 838860800] [train_eval/train_batch_count: 400] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6148.815] [train_eval/train_update_time: 2858.119] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.482] [train_eval/perplexity_len_2048: 88.404] [train_eval/loss_avg_len_1024: 4.500] [train_eval/perplexity_len_1024: 90.001] [train_eval/loss_avg_len_512: 4.536] [train_eval/perplexity_len_512: 93.285]
111
+ [2025-10-17 05:57:16][train:194][INFO] Running validation...
112
+ [2025-10-17 06:00:36][logger:171][INFO] [step: 838860800] [val/train_token_count: 838860800] [val/train_batch_count: 400] [val/train_flop_count: 0] [val/train_total_time: 6148.815] [val/train_update_time: 2858.119] [val/loss: 4.414] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 200.308] [val/val_tokens_per_second: 204484.994] [val/loss_avg_len_2048: 4.414] [val/perplexity_len_2048: 82.596] [val/loss_avg_len_1024: 4.439] [val/perplexity_len_1024: 84.686] [val/loss_avg_len_512: 4.482] [val/perplexity_len_512: 88.437]
113
+ [2025-10-17 06:00:36][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_3_4_256/checkpoints/step-000000838860800.pt...
114
+ [2025-10-17 06:00:36][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_3_4_256/checkpoints/step-000000838860800.pt.
115
+ [2025-10-17 06:00:36][logger:171][INFO] [step: 838860800] [checkpoint/checkpoint_time: 0.672]
116
+ [2025-10-17 06:01:41][utils:57][INFO] [P: 41.00%] [S: 859832320/2097152000] [T: 1:46:54] [ETA: 2:33:51] [loss: 4.359] [tokens/s: 120516.344] [batches/s: 0.057] [MFU: 0.000] [TFLOPS: 0.000]
117
+ [2025-10-17 06:02:46][utils:57][INFO] [P: 42.00%] [S: 880803840/2097152000] [T: 1:47:59] [ETA: 2:29:08] [loss: 4.361] [tokens/s: 137146.832] [batches/s: 0.065] [MFU: 0.000] [TFLOPS: 0.000]
118
+ [2025-10-17 06:02:46][train:194][INFO] Running validation...
119
+ [2025-10-17 06:06:26][logger:171][INFO] [step: 880803840] [val/train_token_count: 880803840] [val/train_batch_count: 420] [val/train_flop_count: 0] [val/train_total_time: 6479.876] [val/train_update_time: 2987.694] [val/loss: 4.364] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 219.935] [val/val_tokens_per_second: 186236.569] [val/loss_avg_len_2048: 4.364] [val/perplexity_len_2048: 78.573] [val/loss_avg_len_1024: 4.393] [val/perplexity_len_1024: 80.858] [val/loss_avg_len_512: 4.442] [val/perplexity_len_512: 84.914]
120
+ [2025-10-17 06:07:31][utils:57][INFO] [P: 43.00%] [S: 901775360/2097152000] [T: 1:52:44] [ETA: 2:29:27] [loss: 4.334] [tokens/s: 120859.284] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
121
+ [2025-10-17 06:08:36][utils:57][INFO] [P: 44.00%] [S: 922746880/2097152000] [T: 1:53:49] [ETA: 2:24:52] [loss: 4.344] [tokens/s: 135277.044] [batches/s: 0.065] [MFU: 0.000] [TFLOPS: 0.000]
122
+ [2025-10-17 06:08:36][train:194][INFO] Running validation...
123
+ [2025-10-17 06:11:54][logger:171][INFO] [step: 922746880] [val/train_token_count: 922746880] [val/train_batch_count: 440] [val/train_flop_count: 0] [val/train_total_time: 6829.789] [val/train_update_time: 3117.297] [val/loss: 4.307] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 197.568] [val/val_tokens_per_second: 207320.935] [val/loss_avg_len_2048: 4.307] [val/perplexity_len_2048: 74.212] [val/loss_avg_len_1024: 4.339] [val/perplexity_len_1024: 76.668] [val/loss_avg_len_512: 4.394] [val/perplexity_len_512: 80.955]
124
+ [2025-10-17 06:13:29][utils:57][INFO] [P: 45.00%] [S: 943718400/2097152000] [T: 1:58:42] [ETA: 2:25:05] [loss: 4.278] [tokens/s: 119820.303] [batches/s: 0.057] [MFU: 0.000] [TFLOPS: 0.000]
125
+ [2025-10-17 06:13:29][logger:171][INFO] [step: 943718400] [train_eval/train_token_count: 943718400] [train_eval/train_batch_count: 450] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7122.764] [train_eval/train_update_time: 3212.485] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.346] [train_eval/perplexity_len_2048: 77.162] [train_eval/loss_avg_len_1024: 4.375] [train_eval/perplexity_len_1024: 79.403] [train_eval/loss_avg_len_512: 4.425] [train_eval/perplexity_len_512: 83.502]
126
+ [2025-10-17 06:14:35][utils:57][INFO] [P: 46.00%] [S: 964689920/2097152000] [T: 1:59:48] [ETA: 2:20:38] [loss: 4.238] [tokens/s: 133763.712] [batches/s: 0.064] [MFU: 0.000] [TFLOPS: 0.000]
127
+ [2025-10-17 06:14:35][train:194][INFO] Running validation...
128
+ [2025-10-17 06:17:27][logger:171][INFO] [step: 964689920] [val/train_token_count: 964689920] [val/train_batch_count: 460] [val/train_flop_count: 0] [val/train_total_time: 7188.302] [val/train_update_time: 3277.706] [val/loss: 4.256] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 171.915] [val/val_tokens_per_second: 238256.693] [val/loss_avg_len_2048: 4.256] [val/perplexity_len_2048: 70.561] [val/loss_avg_len_1024: 4.293] [val/perplexity_len_1024: 73.168] [val/loss_avg_len_512: 4.352] [val/perplexity_len_512: 77.643]
129
+ [2025-10-17 06:19:02][utils:57][INFO] [P: 47.00%] [S: 985661440/2097152000] [T: 2:04:15] [ETA: 2:20:07] [loss: 4.236] [tokens/s: 120441.183] [batches/s: 0.057] [MFU: 0.000] [TFLOPS: 0.000]
130
+ [2025-10-17 06:20:37][utils:57][INFO] [P: 48.00%] [S: 1006632960/2097152000] [T: 2:05:50] [ETA: 2:16:20] [loss: 4.213] [tokens/s: 133738.038] [batches/s: 0.064] [MFU: 0.000] [TFLOPS: 0.000]
131
+ [2025-10-17 06:20:37][train:194][INFO] Running validation...
132
+ [2025-10-17 06:23:15][logger:171][INFO] [step: 1006632960] [val/train_token_count: 1006632960] [val/train_batch_count: 480] [val/train_flop_count: 0] [val/train_total_time: 7550.796] [val/train_update_time: 3467.748] [val/loss: 4.212] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 157.039] [val/val_tokens_per_second: 260826.386] [val/loss_avg_len_2048: 4.212] [val/perplexity_len_2048: 67.496] [val/loss_avg_len_1024: 4.252] [val/perplexity_len_1024: 70.231] [val/loss_avg_len_512: 4.316] [val/perplexity_len_512: 74.899]
133
+ [2025-10-17 06:24:42][utils:57][INFO] [P: 49.00%] [S: 1027604480/2097152000] [T: 2:09:55] [ETA: 2:15:13] [loss: 4.206] [tokens/s: 119863.070] [batches/s: 0.057] [MFU: 0.000] [TFLOPS: 0.000]
134
+ [2025-10-17 06:26:17][utils:57][INFO] [P: 50.00%] [S: 1048576000/2097152000] [T: 2:11:30] [ETA: 2:11:30] [loss: 4.168] [tokens/s: 135336.403] [batches/s: 0.065] [MFU: 0.000] [TFLOPS: 0.000]
135
+ [2025-10-17 06:26:17][logger:171][INFO] [step: 1048576000] [train_eval/train_token_count: 1048576000] [train_eval/train_batch_count: 500] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7890.565] [train_eval/train_update_time: 3649.793] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.224] [train_eval/perplexity_len_2048: 68.336] [train_eval/loss_avg_len_1024: 4.260] [train_eval/perplexity_len_1024: 70.788] [train_eval/loss_avg_len_512: 4.322] [train_eval/perplexity_len_512: 75.312]
136
+ [2025-10-17 06:26:17][train:194][INFO] Running validation...
137
+ [2025-10-17 06:29:20][logger:171][INFO] [step: 1048576000] [val/train_token_count: 1048576000] [val/train_batch_count: 500] [val/train_flop_count: 0] [val/train_total_time: 7890.565] [val/train_update_time: 3649.793] [val/loss: 4.171] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 182.648] [val/val_tokens_per_second: 224255.885] [val/loss_avg_len_2048: 4.171] [val/perplexity_len_2048: 64.798] [val/loss_avg_len_1024: 4.214] [val/perplexity_len_1024: 67.614] [val/loss_avg_len_512: 4.282] [val/perplexity_len_512: 72.367]
138
+ [2025-10-17 06:29:20][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_3_4_256/checkpoints/step-000001048576000.pt...
139
+ [2025-10-17 06:29:21][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_3_4_256/checkpoints/step-000001048576000.pt.
140
+ [2025-10-17 06:29:21][logger:171][INFO] [step: 1048576000] [checkpoint/checkpoint_time: 0.680]
141
+ [2025-10-17 06:30:26][utils:57][INFO] [P: 51.00%] [S: 1069547520/2097152000] [T: 2:15:38] [ETA: 2:10:19] [loss: 4.145] [tokens/s: 120882.605] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
142
+ [2025-10-17 06:31:46][utils:57][INFO] [P: 52.00%] [S: 1090519040/2097152000] [T: 2:16:59] [ETA: 2:06:27] [loss: 4.137] [tokens/s: 137223.033] [batches/s: 0.065] [MFU: 0.000] [TFLOPS: 0.000]
143
+ [2025-10-17 06:31:46][train:194][INFO] Running validation...
144
+ [2025-10-17 06:35:15][logger:171][INFO] [step: 1090519040] [val/train_token_count: 1090519040] [val/train_batch_count: 520] [val/train_flop_count: 0] [val/train_total_time: 8219.337] [val/train_update_time: 3794.716] [val/loss: 4.135] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 208.714] [val/val_tokens_per_second: 196249.735] [val/loss_avg_len_2048: 4.135] [val/perplexity_len_2048: 62.461] [val/loss_avg_len_1024: 4.180] [val/perplexity_len_1024: 65.355] [val/loss_avg_len_512: 4.251] [val/perplexity_len_512: 70.156]
145
+ [2025-10-17 06:36:20][utils:57][INFO] [P: 53.00%] [S: 1111490560/2097152000] [T: 2:21:33] [ETA: 2:05:31] [loss: 4.102] [tokens/s: 120583.902] [batches/s: 0.057] [MFU: 0.000] [TFLOPS: 0.000]
146
+ [2025-10-17 06:37:24][utils:57][INFO] [P: 54.00%] [S: 1132462080/2097152000] [T: 2:22:37] [ETA: 2:01:30] [loss: 4.114] [tokens/s: 136497.581] [batches/s: 0.065] [MFU: 0.000] [TFLOPS: 0.000]
147
+ [2025-10-17 06:37:24][train:194][INFO] Running validation...
148
+ [2025-10-17 06:41:02][logger:171][INFO] [step: 1132462080] [val/train_token_count: 1132462080] [val/train_batch_count: 540] [val/train_flop_count: 0] [val/train_total_time: 8557.932] [val/train_update_time: 3924.080] [val/loss: 4.104] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 217.379] [val/val_tokens_per_second: 188426.823] [val/loss_avg_len_2048: 4.104] [val/perplexity_len_2048: 60.567] [val/loss_avg_len_1024: 4.152] [val/perplexity_len_1024: 63.552] [val/loss_avg_len_512: 4.226] [val/perplexity_len_512: 68.442]
149
+ [2025-10-17 06:42:10][utils:57][INFO] [P: 55.00%] [S: 1153433600/2097152000] [T: 2:27:23] [ETA: 2:00:35] [loss: 4.054] [tokens/s: 121146.231] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
150
+ [2025-10-17 06:42:10][logger:171][INFO] [step: 1153433600] [train_eval/train_token_count: 1153433600] [train_eval/train_batch_count: 550] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 8843.699] [train_eval/train_update_time: 3992.285] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.124] [train_eval/perplexity_len_2048: 61.780] [train_eval/loss_avg_len_1024: 4.165] [train_eval/perplexity_len_1024: 64.369] [train_eval/loss_avg_len_512: 4.234] [train_eval/perplexity_len_512: 68.964]
151
+ [2025-10-17 06:43:15][utils:57][INFO] [P: 56.00%] [S: 1174405120/2097152000] [T: 2:28:28] [ETA: 1:56:39] [loss: 4.094] [tokens/s: 134928.714] [batches/s: 0.064] [MFU: 0.000] [TFLOPS: 0.000]
152
+ [2025-10-17 06:43:15][train:194][INFO] Running validation...
153
+ [2025-10-17 06:46:23][logger:171][INFO] [step: 1174405120] [val/train_token_count: 1174405120] [val/train_batch_count: 560] [val/train_flop_count: 0] [val/train_total_time: 8908.473] [val/train_update_time: 4056.886] [val/loss: 4.073] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 188.460] [val/val_tokens_per_second: 217340.218] [val/loss_avg_len_2048: 4.073] [val/perplexity_len_2048: 58.746] [val/loss_avg_len_1024: 4.123] [val/perplexity_len_1024: 61.742] [val/loss_avg_len_512: 4.199] [val/perplexity_len_512: 66.645]
154
+ [2025-10-17 06:47:59][utils:57][INFO] [P: 57.00%] [S: 1195376640/2097152000] [T: 2:33:12] [ETA: 1:55:34] [loss: 4.023] [tokens/s: 120214.360] [batches/s: 0.057] [MFU: 0.000] [TFLOPS: 0.000]
155
+ [2025-10-17 06:49:17][utils:57][INFO] [P: 58.00%] [S: 1216348160/2097152000] [T: 2:34:30] [ETA: 1:51:52] [loss: 4.076] [tokens/s: 133449.150] [batches/s: 0.064] [MFU: 0.000] [TFLOPS: 0.000]
156
+ [2025-10-17 06:49:17][train:194][INFO] Running validation...
157
+ [2025-10-17 06:52:00][logger:171][INFO] [step: 1216348160] [val/train_token_count: 1216348160] [val/train_batch_count: 580] [val/train_flop_count: 0] [val/train_total_time: 9270.280] [val/train_update_time: 4229.695] [val/loss: 4.050] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 163.087] [val/val_tokens_per_second: 251154.915] [val/loss_avg_len_2048: 4.050] [val/perplexity_len_2048: 57.422] [val/loss_avg_len_1024: 4.102] [val/perplexity_len_1024: 60.463] [val/loss_avg_len_512: 4.181] [val/perplexity_len_512: 65.422]
158
+ [2025-10-17 06:53:36][utils:57][INFO] [P: 59.00%] [S: 1237319680/2097152000] [T: 2:38:48] [ETA: 1:50:21] [loss: 4.074] [tokens/s: 120446.907] [batches/s: 0.057] [MFU: 0.000] [TFLOPS: 0.000]
159
+ [2025-10-17 06:55:11][utils:57][INFO] [P: 60.00%] [S: 1258291200/2097152000] [T: 2:40:24] [ETA: 1:46:56] [loss: 4.071] [tokens/s: 134512.277] [batches/s: 0.064] [MFU: 0.000] [TFLOPS: 0.000]
160
+ [2025-10-17 06:55:11][logger:171][INFO] [step: 1258291200] [train_eval/train_token_count: 1258291200] [train_eval/train_batch_count: 600] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 9624.041] [train_eval/train_update_time: 4419.824] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.052] [train_eval/perplexity_len_2048: 57.487] [train_eval/loss_avg_len_1024: 4.096] [train_eval/perplexity_len_1024: 60.074] [train_eval/loss_avg_len_512: 4.172] [train_eval/perplexity_len_512: 64.849]
161
+ [2025-10-17 06:55:11][train:194][INFO] Running validation...
162
+ [2025-10-17 06:57:56][logger:171][INFO] [step: 1258291200] [val/train_token_count: 1258291200] [val/train_batch_count: 600] [val/train_flop_count: 0] [val/train_total_time: 9624.041] [val/train_update_time: 4419.824] [val/loss: 4.028] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 165.343] [val/val_tokens_per_second: 247728.158] [val/loss_avg_len_2048: 4.028] [val/perplexity_len_2048: 56.121] [val/loss_avg_len_1024: 4.080] [val/perplexity_len_1024: 59.168] [val/loss_avg_len_512: 4.161] [val/perplexity_len_512: 64.109]
163
+ [2025-10-17 06:57:56][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_3_4_256/checkpoints/step-000001258291200.pt...
164
+ [2025-10-17 06:57:57][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_3_4_256/checkpoints/step-000001258291200.pt.
165
+ [2025-10-17 06:57:57][logger:171][INFO] [step: 1258291200] [checkpoint/checkpoint_time: 0.683]
166
+ [2025-10-17 06:59:12][utils:57][INFO] [P: 61.00%] [S: 1279262720/2097152000] [T: 2:44:25] [ETA: 1:45:07] [loss: 4.030] [tokens/s: 120702.201] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
167
+ [2025-10-17 07:00:48][utils:57][INFO] [P: 62.00%] [S: 1300234240/2097152000] [T: 2:46:00] [ETA: 1:41:45] [loss: 4.003] [tokens/s: 136034.377] [batches/s: 0.065] [MFU: 0.000] [TFLOPS: 0.000]
168
+ [2025-10-17 07:00:48][train:194][INFO] Running validation...
169
+ [2025-10-17 07:03:59][logger:171][INFO] [step: 1300234240] [val/train_token_count: 1300234240] [val/train_batch_count: 620] [val/train_flop_count: 0] [val/train_total_time: 9960.902] [val/train_update_time: 4590.094] [val/loss: 4.009] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 190.972] [val/val_tokens_per_second: 214482.265] [val/loss_avg_len_2048: 4.009] [val/perplexity_len_2048: 55.105] [val/loss_avg_len_1024: 4.064] [val/perplexity_len_1024: 58.196] [val/loss_avg_len_512: 4.146] [val/perplexity_len_512: 63.162]
170
+ [2025-10-17 07:05:04][utils:57][INFO] [P: 63.00%] [S: 1321205760/2097152000] [T: 2:50:17] [ETA: 1:40:00] [loss: 4.015] [tokens/s: 120891.600] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
171
+ [2025-10-17 07:06:11][utils:57][INFO] [P: 64.00%] [S: 1342177280/2097152000] [T: 2:51:24] [ETA: 1:36:25] [loss: 4.002] [tokens/s: 138453.619] [batches/s: 0.066] [MFU: 0.000] [TFLOPS: 0.000]
172
+ [2025-10-17 07:06:11][train:194][INFO] Running validation...
173
+ [2025-10-17 07:09:49][logger:171][INFO] [step: 1342177280] [val/train_token_count: 1342177280] [val/train_batch_count: 640] [val/train_flop_count: 0] [val/train_total_time: 10284.480] [val/train_update_time: 4722.195] [val/loss: 3.991] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 218.068] [val/val_tokens_per_second: 187831.641] [val/loss_avg_len_2048: 3.991] [val/perplexity_len_2048: 54.104] [val/loss_avg_len_1024: 4.046] [val/perplexity_len_1024: 57.160] [val/loss_avg_len_512: 4.128] [val/perplexity_len_512: 62.075]
174
+ [2025-10-17 07:10:54][utils:57][INFO] [P: 65.00%] [S: 1363148800/2097152000] [T: 2:56:07] [ETA: 1:34:50] [loss: 3.970] [tokens/s: 120882.743] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
175
+ [2025-10-17 07:10:54][logger:171][INFO] [step: 1363148800] [train_eval/train_token_count: 1363148800] [train_eval/train_batch_count: 650] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 10567.733] [train_eval/train_update_time: 4787.057] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.003] [train_eval/perplexity_len_2048: 54.771] [train_eval/loss_avg_len_1024: 4.055] [train_eval/perplexity_len_1024: 57.694] [train_eval/loss_avg_len_512: 4.135] [train_eval/perplexity_len_512: 62.496]
176
+ [2025-10-17 07:11:59][utils:57][INFO] [P: 66.00%] [S: 1384120320/2097152000] [T: 2:57:12] [ETA: 1:31:17] [loss: 4.003] [tokens/s: 136042.715] [batches/s: 0.065] [MFU: 0.000] [TFLOPS: 0.000]
177
+ [2025-10-17 07:11:59][train:194][INFO] Running validation...
178
+ [2025-10-17 07:15:28][logger:171][INFO] [step: 1384120320] [val/train_token_count: 1384120320] [val/train_batch_count: 660] [val/train_flop_count: 0] [val/train_total_time: 10632.617] [val/train_update_time: 4851.760] [val/loss: 3.976] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 208.425] [val/val_tokens_per_second: 196521.711] [val/loss_avg_len_2048: 3.976] [val/perplexity_len_2048: 53.302] [val/loss_avg_len_1024: 4.032] [val/perplexity_len_1024: 56.394] [val/loss_avg_len_512: 4.117] [val/perplexity_len_512: 61.348]
179
+ [2025-10-17 07:16:49][utils:57][INFO] [P: 67.00%] [S: 1405091840/2097152000] [T: 3:02:02] [ETA: 1:29:39] [loss: 3.940] [tokens/s: 120687.369] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
180
+ [2025-10-17 07:17:54][utils:57][INFO] [P: 68.00%] [S: 1426063360/2097152000] [T: 3:03:07] [ETA: 1:26:10] [loss: 3.958] [tokens/s: 134462.599] [batches/s: 0.064] [MFU: 0.000] [TFLOPS: 0.000]
181
+ [2025-10-17 07:17:54][train:194][INFO] Running validation...
182
+ [2025-10-17 07:20:54][logger:171][INFO] [step: 1426063360] [val/train_token_count: 1426063360] [val/train_batch_count: 680] [val/train_flop_count: 0] [val/train_total_time: 10987.034] [val/train_update_time: 4997.368] [val/loss: 3.962] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 180.259] [val/val_tokens_per_second: 227229.183] [val/loss_avg_len_2048: 3.962] [val/perplexity_len_2048: 52.545] [val/loss_avg_len_1024: 4.019] [val/perplexity_len_1024: 55.629] [val/loss_avg_len_512: 4.104] [val/perplexity_len_512: 60.559]
183
+ [2025-10-17 07:22:29][utils:57][INFO] [P: 69.00%] [S: 1447034880/2097152000] [T: 3:07:42] [ETA: 1:24:19] [loss: 3.969] [tokens/s: 120421.423] [batches/s: 0.057] [MFU: 0.000] [TFLOPS: 0.000]
184
+ [2025-10-17 07:23:59][utils:57][INFO] [P: 70.00%] [S: 1468006400/2097152000] [T: 3:09:12] [ETA: 1:21:05] [loss: 3.962] [tokens/s: 133451.100] [batches/s: 0.064] [MFU: 0.000] [TFLOPS: 0.000]
185
+ [2025-10-17 07:23:59][logger:171][INFO] [step: 1468006400] [train_eval/train_token_count: 1468006400] [train_eval/train_batch_count: 700] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 11352.441] [train_eval/train_update_time: 5181.949] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.962] [train_eval/perplexity_len_2048: 52.539] [train_eval/loss_avg_len_1024: 4.015] [train_eval/perplexity_len_1024: 55.445] [train_eval/loss_avg_len_512: 4.099] [train_eval/perplexity_len_512: 60.291]
186
+ [2025-10-17 07:23:59][train:194][INFO] Running validation...
187
+ [2025-10-17 07:26:34][logger:171][INFO] [step: 1468006400] [val/train_token_count: 1468006400] [val/train_batch_count: 700] [val/train_flop_count: 0] [val/train_total_time: 11352.441] [val/train_update_time: 5181.949] [val/loss: 3.950] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 154.894] [val/val_tokens_per_second: 264438.196] [val/loss_avg_len_2048: 3.950] [val/perplexity_len_2048: 51.918] [val/loss_avg_len_1024: 4.007] [val/perplexity_len_1024: 54.988] [val/loss_avg_len_512: 4.092] [val/perplexity_len_512: 59.884]
188
+ [2025-10-17 07:26:34][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_3_4_256/checkpoints/step-000001468006400.pt...
189
+ [2025-10-17 07:26:35][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_3_4_256/checkpoints/step-000001468006400.pt.
190
+ [2025-10-17 07:26:35][logger:171][INFO] [step: 1468006400] [checkpoint/checkpoint_time: 0.688]
191
+ [2025-10-17 07:28:10][utils:57][INFO] [P: 71.00%] [S: 1488977920/2097152000] [T: 3:13:23] [ETA: 1:18:59] [loss: 3.967] [tokens/s: 120149.332] [batches/s: 0.057] [MFU: 0.000] [TFLOPS: 0.000]
192
+ [2025-10-17 07:29:45][utils:57][INFO] [P: 72.00%] [S: 1509949440/2097152000] [T: 3:14:58] [ETA: 1:15:49] [loss: 3.935] [tokens/s: 134828.206] [batches/s: 0.064] [MFU: 0.000] [TFLOPS: 0.000]
193
+ [2025-10-17 07:29:45][train:194][INFO] Running validation...
194
+ [2025-10-17 07:32:38][logger:171][INFO] [step: 1509949440] [val/train_token_count: 1509949440] [val/train_batch_count: 720] [val/train_flop_count: 0] [val/train_total_time: 11698.410] [val/train_update_time: 5371.778] [val/loss: 3.939] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 172.710] [val/val_tokens_per_second: 237160.163] [val/loss_avg_len_2048: 3.939] [val/perplexity_len_2048: 51.353] [val/loss_avg_len_1024: 3.997] [val/perplexity_len_1024: 54.426] [val/loss_avg_len_512: 4.083] [val/perplexity_len_512: 59.323]
195
+ [2025-10-17 07:33:43][utils:57][INFO] [P: 73.00%] [S: 1530920960/2097152000] [T: 3:18:56] [ETA: 1:13:34] [loss: 3.958] [tokens/s: 121233.055] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
196
+ [2025-10-17 07:35:17][utils:57][INFO] [P: 74.00%] [S: 1551892480/2097152000] [T: 3:20:29] [ETA: 1:10:26] [loss: 3.928] [tokens/s: 136536.416] [batches/s: 0.065] [MFU: 0.000] [TFLOPS: 0.000]
197
+ [2025-10-17 07:35:17][train:194][INFO] Running validation...
198
+ [2025-10-17 07:38:36][logger:171][INFO] [step: 1551892480] [val/train_token_count: 1551892480] [val/train_batch_count: 740] [val/train_flop_count: 0] [val/train_total_time: 12029.831] [val/train_update_time: 5529.962] [val/loss: 3.929] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 199.573] [val/val_tokens_per_second: 205237.806] [val/loss_avg_len_2048: 3.929] [val/perplexity_len_2048: 50.848] [val/loss_avg_len_1024: 3.987] [val/perplexity_len_1024: 53.919] [val/loss_avg_len_512: 4.074] [val/perplexity_len_512: 58.797]
199
+ [2025-10-17 07:39:41][utils:57][INFO] [P: 75.00%] [S: 1572864000/2097152000] [T: 3:24:54] [ETA: 1:08:18] [loss: 3.923] [tokens/s: 120683.229] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
200
+ [2025-10-17 07:39:41][logger:171][INFO] [step: 1572864000] [train_eval/train_token_count: 1572864000] [train_eval/train_batch_count: 750] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 12294.616] [train_eval/train_update_time: 5594.850] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.933] [train_eval/perplexity_len_2048: 51.048] [train_eval/loss_avg_len_1024: 3.988] [train_eval/perplexity_len_1024: 53.930] [train_eval/loss_avg_len_512: 4.073] [train_eval/perplexity_len_512: 58.753]
201
+ [2025-10-17 07:40:46][utils:57][INFO] [P: 76.00%] [S: 1593835520/2097152000] [T: 3:25:59] [ETA: 1:05:03] [loss: 3.881] [tokens/s: 137593.065] [batches/s: 0.066] [MFU: 0.000] [TFLOPS: 0.000]
202
+ [2025-10-17 07:40:46][train:194][INFO] Running validation...
203
+ [2025-10-17 07:44:26][logger:171][INFO] [step: 1593835520] [val/train_token_count: 1593835520] [val/train_batch_count: 760] [val/train_flop_count: 0] [val/train_total_time: 12359.526] [val/train_update_time: 5659.566] [val/loss: 3.921] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 219.836] [val/val_tokens_per_second: 186320.320] [val/loss_avg_len_2048: 3.921] [val/perplexity_len_2048: 50.445] [val/loss_avg_len_1024: 3.980] [val/perplexity_len_1024: 53.524] [val/loss_avg_len_512: 4.067] [val/perplexity_len_512: 58.405]
204
+ [2025-10-17 07:45:31][utils:57][INFO] [P: 77.00%] [S: 1614807040/2097152000] [T: 3:30:44] [ETA: 1:02:56] [loss: 3.951] [tokens/s: 121009.856] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
205
+ [2025-10-17 07:46:36][utils:57][INFO] [P: 78.00%] [S: 1635778560/2097152000] [T: 3:31:49] [ETA: 0:59:44] [loss: 3.888] [tokens/s: 135485.464] [batches/s: 0.065] [MFU: 0.000] [TFLOPS: 0.000]
206
+ [2025-10-17 07:46:36][train:194][INFO] Running validation...
207
+ [2025-10-17 07:49:55][logger:171][INFO] [step: 1635778560] [val/train_token_count: 1635778560] [val/train_batch_count: 780] [val/train_flop_count: 0] [val/train_total_time: 12709.197] [val/train_update_time: 5789.021] [val/loss: 3.914] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 199.469] [val/val_tokens_per_second: 205345.662] [val/loss_avg_len_2048: 3.914] [val/perplexity_len_2048: 50.078] [val/loss_avg_len_1024: 3.973] [val/perplexity_len_1024: 53.151] [val/loss_avg_len_512: 4.061] [val/perplexity_len_512: 58.021]
208
+ [2025-10-17 07:51:29][utils:57][INFO] [P: 79.00%] [S: 1656750080/2097152000] [T: 3:36:42] [ETA: 0:57:36] [loss: 3.905] [tokens/s: 119997.150] [batches/s: 0.057] [MFU: 0.000] [TFLOPS: 0.000]
209
+ [2025-10-17 07:52:34][utils:57][INFO] [P: 80.00%] [S: 1677721600/2097152000] [T: 3:37:47] [ETA: 0:54:26] [loss: 3.888] [tokens/s: 133977.546] [batches/s: 0.064] [MFU: 0.000] [TFLOPS: 0.000]
210
+ [2025-10-17 07:52:34][logger:171][INFO] [step: 1677721600] [train_eval/train_token_count: 1677721600] [train_eval/train_batch_count: 800] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 13067.235] [train_eval/train_update_time: 5947.200] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.913] [train_eval/perplexity_len_2048: 50.036] [train_eval/loss_avg_len_1024: 3.970] [train_eval/perplexity_len_1024: 52.994] [train_eval/loss_avg_len_512: 4.056] [train_eval/perplexity_len_512: 57.765]
211
+ [2025-10-17 07:52:34][train:194][INFO] Running validation...
212
+ [2025-10-17 07:55:26][logger:171][INFO] [step: 1677721600] [val/train_token_count: 1677721600] [val/train_batch_count: 800] [val/train_flop_count: 0] [val/train_total_time: 13067.235] [val/train_update_time: 5947.200] [val/loss: 3.908] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 171.970] [val/val_tokens_per_second: 238181.535] [val/loss_avg_len_2048: 3.908] [val/perplexity_len_2048: 49.775] [val/loss_avg_len_1024: 3.967] [val/perplexity_len_1024: 52.835] [val/loss_avg_len_512: 4.055] [val/perplexity_len_512: 57.684]
213
+ [2025-10-17 07:55:26][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_3_4_256/checkpoints/step-000001677721600.pt...
214
+ [2025-10-17 07:55:26][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_3_4_256/checkpoints/step-000001677721600.pt.
215
+ [2025-10-17 07:55:26][logger:171][INFO] [step: 1677721600] [checkpoint/checkpoint_time: 0.700]
216
+ [2025-10-17 07:57:02][utils:57][INFO] [P: 81.00%] [S: 1698693120/2097152000] [T: 3:42:15] [ETA: 0:52:07] [loss: 3.867] [tokens/s: 120566.207] [batches/s: 0.057] [MFU: 0.000] [TFLOPS: 0.000]
217
+ [2025-10-17 07:58:37][utils:57][INFO] [P: 82.00%] [S: 1719664640/2097152000] [T: 3:43:50] [ETA: 0:49:08] [loss: 3.877] [tokens/s: 133721.343] [batches/s: 0.064] [MFU: 0.000] [TFLOPS: 0.000]
218
+ [2025-10-17 07:58:37][train:194][INFO] Running validation...
219
+ [2025-10-17 08:01:14][logger:171][INFO] [step: 1719664640] [val/train_token_count: 1719664640] [val/train_batch_count: 820] [val/train_flop_count: 0] [val/train_total_time: 13430.399] [val/train_update_time: 6137.126] [val/loss: 3.902] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 156.546] [val/val_tokens_per_second: 261648.012] [val/loss_avg_len_2048: 3.902] [val/perplexity_len_2048: 49.519] [val/loss_avg_len_1024: 3.962] [val/perplexity_len_1024: 52.586] [val/loss_avg_len_512: 4.051] [val/perplexity_len_512: 57.429]
220
+ [2025-10-17 08:02:43][utils:57][INFO] [P: 83.00%] [S: 1740636160/2097152000] [T: 3:47:55] [ETA: 0:46:41] [loss: 3.914] [tokens/s: 119874.242] [batches/s: 0.057] [MFU: 0.000] [TFLOPS: 0.000]
221
+ [2025-10-17 08:04:18][utils:57][INFO] [P: 84.00%] [S: 1761607680/2097152000] [T: 3:49:31] [ETA: 0:43:43] [loss: 3.850] [tokens/s: 135247.845] [batches/s: 0.064] [MFU: 0.000] [TFLOPS: 0.000]
222
+ [2025-10-17 08:04:18][train:194][INFO] Running validation...
223
+ [2025-10-17 08:07:19][logger:171][INFO] [step: 1761607680] [val/train_token_count: 1761607680] [val/train_batch_count: 840] [val/train_flop_count: 0] [val/train_total_time: 13771.156] [val/train_update_time: 6320.664] [val/loss: 3.898] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 181.487] [val/val_tokens_per_second: 225690.553] [val/loss_avg_len_2048: 3.898] [val/perplexity_len_2048: 49.294] [val/loss_avg_len_1024: 3.958] [val/perplexity_len_1024: 52.371] [val/loss_avg_len_512: 4.047] [val/perplexity_len_512: 57.225]
224
+ [2025-10-17 08:08:24][utils:57][INFO] [P: 85.00%] [S: 1782579200/2097152000] [T: 3:53:37] [ETA: 0:41:13] [loss: 3.926] [tokens/s: 120953.198] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
225
+ [2025-10-17 08:08:24][logger:171][INFO] [step: 1782579200] [train_eval/train_token_count: 1782579200] [train_eval/train_batch_count: 850] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 14017.666] [train_eval/train_update_time: 6385.368] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.892] [train_eval/perplexity_len_2048: 48.989] [train_eval/loss_avg_len_1024: 3.943] [train_eval/perplexity_len_1024: 51.588] [train_eval/loss_avg_len_512: 4.030] [train_eval/perplexity_len_512: 56.273]
226
+ [2025-10-17 08:09:46][utils:57][INFO] [P: 86.00%] [S: 1803550720/2097152000] [T: 3:54:59] [ETA: 0:38:15] [loss: 3.910] [tokens/s: 137199.106] [batches/s: 0.065] [MFU: 0.000] [TFLOPS: 0.000]
227
+ [2025-10-17 08:09:46][train:194][INFO] Running validation...
228
+ [2025-10-17 08:13:14][logger:171][INFO] [step: 1803550720] [val/train_token_count: 1803550720] [val/train_batch_count: 860] [val/train_flop_count: 0] [val/train_total_time: 14099.153] [val/train_update_time: 6466.660] [val/loss: 3.894] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 208.278] [val/val_tokens_per_second: 196659.979] [val/loss_avg_len_2048: 3.894] [val/perplexity_len_2048: 49.121] [val/loss_avg_len_1024: 3.955] [val/perplexity_len_1024: 52.191] [val/loss_avg_len_512: 4.044] [val/perplexity_len_512: 57.031]
229
+ [2025-10-17 08:14:19][utils:57][INFO] [P: 87.00%] [S: 1824522240/2097152000] [T: 3:59:32] [ETA: 0:35:47] [loss: 3.852] [tokens/s: 120594.118] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
230
+ [2025-10-17 08:15:24][utils:57][INFO] [P: 88.00%] [S: 1845493760/2097152000] [T: 4:00:37] [ETA: 0:32:48] [loss: 3.864] [tokens/s: 136656.993] [batches/s: 0.065] [MFU: 0.000] [TFLOPS: 0.000]
231
+ [2025-10-17 08:15:24][train:194][INFO] Running validation...
232
+ [2025-10-17 08:19:02][logger:171][INFO] [step: 1845493760] [val/train_token_count: 1845493760] [val/train_batch_count: 880] [val/train_flop_count: 0] [val/train_total_time: 14437.422] [val/train_update_time: 6596.150] [val/loss: 3.891] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 217.734] [val/val_tokens_per_second: 188119.265] [val/loss_avg_len_2048: 3.891] [val/perplexity_len_2048: 48.984] [val/loss_avg_len_1024: 3.952] [val/perplexity_len_1024: 52.052] [val/loss_avg_len_512: 4.041] [val/perplexity_len_512: 56.889]
233
+ [2025-10-17 08:20:09][utils:57][INFO] [P: 89.00%] [S: 1866465280/2097152000] [T: 4:05:22] [ETA: 0:30:19] [loss: 3.933] [tokens/s: 121142.884] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
234
+ [2025-10-17 08:21:14][utils:57][INFO] [P: 90.00%] [S: 1887436800/2097152000] [T: 4:06:27] [ETA: 0:27:23] [loss: 3.842] [tokens/s: 134980.044] [batches/s: 0.064] [MFU: 0.000] [TFLOPS: 0.000]
235
+ [2025-10-17 08:21:14][logger:171][INFO] [step: 1887436800] [train_eval/train_token_count: 1887436800] [train_eval/train_batch_count: 900] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 14787.652] [train_eval/train_update_time: 6728.264] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.888] [train_eval/perplexity_len_2048: 48.796] [train_eval/loss_avg_len_1024: 3.943] [train_eval/perplexity_len_1024: 51.566] [train_eval/loss_avg_len_512: 4.030] [train_eval/perplexity_len_512: 56.286]
236
+ [2025-10-17 08:21:14][train:194][INFO] Running validation...
237
+ [2025-10-17 08:24:25][logger:171][INFO] [step: 1887436800] [val/train_token_count: 1887436800] [val/train_batch_count: 900] [val/train_flop_count: 0] [val/train_total_time: 14787.652] [val/train_update_time: 6728.264] [val/loss: 3.889] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 190.866] [val/val_tokens_per_second: 214601.094] [val/loss_avg_len_2048: 3.889] [val/perplexity_len_2048: 48.877] [val/loss_avg_len_1024: 3.950] [val/perplexity_len_1024: 51.939] [val/loss_avg_len_512: 4.039] [val/perplexity_len_512: 56.767]
238
+ [2025-10-17 08:24:25][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_3_4_256/checkpoints/step-000001887436800.pt...
239
+ [2025-10-17 08:24:26][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_3_4_256/checkpoints/step-000001887436800.pt.
240
+ [2025-10-17 08:24:26][logger:171][INFO] [step: 1887436800] [checkpoint/checkpoint_time: 0.698]
241
+ [2025-10-17 08:26:01][utils:57][INFO] [P: 91.00%] [S: 1908408320/2097152000] [T: 4:11:14] [ETA: 0:24:50] [loss: 3.886] [tokens/s: 120025.584] [batches/s: 0.057] [MFU: 0.000] [TFLOPS: 0.000]
242
+ [2025-10-17 08:27:16][utils:57][INFO] [P: 92.00%] [S: 1929379840/2097152000] [T: 4:12:29] [ETA: 0:21:57] [loss: 3.901] [tokens/s: 133418.077] [batches/s: 0.064] [MFU: 0.000] [TFLOPS: 0.000]
243
+ [2025-10-17 08:27:16][train:194][INFO] Running validation...
244
+ [2025-10-17 08:30:00][logger:171][INFO] [step: 1929379840] [val/train_token_count: 1929379840] [val/train_batch_count: 920] [val/train_flop_count: 0] [val/train_total_time: 15149.747] [val/train_update_time: 6898.239] [val/loss: 3.888] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 163.659] [val/val_tokens_per_second: 250276.495] [val/loss_avg_len_2048: 3.888] [val/perplexity_len_2048: 48.803] [val/loss_avg_len_1024: 3.949] [val/perplexity_len_1024: 51.865] [val/loss_avg_len_512: 4.038] [val/perplexity_len_512: 56.691]
245
+ [2025-10-17 08:31:35][utils:57][INFO] [P: 93.00%] [S: 1950351360/2097152000] [T: 4:16:48] [ETA: 0:19:19] [loss: 3.893] [tokens/s: 120493.603] [batches/s: 0.057] [MFU: 0.000] [TFLOPS: 0.000]
246
+ [2025-10-17 08:33:10][utils:57][INFO] [P: 94.00%] [S: 1971322880/2097152000] [T: 4:18:23] [ETA: 0:16:29] [loss: 3.856] [tokens/s: 134423.613] [batches/s: 0.064] [MFU: 0.000] [TFLOPS: 0.000]
247
+ [2025-10-17 08:33:10][train:194][INFO] Running validation...
248
+ [2025-10-17 08:35:55][logger:171][INFO] [step: 1971322880] [val/train_token_count: 1971322880] [val/train_batch_count: 940] [val/train_flop_count: 0] [val/train_total_time: 15503.806] [val/train_update_time: 7088.071] [val/loss: 3.887] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 164.750] [val/val_tokens_per_second: 248618.649] [val/loss_avg_len_2048: 3.887] [val/perplexity_len_2048: 48.751] [val/loss_avg_len_1024: 3.948] [val/perplexity_len_1024: 51.811] [val/loss_avg_len_512: 4.037] [val/perplexity_len_512: 56.631]
249
+ [2025-10-17 08:37:12][utils:57][INFO] [P: 95.00%] [S: 1992294400/2097152000] [T: 4:22:24] [ETA: 0:13:48] [loss: 3.864] [tokens/s: 120658.642] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
250
+ [2025-10-17 08:37:12][logger:171][INFO] [step: 1992294400] [train_eval/train_token_count: 1992294400] [train_eval/train_batch_count: 950] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 15744.901] [train_eval/train_update_time: 7164.085] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.877] [train_eval/perplexity_len_2048: 48.260] [train_eval/loss_avg_len_1024: 3.935] [train_eval/perplexity_len_1024: 51.170] [train_eval/loss_avg_len_512: 4.022] [train_eval/perplexity_len_512: 55.813]
251
+ [2025-10-17 08:38:47][utils:57][INFO] [P: 96.00%] [S: 2013265920/2097152000] [T: 4:24:00] [ETA: 0:11:00] [loss: 3.865] [tokens/s: 136024.768] [batches/s: 0.065] [MFU: 0.000] [TFLOPS: 0.000]
252
+ [2025-10-17 08:38:47][train:194][INFO] Running validation...
253
+ [2025-10-17 08:41:58][logger:171][INFO] [step: 2013265920] [val/train_token_count: 2013265920] [val/train_batch_count: 960] [val/train_flop_count: 0] [val/train_total_time: 15840.400] [val/train_update_time: 7259.235] [val/loss: 3.886] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 190.497] [val/val_tokens_per_second: 215016.751] [val/loss_avg_len_2048: 3.886] [val/perplexity_len_2048: 48.722] [val/loss_avg_len_1024: 3.947] [val/perplexity_len_1024: 51.784] [val/loss_avg_len_512: 4.036] [val/perplexity_len_512: 56.606]
254
+ [2025-10-17 08:43:02][utils:57][INFO] [P: 97.00%] [S: 2034237440/2097152000] [T: 4:28:15] [ETA: 0:08:17] [loss: 3.895] [tokens/s: 120923.145] [batches/s: 0.058] [MFU: 0.000] [TFLOPS: 0.000]
255
+ [2025-10-17 08:44:11][utils:57][INFO] [P: 98.00%] [S: 2055208960/2097152000] [T: 4:29:24] [ETA: 0:05:29] [loss: 3.868] [tokens/s: 138434.720] [batches/s: 0.066] [MFU: 0.000] [TFLOPS: 0.000]
256
+ [2025-10-17 08:44:11][train:194][INFO] Running validation...
257
+ [2025-10-17 08:47:48][logger:171][INFO] [step: 2055208960] [val/train_token_count: 2055208960] [val/train_batch_count: 980] [val/train_flop_count: 0] [val/train_total_time: 16164.017] [val/train_update_time: 7391.848] [val/loss: 3.886] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 217.613] [val/val_tokens_per_second: 188224.392] [val/loss_avg_len_2048: 3.886] [val/perplexity_len_2048: 48.709] [val/loss_avg_len_1024: 3.947] [val/perplexity_len_1024: 51.770] [val/loss_avg_len_512: 4.036] [val/perplexity_len_512: 56.592]
258
+ [2025-10-17 08:47:48][train:854][INFO] Training finished with 2055208960 tokens!
metrics/jsonlines/checkpoint.jsonl ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {"step": 209715200, "checkpoint/checkpoint_time": 0.6505984560353681}
2
+ {"step": 419430400, "checkpoint/checkpoint_time": 0.6847445101011544}
3
+ {"step": 629145600, "checkpoint/checkpoint_time": 0.7160250260494649}
4
+ {"step": 838860800, "checkpoint/checkpoint_time": 0.6718022819841281}
5
+ {"step": 1048576000, "checkpoint/checkpoint_time": 0.6803000940708444}
6
+ {"step": 1258291200, "checkpoint/checkpoint_time": 0.6831371689913794}
7
+ {"step": 1468006400, "checkpoint/checkpoint_time": 0.687845746986568}
8
+ {"step": 1677721600, "checkpoint/checkpoint_time": 0.6998186240671203}
9
+ {"step": 1887436800, "checkpoint/checkpoint_time": 0.6979976670118049}
metrics/jsonlines/model_info.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 0, "model_info/total_params": 28302604, "model_info/trainable_params": 28302604, "model_info/embedding_params": 12870912, "model_info/flops_per_token": 0, "model_info/non_embedding_params": 15431692}
metrics/jsonlines/norm.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
metrics/jsonlines/resume.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 0, "resume/resume_step": 0}
metrics/jsonlines/throughput.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
metrics/jsonlines/train.jsonl ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 43.060654881061055, "train/update_time": 42.84856698883232, "train/lr": 0.0009000000000000001, "train/loss": 9.742526054382324, "train/global_grad_norm": 1.2064119577407837}
2
+ {"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 82.20607728499454, "train/update_time": 81.85936604591552, "train/lr": 0.0009997960964140947, "train/loss": 8.093623161315918, "train/global_grad_norm": 0.9788451790809631}
3
+ {"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 217.2982500520302, "train/update_time": 120.85350854764692, "train/lr": 0.0009990914580222257, "train/loss": 7.450771808624268, "train/global_grad_norm": 0.34447360038757324}
4
+ {"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 256.55152114806697, "train/update_time": 159.95920802967157, "train/lr": 0.0009978842768382998, "train/loss": 7.1170334815979, "train/global_grad_norm": 0.256023108959198}
5
+ {"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 390.50471019698307, "train/update_time": 198.95112687454093, "train/lr": 0.0009961757683914405, "train/loss": 6.883033275604248, "train/global_grad_norm": 0.3945038616657257}
6
+ {"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 429.63876787002664, "train/update_time": 237.9422758014407, "train/lr": 0.00099396765300483, "train/loss": 6.625162124633789, "train/global_grad_norm": 0.37001708149909973}
7
+ {"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 563.8872604409698, "train/update_time": 277.0238818913931, "train/lr": 0.0009912621540634887, "train/loss": 6.4313578605651855, "train/global_grad_norm": 0.3596683442592621}
8
+ {"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 603.0225816690363, "train/update_time": 316.03332217247225, "train/lr": 0.000988061995775515, "train/loss": 6.250260353088379, "train/global_grad_norm": 0.5427169799804688}
9
+ {"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 883.4803289039992, "train/update_time": 380.7255741544068, "train/lr": 0.0009843704004290394, "train/loss": 6.0518107414245605, "train/global_grad_norm": 0.43748435378074646}
10
+ {"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 948.4334847090067, "train/update_time": 445.5143054035725, "train/lr": 0.0009801910851476522, "train/loss": 5.913501262664795, "train/global_grad_norm": 0.635853111743927}
11
+ {"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 1240.6968032299774, "train/update_time": 540.5405412946129, "train/lr": 0.0009755282581475768, "train/loss": 5.784536361694336, "train/global_grad_norm": 0.7734508514404297}
12
+ {"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 1307.4146166159771, "train/update_time": 606.9371437055524, "train/lr": 0.0009703866145003512, "train/loss": 5.628347873687744, "train/global_grad_norm": 0.3913114666938782}
13
+ {"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 1573.652350184042, "train/update_time": 701.7780046195257, "train/lr": 0.0009647713314052896, "train/loss": 5.573921203613281, "train/global_grad_norm": 0.6603513956069946}
14
+ {"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 1668.9536387879634, "train/update_time": 796.7456705565564, "train/lr": 0.0009586880629764817, "train/loss": 5.481806755065918, "train/global_grad_norm": 0.4060311019420624}
15
+ {"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 1912.9173058250453, "train/update_time": 882.7945512202568, "train/lr": 0.0009521429345495787, "train/loss": 5.357491970062256, "train/global_grad_norm": 0.5337135195732117}
16
+ {"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 2008.152192622074, "train/update_time": 977.6825634960551, "train/lr": 0.0009451425365140996, "train/loss": 5.320371150970459, "train/global_grad_norm": 0.6065119504928589}
17
+ {"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 2256.581265029032, "train/update_time": 1042.3986120790942, "train/lr": 0.000937693917677468, "train/loss": 5.213259696960449, "train/global_grad_norm": 0.5299382209777832}
18
+ {"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 2335.2169849510537, "train/update_time": 1120.8591244373238, "train/lr": 0.0009298045781674596, "train/loss": 5.182425022125244, "train/global_grad_norm": 0.5129911303520203}
19
+ {"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 2610.4178057450335, "train/update_time": 1185.6043002393562, "train/lr": 0.0009214824618802108, "train/loss": 5.152604579925537, "train/global_grad_norm": 0.6535936594009399}
20
+ {"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 2675.1611660569906, "train/update_time": 1250.1728800584096, "train/lr": 0.000912735948481387, "train/loss": 5.060560703277588, "train/global_grad_norm": 0.5272890329360962}
21
+ {"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 2961.40797771106, "train/update_time": 1321.067315995344, "train/lr": 0.0009035738449685707, "train/loss": 5.009044170379639, "train/global_grad_norm": 0.5572112202644348}
22
+ {"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 3026.331188999, "train/update_time": 1385.8219667483354, "train/lr": 0.0008940053768033609, "train/loss": 4.982542037963867, "train/global_grad_norm": 0.8324569463729858}
23
+ {"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 3309.8893430039752, "train/update_time": 1480.7058881375706, "train/lr": 0.0008840401786221159, "train/loss": 4.917466640472412, "train/global_grad_norm": 0.4935317635536194}
24
+ {"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 3389.030513621052, "train/update_time": 1559.5160488487454, "train/lr": 0.0008736882845346905, "train/loss": 4.865272045135498, "train/global_grad_norm": 0.5209423899650574}
25
+ {"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 3647.1437564629596, "train/update_time": 1654.5446031297324, "train/lr": 0.0008629601180209381, "train/loss": 4.85551118850708, "train/global_grad_norm": 0.49810776114463806}
26
+ {"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 3742.4888608519686, "train/update_time": 1749.5527433896204, "train/lr": 0.0008518664814351503, "train/loss": 4.8105621337890625, "train/global_grad_norm": 0.7073996663093567}
27
+ {"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 3982.554893324035, "train/update_time": 1823.2195517455693, "train/lr": 0.0008404185451290017, "train/loss": 4.787278175354004, "train/global_grad_norm": 0.47247517108917236}
28
+ {"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 4077.975462540984, "train/update_time": 1918.29994701338, "train/lr": 0.0008286278362039527, "train/loss": 4.737539291381836, "train/global_grad_norm": 0.7282879948616028}
29
+ {"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 4335.1425057089655, "train/update_time": 1982.9610430714674, "train/lr": 0.0008165062269044352, "train/loss": 4.703302383422852, "train/global_grad_norm": 0.5245164036750793}
30
+ {"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 4400.52537979302, "train/update_time": 2048.153126588324, "train/lr": 0.0008040659226635089, "train/loss": 4.673452854156494, "train/global_grad_norm": 0.65628981590271}
31
+ {"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 4685.726247773971, "train/update_time": 2112.8967634125147, "train/lr": 0.0007913194498130252, "train/loss": 4.68710470199585, "train/global_grad_norm": 0.46741145849227905}
32
+ {"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 4750.569092386053, "train/update_time": 2177.56818451453, "train/lr": 0.000778279642970672, "train/loss": 4.611921310424805, "train/global_grad_norm": 0.573194146156311}
33
+ {"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 5040.420318357064, "train/update_time": 2261.327930470463, "train/lr": 0.0007649596321166025, "train/loss": 4.6267900466918945, "train/global_grad_norm": 0.524635910987854}
34
+ {"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 5105.305374011979, "train/update_time": 2326.03590223263, "train/lr": 0.0007513728293726579, "train/loss": 4.575213432312012, "train/global_grad_norm": 0.5117322206497192}
35
+ {"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 5380.30665975099, "train/update_time": 2420.687880878686, "train/lr": 0.0007375329154974975, "train/loss": 4.554539203643799, "train/global_grad_norm": 0.5882130265235901}
36
+ {"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 5471.657564450987, "train/update_time": 2511.68601841887, "train/lr": 0.0007234538261112341, "train/loss": 4.479025363922119, "train/global_grad_norm": 0.620008647441864}
37
+ {"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 5721.779677586979, "train/update_time": 2606.663688934641, "train/lr": 0.0007091497376634464, "train/loss": 4.493178367614746, "train/global_grad_norm": 0.5966338515281677}
38
+ {"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 5816.983929526061, "train/update_time": 2701.5302643649047, "train/lr": 0.0006946350531586958, "train/loss": 4.466251373291016, "train/global_grad_norm": 0.8487857580184937}
39
+ {"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 6056.710990026011, "train/update_time": 2766.2267534437124, "train/lr": 0.0006799243876539214, "train/loss": 4.453250408172607, "train/global_grad_norm": 0.5597178339958191}
40
+ {"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 6148.81527663907, "train/update_time": 2858.1190680917352, "train/lr": 0.0006650325535423166, "train/loss": 4.343719959259033, "train/global_grad_norm": 0.5185702443122864}
41
+ {"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 6414.913468671963, "train/update_time": 2922.915099428734, "train/lr": 0.0006499745456385053, "train/loss": 4.359192371368408, "train/global_grad_norm": 0.5307362675666809}
42
+ {"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 6479.875744563062, "train/update_time": 2987.694357936969, "train/lr": 0.0006347655260800339, "train/loss": 4.360560894012451, "train/global_grad_norm": 1.0372467041015625}
43
+ {"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 6764.786123692058, "train/update_time": 3052.482261945028, "train/lr": 0.0006194208090603844, "train/loss": 4.333981990814209, "train/global_grad_norm": 0.49141013622283936}
44
+ {"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 6829.788738840027, "train/update_time": 3117.297304859967, "train/lr": 0.0006039558454088796, "train/loss": 4.344142913818359, "train/global_grad_norm": 0.5465747117996216}
45
+ {"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 7122.763824166963, "train/update_time": 3212.485016722814, "train/lr": 0.0005883862070330078, "train/loss": 4.2783589363098145, "train/global_grad_norm": 0.5123881101608276}
46
+ {"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 7188.301673018024, "train/update_time": 3277.705760344048, "train/lr": 0.0005727275712388317, "train/loss": 4.238135814666748, "train/global_grad_norm": 0.6535881757736206}
47
+ {"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 7455.309935454046, "train/update_time": 3372.5882673548767, "train/lr": 0.0005569957049452703, "train/loss": 4.2355146408081055, "train/global_grad_norm": 0.7211827635765076}
48
+ {"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 7550.796140610008, "train/update_time": 3467.7480247288477, "train/lr": 0.0005412064488081482, "train/loss": 4.212989330291748, "train/global_grad_norm": 0.6699090003967285}
49
+ {"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 7795.362897483981, "train/update_time": 3554.937167952885, "train/lr": 0.0005253757012699972, "train/loss": 4.206099033355713, "train/global_grad_norm": 0.5892317891120911}
50
+ {"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 7890.565265796031, "train/update_time": 3649.793015780393, "train/lr": 0.0005095194025516734, "train/loss": 4.167843818664551, "train/global_grad_norm": 0.6354329586029053}
51
+ {"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 8138.991956859012, "train/update_time": 3714.5668725935975, "train/lr": 0.0004936535186019053, "train/loss": 4.145269393920898, "train/global_grad_norm": 0.6342147588729858}
52
+ {"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 8219.336814008071, "train/update_time": 3794.7164617268136, "train/lr": 0.00047779402502093696, "train/loss": 4.136696815490723, "train/global_grad_norm": 0.6562011241912842}
53
+ {"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 8493.090819464996, "train/update_time": 3859.435854826006, "train/lr": 0.0004619568909744525, "train/loss": 4.1017913818359375, "train/global_grad_norm": 0.516792893409729}
54
+ {"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 8557.932057484053, "train/update_time": 3924.0804488240974, "train/lr": 0.00044615806311398067, "train/loss": 4.113844871520996, "train/global_grad_norm": 0.7070318460464478}
55
+ {"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 8843.699362242012, "train/update_time": 3992.2848253480624, "train/lr": 0.0004304134495199673, "train/loss": 4.053966999053955, "train/global_grad_norm": 0.6166961789131165}
56
+ {"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 8908.472957947059, "train/update_time": 4056.885919503169, "train/lr": 0.0004147389036836882, "train/loss": 4.0940752029418945, "train/global_grad_norm": 0.4245615601539612}
57
+ {"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 9192.107095155981, "train/update_time": 4151.851490042289, "train/lr": 0.0003991502085441259, "train/loss": 4.023417949676514, "train/global_grad_norm": 0.8481374382972717}
58
+ {"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 9270.280366463005, "train/update_time": 4229.695196601329, "train/lr": 0.0003836630605958888, "train/loss": 4.07583475112915, "train/global_grad_norm": 0.6015454530715942}
59
+ {"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 9528.824100063066, "train/update_time": 4324.942309651291, "train/lr": 0.00036829305408417155, "train/loss": 4.073826789855957, "train/global_grad_norm": 0.5151322484016418}
60
+ {"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 9624.041267768014, "train/update_time": 4419.823843153426, "train/lr": 0.000353055665302672, "train/loss": 4.070618629455566, "train/global_grad_norm": 0.5760351419448853}
61
+ {"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 9865.595853073057, "train/update_time": 4495.119489662466, "train/lr": 0.0003379662370102746, "train/loss": 4.029588222503662, "train/global_grad_norm": 0.5814235210418701}
62
+ {"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 9960.90240954503, "train/update_time": 4590.094068991486, "train/lr": 0.00032303996298219405, "train/loss": 4.002981185913086, "train/global_grad_norm": 0.5200217366218567}
63
+ {"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 10217.011305541033, "train/update_time": 4654.903231042321, "train/lr": 0.00030829187271113034, "train/loss": 4.015390872955322, "train/global_grad_norm": 0.4751983880996704}
64
+ {"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 10284.48041301698, "train/update_time": 4722.194782613544, "train/lr": 0.0002937368162738445, "train/loss": 4.001778602600098, "train/global_grad_norm": 0.5075345635414124}
65
+ {"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 10567.73318944103, "train/update_time": 4787.056737843552, "train/lr": 0.0002793894493783894, "train/loss": 3.9701263904571533, "train/global_grad_norm": 0.6821728944778442}
66
+ {"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 10632.616937980056, "train/update_time": 4851.760493494687, "train/lr": 0.00026526421860705474, "train/loss": 4.002848148345947, "train/global_grad_norm": 0.4041655659675598}
67
+ {"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 10922.120217734016, "train/update_time": 4932.641111404984, "train/lr": 0.0002513753468698824, "train/loss": 3.9402196407318115, "train/global_grad_norm": 0.4113616943359375}
68
+ {"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 10987.034303960972, "train/update_time": 4997.368006910896, "train/lr": 0.00023773681908340283, "train/loss": 3.9576096534729004, "train/global_grad_norm": 0.48851215839385986}
69
+ {"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 11262.41889945697, "train/update_time": 5092.274435500731, "train/lr": 0.00022436236808900823, "train/loss": 3.968796491622925, "train/global_grad_norm": 0.44083112478256226}
70
+ {"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 11352.440668291994, "train/update_time": 5181.94938044867, "train/lr": 0.00021126546082514682, "train/loss": 3.9615118503570557, "train/global_grad_norm": 0.509088933467865}
71
+ {"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 11603.308744725073, "train/update_time": 5277.016342979507, "train/lr": 0.00019845928476725522, "train/loss": 3.9665751457214355, "train/global_grad_norm": 0.4311610460281372}
72
+ {"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 11698.40989001107, "train/update_time": 5371.777714943513, "train/lr": 0.0001859567346490913, "train/loss": 3.9346354007720947, "train/global_grad_norm": 0.36381229758262634}
73
+ {"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 11936.098186857998, "train/update_time": 5436.443360454286, "train/lr": 0.00017377039947882782, "train/loss": 3.957932233810425, "train/global_grad_norm": 0.4258624017238617}
74
+ {"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 12029.831404851982, "train/update_time": 5529.962416259223, "train/lr": 0.00016191254986299043, "train/loss": 3.928053379058838, "train/global_grad_norm": 0.4268289804458618}
75
+ {"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 12294.615799573017, "train/update_time": 5594.8495038923575, "train/lr": 0.00015039512565099468, "train/loss": 3.922907829284668, "train/global_grad_norm": 0.36265134811401367}
76
+ {"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 12359.526009666035, "train/update_time": 5659.566051078378, "train/lr": 0.00013922972391273224, "train/loss": 3.8805909156799316, "train/global_grad_norm": 0.36658549308776855}
77
+ {"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 12644.348519637017, "train/update_time": 5724.354619655525, "train/lr": 0.00012842758726130281, "train/loss": 3.951104164123535, "train/global_grad_norm": 0.38930419087409973}
78
+ {"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 12709.197320922045, "train/update_time": 5789.02060880058, "train/lr": 0.00011799959253265679, "train/loss": 3.8884987831115723, "train/global_grad_norm": 0.36397334933280945}
79
+ {"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 13002.292042226996, "train/update_time": 5882.437751233345, "train/lr": 0.00010795623983354214, "train/loss": 3.9047420024871826, "train/global_grad_norm": 0.37097853422164917}
80
+ {"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 13067.235377178993, "train/update_time": 5947.199882825487, "train/lr": 9.830764196878872e-05, "train/loss": 3.888375759124756, "train/global_grad_norm": 0.3208369314670563}
81
+ {"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 13335.021598904976, "train/update_time": 6042.097219433286, "train/lr": 8.906351425856951e-05, "train/loss": 3.8670456409454346, "train/global_grad_norm": 0.35861602425575256}
82
+ {"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 13430.399093109067, "train/update_time": 6137.126117701177, "train/lr": 8.02331647558977e-05, "train/loss": 3.8770906925201416, "train/global_grad_norm": 0.264740914106369}
83
+ {"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 13675.972462569014, "train/update_time": 6225.810343745281, "train/lr": 7.182548487420554e-05, "train/loss": 3.9138388633728027, "train/global_grad_norm": 0.3208444118499756}
84
+ {"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 13771.155967495986, "train/update_time": 6320.664257825236, "train/lr": 6.384894043444556e-05, "train/loss": 3.849836587905884, "train/global_grad_norm": 0.327322393655777}
85
+ {"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 14017.66551920597, "train/update_time": 6385.36848655506, "train/lr": 5.6311563140726166e-05, "train/loss": 3.9262607097625732, "train/global_grad_norm": 0.3086305260658264}
86
+ {"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 14099.15276560001, "train/update_time": 6466.659589443123, "train/lr": 4.922094249306547e-05, "train/loss": 3.909940242767334, "train/global_grad_norm": 0.25480327010154724}
87
+ {"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 14372.461564740981, "train/update_time": 6531.371863001143, "train/lr": 4.2584218145409916e-05, "train/loss": 3.8515701293945312, "train/global_grad_norm": 0.26658812165260315}
88
+ {"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 14437.422251214972, "train/update_time": 6596.149574242183, "train/lr": 3.6408072716606236e-05, "train/loss": 3.8637194633483887, "train/global_grad_norm": 0.2613544166088104}
89
+ {"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 14722.650678995997, "train/update_time": 6663.450256440206, "train/lr": 3.069872506157217e-05, "train/loss": 3.933227300643921, "train/global_grad_norm": 0.24726922810077667}
90
+ {"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 14787.651788089075, "train/update_time": 6728.263910608133, "train/lr": 2.5461924009435368e-05, "train/loss": 3.8415422439575195, "train/global_grad_norm": 0.23274295032024384}
91
+ {"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 15074.545659936033, "train/update_time": 6823.377995942952, "train/lr": 2.0702942574950812e-05, "train/loss": 3.8856899738311768, "train/global_grad_norm": 0.22777116298675537}
92
+ {"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 15149.746909548994, "train/update_time": 6898.238976031076, "train/lr": 1.642657264902142e-05, "train/loss": 3.900519609451294, "train/global_grad_norm": 0.20975108444690704}
93
+ {"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 15408.68530230096, "train/update_time": 6993.2997437083395, "train/lr": 1.2637120173670358e-05, "train/loss": 3.8927230834960938, "train/global_grad_norm": 0.2100864201784134}
94
+ {"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 15503.805718103074, "train/update_time": 7088.071415620041, "train/lr": 9.338400806321978e-06, "train/loss": 3.8557724952697754, "train/global_grad_norm": 0.2212178111076355}
95
+ {"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 15744.901314089075, "train/update_time": 7164.0845861389535, "train/lr": 6.533736077758867e-06, "train/loss": 3.863615036010742, "train/global_grad_norm": 0.19669552147388458}
96
+ {"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 15840.40012625407, "train/update_time": 7259.235320164589, "train/lr": 4.2259500476214406e-06, "train/loss": 3.86468768119812, "train/global_grad_norm": 0.1998816430568695}
97
+ {"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 16095.937076770002, "train/update_time": 7323.949722252553, "train/lr": 2.417366460819359e-06, "train/loss": 3.8953840732574463, "train/global_grad_norm": 0.1937154233455658}
98
+ {"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 16164.016768118017, "train/update_time": 7391.847917348379, "train/lr": 1.1098064077174619e-06, "train/loss": 3.868272066116333, "train/global_grad_norm": 0.1938706636428833}
metrics/jsonlines/train_data_info.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 0, "train_data_info/vocab_size": 50277, "train_data_info/global_tokens_per_batch": 2097152, "train_data_info/local_tokens_per_batch": 2097152, "train_data_info/batch_len": 2048, "train_data_info/seq_len": 2048, "train_data_info/total_tokens": 2055208960, "train_data_info/global_batch_size": 1024, "train_data_info/local_batch_size": 1024}
metrics/jsonlines/train_eval.jsonl ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 390.50471019698307, "train_eval/train_update_time": 198.95112687454093, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.220428141422927, "train_eval/perplexity_len_2048": 3716.093055338976, "train_eval/loss_avg_len_1024": 8.220684633320126, "train_eval/perplexity_len_1024": 3717.046325344697, "train_eval/loss_avg_len_512": 8.220186968173365, "train_eval/perplexity_len_512": 3715.1969411648747}
2
+ {"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 948.4334847090067, "train_eval/train_update_time": 445.5143054035725, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.349791028721629, "train_eval/perplexity_len_2048": 572.3730869796306, "train_eval/loss_avg_len_1024": 6.352041424442651, "train_eval/perplexity_len_1024": 573.6626033422435, "train_eval/loss_avg_len_512": 6.355051661818289, "train_eval/perplexity_len_512": 575.3920656919684}
3
+ {"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1912.9173058250453, "train_eval/train_update_time": 882.7945512202568, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.6142453497615135, "train_eval/perplexity_len_2048": 274.30629575591456, "train_eval/loss_avg_len_1024": 5.618545619335018, "train_eval/perplexity_len_1024": 275.4884266925292, "train_eval/loss_avg_len_512": 5.627640858957238, "train_eval/perplexity_len_512": 278.0054892383082}
4
+ {"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2675.1611660569906, "train_eval/train_update_time": 1250.1728800584096, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.210239426671869, "train_eval/perplexity_len_2048": 183.1379010430748, "train_eval/loss_avg_len_1024": 5.2175741658795225, "train_eval/perplexity_len_1024": 184.4861081156239, "train_eval/loss_avg_len_512": 5.230584617780041, "train_eval/perplexity_len_512": 186.90203784134167}
5
+ {"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3647.1437564629596, "train_eval/train_update_time": 1654.5446031297324, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.950060198554256, "train_eval/perplexity_len_2048": 141.1834627060068, "train_eval/loss_avg_len_1024": 4.957617836368627, "train_eval/perplexity_len_1024": 142.25451841003058, "train_eval/loss_avg_len_512": 4.973734723227462, "train_eval/perplexity_len_512": 144.5657936390457}
6
+ {"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4400.52537979302, "train_eval/train_update_time": 2048.153126588324, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.768799402314344, "train_eval/perplexity_len_2048": 117.77775334412672, "train_eval/loss_avg_len_1024": 4.77747150756757, "train_eval/perplexity_len_1024": 118.80357600973609, "train_eval/loss_avg_len_512": 4.79741792121502, "train_eval/perplexity_len_512": 121.19707276202257}
7
+ {"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5380.30665975099, "train_eval/train_update_time": 2420.687880878686, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.617228222046706, "train_eval/perplexity_len_2048": 101.21310272572127, "train_eval/loss_avg_len_1024": 4.631316462856958, "train_eval/perplexity_len_1024": 102.64910893945788, "train_eval/loss_avg_len_512": 4.657994357578791, "train_eval/perplexity_len_512": 105.42442627303583}
8
+ {"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6148.81527663907, "train_eval/train_update_time": 2858.1190680917352, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.481916864582235, "train_eval/perplexity_len_2048": 88.40396880166443, "train_eval/loss_avg_len_1024": 4.499823768692404, "train_eval/perplexity_len_1024": 90.00126886153693, "train_eval/loss_avg_len_512": 4.535654497509022, "train_eval/perplexity_len_512": 93.28454983426795}
9
+ {"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7122.763824166963, "train_eval/train_update_time": 3212.485016722814, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.34591162732665, "train_eval/perplexity_len_2048": 77.16234873129532, "train_eval/loss_avg_len_1024": 4.374533263420053, "train_eval/perplexity_len_1024": 79.40277072459905, "train_eval/loss_avg_len_512": 4.424866182332263, "train_eval/perplexity_len_512": 83.50163248065816}
10
+ {"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7890.565265796031, "train_eval/train_update_time": 3649.793015780393, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.224437785038818, "train_eval/perplexity_len_2048": 68.33607318377503, "train_eval/loss_avg_len_1024": 4.259692610379098, "train_eval/perplexity_len_1024": 70.788220545324, "train_eval/loss_avg_len_512": 4.321633174280869, "train_eval/perplexity_len_512": 75.31152475431779}
11
+ {"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8843.699362242012, "train_eval/train_update_time": 3992.2848253480624, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.123584927133725, "train_eval/perplexity_len_2048": 61.78032370601044, "train_eval/loss_avg_len_1024": 4.164638183943371, "train_eval/perplexity_len_1024": 64.36938835136732, "train_eval/loss_avg_len_512": 4.233584939141074, "train_eval/perplexity_len_512": 68.96402136694488}
12
+ {"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9624.041267768014, "train_eval/train_update_time": 4419.823843153426, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.051551892079196, "train_eval/perplexity_len_2048": 57.48660085729786, "train_eval/loss_avg_len_1024": 4.095574334702723, "train_eval/perplexity_len_1024": 60.07383173765205, "train_eval/loss_avg_len_512": 4.172065682189205, "train_eval/perplexity_len_512": 64.849271831113}
13
+ {"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 10567.73318944103, "train_eval/train_update_time": 4787.056737843552, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.0031533505732115, "train_eval/perplexity_len_2048": 54.77058887802388, "train_eval/loss_avg_len_1024": 4.055146144072587, "train_eval/perplexity_len_1024": 57.69359395686366, "train_eval/loss_avg_len_512": 4.1351064839221, "train_eval/perplexity_len_512": 62.496245561505}
14
+ {"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 11352.440668291994, "train_eval/train_update_time": 5181.94938044867, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.9615584894643505, "train_eval/perplexity_len_2048": 52.539143878548686, "train_eval/loss_avg_len_1024": 4.015400523946809, "train_eval/perplexity_len_1024": 55.44549820559399, "train_eval/loss_avg_len_512": 4.09918728835124, "train_eval/perplexity_len_512": 60.291268264731514}
15
+ {"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 12294.615799573017, "train_eval/train_update_time": 5594.8495038923575, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.9327660328167715, "train_eval/perplexity_len_2048": 51.04798296276989, "train_eval/loss_avg_len_1024": 3.9876920471629638, "train_eval/perplexity_len_1024": 53.93027708301155, "train_eval/loss_avg_len_512": 4.0733349682766855, "train_eval/perplexity_len_512": 58.75257420247376}
16
+ {"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 13067.235377178993, "train_eval/train_update_time": 5947.199882825487, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.9127354414250295, "train_eval/perplexity_len_2048": 50.03563449198435, "train_eval/loss_avg_len_1024": 3.9701769516180687, "train_eval/perplexity_len_1024": 52.99390736771229, "train_eval/loss_avg_len_512": 4.05639126611859, "train_eval/perplexity_len_512": 57.76547426320301}
17
+ {"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 14017.66551920597, "train_eval/train_update_time": 6385.36848655506, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.8916031014723105, "train_eval/perplexity_len_2048": 48.98935852041114, "train_eval/loss_avg_len_1024": 3.943297847472768, "train_eval/perplexity_len_1024": 51.58845192166155, "train_eval/loss_avg_len_512": 4.03021627167669, "train_eval/perplexity_len_512": 56.27308020458836}
18
+ {"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 14787.651788089075, "train_eval/train_update_time": 6728.263910608133, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.887652821134261, "train_eval/perplexity_len_2048": 48.79621855034521, "train_eval/loss_avg_len_1024": 3.942871247186413, "train_eval/perplexity_len_1024": 51.56644896686626, "train_eval/loss_avg_len_512": 4.030449956292105, "train_eval/perplexity_len_512": 56.2862318943081}
19
+ {"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 15744.901314089075, "train_eval/train_update_time": 7164.0845861389535, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.87659640114578, "train_eval/perplexity_len_2048": 48.259678635263626, "train_eval/loss_avg_len_1024": 3.935162933412721, "train_eval/perplexity_len_1024": 51.17048665949723, "train_eval/loss_avg_len_512": 4.022004130411478, "train_eval/perplexity_len_512": 55.8128500445265}
metrics/jsonlines/val.jsonl ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 82.20607728499454, "val/train_update_time": 81.85936604591552, "val/loss": 7.979723100171797, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 95.93819073901977, "val/val_tokens_per_second": 426941.5514768597, "val/loss_avg_len_2048": 7.979723100171797, "val/perplexity_len_2048": 2921.1220938788365, "val/loss_avg_len_1024": 7.9782149463132495, "val/perplexity_len_1024": 2916.719912739308, "val/loss_avg_len_512": 7.978189364168048, "val/perplexity_len_512": 2916.6452977413983}
2
+ {"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 256.55152114806697, "val/train_update_time": 159.95920802967157, "val/loss": 7.102240487235458, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 94.82366469805129, "val/val_tokens_per_second": 431959.6814827783, "val/loss_avg_len_2048": 7.102240487235458, "val/perplexity_len_2048": 1214.6855154317202, "val/loss_avg_len_1024": 7.101038199963002, "val/perplexity_len_1024": 1213.2259920554798, "val/loss_avg_len_512": 7.1017793140655385, "val/perplexity_len_512": 1214.125464212337}
3
+ {"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 429.63876787002664, "val/train_update_time": 237.9422758014407, "val/loss": 6.6164520517854015, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 95.0188044749666, "val/val_tokens_per_second": 431072.56743891374, "val/loss_avg_len_2048": 6.6164520517854015, "val/perplexity_len_2048": 747.2890452968686, "val/loss_avg_len_1024": 6.615967804358434, "val/perplexity_len_1024": 746.9272601033266, "val/loss_avg_len_512": 6.618594851994235, "val/perplexity_len_512": 748.8920532687349}
4
+ {"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 603.0225816690363, "val/train_update_time": 316.03332217247225, "val/loss": 6.220231745957513, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 215.60633486497682, "val/val_tokens_per_second": 189975.86516022892, "val/loss_avg_len_2048": 6.220231745957513, "val/perplexity_len_2048": 502.8197449622685, "val/loss_avg_len_1024": 6.220763633284112, "val/perplexity_len_1024": 503.0872595496804, "val/loss_avg_len_512": 6.225233913274854, "val/perplexity_len_512": 505.34123465604495}
5
+ {"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 948.4334847090067, "val/train_update_time": 445.5143054035725, "val/loss": 5.897991514976999, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 196.36531956295948, "val/val_tokens_per_second": 208590.80458383707, "val/loss_avg_len_2048": 5.897991514976999, "val/perplexity_len_2048": 364.30503136837575, "val/loss_avg_len_1024": 5.900186072487478, "val/perplexity_len_1024": 365.10539761472984, "val/loss_avg_len_512": 5.907569861380198, "val/perplexity_len_512": 367.8112361710568}
6
+ {"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 1307.4146166159771, "val/train_update_time": 606.9371437055524, "val/loss": 5.643392177975154, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 171.1923365429975, "val/val_tokens_per_second": 239263.0466242412, "val/loss_avg_len_2048": 5.643392177975154, "val/perplexity_len_2048": 282.41911132000007, "val/loss_avg_len_1024": 5.646940343820257, "val/perplexity_len_1024": 283.422961024245, "val/loss_avg_len_512": 5.65679239423871, "val/perplexity_len_512": 286.2290585357919}
7
+ {"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 1668.9536387879634, "val/train_update_time": 796.7456705565564, "val/loss": 5.455643447820377, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 157.57225277402904, "val/val_tokens_per_second": 259944.24322117073, "val/loss_avg_len_2048": 5.455643447820377, "val/perplexity_len_2048": 234.0754379536635, "val/loss_avg_len_1024": 5.460278425578866, "val/perplexity_len_1024": 235.1628906150237, "val/loss_avg_len_512": 5.471461108097714, "val/perplexity_len_512": 237.8074013645763}
8
+ {"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 2008.152192622074, "val/train_update_time": 977.6825634960551, "val/loss": 5.301519949414348, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 183.3988890410401, "val/val_tokens_per_second": 223338.32126340838, "val/loss_avg_len_2048": 5.301519949414348, "val/perplexity_len_2048": 200.64154332275467, "val/loss_avg_len_1024": 5.307266004482656, "val/perplexity_len_1024": 201.79775933888274, "val/loss_avg_len_512": 5.319740491560195, "val/perplexity_len_512": 204.3308495321139}
9
+ {"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 2335.2169849510537, "val/train_update_time": 1120.8591244373238, "val/loss": 5.1668799918430395, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 210.14631871005986, "val/val_tokens_per_second": 194911.81311870972, "val/loss_avg_len_2048": 5.1668799918430395, "val/perplexity_len_2048": 175.36683708358458, "val/loss_avg_len_1024": 5.173995411513188, "val/perplexity_len_1024": 176.61909561475397, "val/loss_avg_len_512": 5.188556263851468, "val/perplexity_len_512": 179.20963464425918}
10
+ {"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 2675.1611660569906, "val/train_update_time": 1250.1728800584096, "val/loss": 5.0615114452362295, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 214.48142744007055, "val/val_tokens_per_second": 190972.24635659822, "val/loss_avg_len_2048": 5.0615114452362295, "val/perplexity_len_2048": 157.828885854406, "val/loss_avg_len_1024": 5.069597413221841, "val/perplexity_len_1024": 159.11025876113365, "val/loss_avg_len_512": 5.08556063876981, "val/perplexity_len_512": 161.6705526142726}
11
+ {"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 3026.331188999, "val/train_update_time": 1385.8219667483354, "val/loss": 4.978800045607867, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 188.47243235306814, "val/val_tokens_per_second": 217326.21311571464, "val/loss_avg_len_2048": 4.978800045607867, "val/perplexity_len_2048": 145.2999237221159, "val/loss_avg_len_1024": 4.987759131970396, "val/perplexity_len_1024": 146.60752700634214, "val/loss_avg_len_512": 5.004984278857336, "val/perplexity_len_512": 149.15473825703648}
12
+ {"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 3389.030513621052, "val/train_update_time": 1559.5160488487454, "val/loss": 4.8876310169785055, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 162.8790130940033, "val/val_tokens_per_second": 251475.0011185328, "val/loss_avg_len_2048": 4.8876310169785055, "val/perplexity_len_2048": 132.6389820703653, "val/loss_avg_len_1024": 4.897714207779151, "val/perplexity_len_1024": 133.98317170575723, "val/loss_avg_len_512": 4.917036822233256, "val/perplexity_len_512": 136.59725096585308}
13
+ {"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 3742.4888608519686, "val/train_update_time": 1749.5527433896204, "val/loss": 4.811935229389766, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 166.07114278594963, "val/val_tokens_per_second": 246641.2846498784, "val/loss_avg_len_2048": 4.811935229389766, "val/perplexity_len_2048": 122.96936132073786, "val/loss_avg_len_1024": 4.8228395102404065, "val/perplexity_len_1024": 124.31759115190877, "val/loss_avg_len_512": 4.843568500057608, "val/perplexity_len_512": 126.92146382028584}
14
+ {"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 4077.975462540984, "val/train_update_time": 1918.29994701338, "val/loss": 4.748163937986432, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 192.17725714098196, "val/val_tokens_per_second": 213136.5626159998, "val/loss_avg_len_2048": 4.748163937986432, "val/perplexity_len_2048": 115.3722593182815, "val/loss_avg_len_1024": 4.760371447979985, "val/perplexity_len_1024": 116.78929899226179, "val/loss_avg_len_512": 4.783200043344404, "val/perplexity_len_512": 119.48609960927246}
15
+ {"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 4400.52537979302, "val/train_update_time": 2048.153126588324, "val/loss": 4.68768829575642, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 219.41327350703068, "val/val_tokens_per_second": 186679.68143088443, "val/loss_avg_len_2048": 4.68768829575642, "val/perplexity_len_2048": 108.60183408015227, "val/loss_avg_len_1024": 4.701050255188998, "val/perplexity_len_1024": 110.06270569958359, "val/loss_avg_len_512": 4.726191300578602, "val/perplexity_len_512": 112.86487430717956}
16
+ {"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 4750.569092386053, "val/train_update_time": 2177.56818451453, "val/loss": 4.632592438739213, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 205.87698022800032, "val/val_tokens_per_second": 198953.76333302772, "val/loss_avg_len_2048": 4.632592438739213, "val/perplexity_len_2048": 102.78017032460092, "val/loss_avg_len_1024": 4.6476780188167, "val/perplexity_len_1024": 104.3424229304824, "val/loss_avg_len_512": 4.675603326033708, "val/perplexity_len_512": 107.29728281180869}
17
+ {"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 5105.305374011979, "val/train_update_time": 2326.03590223263, "val/loss": 4.573390058158175, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 180.13931755896192, "val/val_tokens_per_second": 227379.56685437795, "val/loss_avg_len_2048": 4.573390058158175, "val/perplexity_len_2048": 96.87195531354858, "val/loss_avg_len_1024": 4.59018697572723, "val/perplexity_len_1024": 98.51284795144008, "val/loss_avg_len_512": 4.620820440272987, "val/perplexity_len_512": 101.57733608926145}
18
+ {"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 5471.657564450987, "val/train_update_time": 2511.68601841887, "val/loss": 4.52100637258871, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 154.91730077692773, "val/val_tokens_per_second": 264399.1329217652, "val/loss_avg_len_2048": 4.52100637258871, "val/perplexity_len_2048": 91.92806532712832, "val/loss_avg_len_1024": 4.539745244210261, "val/perplexity_len_1024": 93.66693488573054, "val/loss_avg_len_512": 4.57357738015512, "val/perplexity_len_512": 96.89010326136773}
19
+ {"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 5816.983929526061, "val/train_update_time": 2701.5302643649047, "val/loss": 4.475736114661373, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 174.7030213919934, "val/val_tokens_per_second": 234455.01785624636, "val/loss_avg_len_2048": 4.475736114661373, "val/perplexity_len_2048": 87.85925109454497, "val/loss_avg_len_1024": 4.4973341475088615, "val/perplexity_len_1024": 89.77747848825223, "val/loss_avg_len_512": 4.535955932896119, "val/perplexity_len_512": 93.31267333715398}
20
+ {"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 6148.81527663907, "val/train_update_time": 2858.1190680917352, "val/loss": 4.413959308819939, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 200.30809704202693, "val/val_tokens_per_second": 204484.99389121612, "val/loss_avg_len_2048": 4.413959308819939, "val/perplexity_len_2048": 82.59583940179142, "val/loss_avg_len_1024": 4.438946398508316, "val/perplexity_len_1024": 84.68566970562067, "val/loss_avg_len_512": 4.482290532938484, "val/perplexity_len_512": 88.43700873997773}
21
+ {"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 6479.875744563062, "val/train_update_time": 2987.694357936969, "val/loss": 4.3640230217037725, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 219.93532326899003, "val/val_tokens_per_second": 186236.56896579647, "val/loss_avg_len_2048": 4.3640230217037725, "val/perplexity_len_2048": 78.5725987077007, "val/loss_avg_len_1024": 4.392694518165058, "val/perplexity_len_1024": 80.85799905935154, "val/loss_avg_len_512": 4.44163951446591, "val/perplexity_len_512": 84.91404541718936}
22
+ {"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 6829.788738840027, "val/train_update_time": 3117.297304859967, "val/loss": 4.306920868678578, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 197.5680843789596, "val/val_tokens_per_second": 207320.93510323126, "val/loss_avg_len_2048": 4.306920868678578, "val/perplexity_len_2048": 74.21162942433823, "val/loss_avg_len_1024": 4.339483561339742, "val/perplexity_len_1024": 76.66793482697868, "val/loss_avg_len_512": 4.393889478641376, "val/perplexity_len_512": 80.95467892522656}
23
+ {"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 7188.301673018024, "val/train_update_time": 3277.705760344048, "val/loss": 4.2564721112608215, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 171.91542205994483, "val/val_tokens_per_second": 238256.69337401123, "val/loss_avg_len_2048": 4.2564721112608215, "val/perplexity_len_2048": 70.56061384378495, "val/loss_avg_len_1024": 4.292763091740292, "val/perplexity_len_1024": 73.16836034045308, "val/loss_avg_len_512": 4.352126901028119, "val/perplexity_len_512": 77.64342731699331}
24
+ {"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 7550.796140610008, "val/train_update_time": 3467.7480247288477, "val/loss": 4.212071916596289, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 157.03932671796065, "val/val_tokens_per_second": 260826.38569613398, "val/loss_avg_len_2048": 4.212071916596289, "val/perplexity_len_2048": 67.49624161808859, "val/loss_avg_len_1024": 4.251795557039651, "val/perplexity_len_1024": 70.23140369181802, "val/loss_avg_len_512": 4.31613794729123, "val/perplexity_len_512": 74.89880586144663}
25
+ {"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 7890.565265796031, "val/train_update_time": 3649.793015780393, "val/loss": 4.171272234549187, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 182.64849553606473, "val/val_tokens_per_second": 224255.8849432859, "val/loss_avg_len_2048": 4.171272234549187, "val/perplexity_len_2048": 64.79783773725036, "val/loss_avg_len_1024": 4.213818998173346, "val/perplexity_len_1024": 67.61426612754268, "val/loss_avg_len_512": 4.28175293422537, "val/perplexity_len_512": 72.36718380138258}
26
+ {"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 8219.336814008071, "val/train_update_time": 3794.7164617268136, "val/loss": 4.134542551958072, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 208.7136578100035, "val/val_tokens_per_second": 196249.73482706514, "val/loss_avg_len_2048": 4.134542551958072, "val/perplexity_len_2048": 62.46101186663823, "val/loss_avg_len_1024": 4.179829636917729, "val/perplexity_len_1024": 65.35471823430277, "val/loss_avg_len_512": 4.250723877273966, "val/perplexity_len_512": 70.15617843345912}
27
+ {"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 8557.932057484053, "val/train_update_time": 3924.0804488240974, "val/loss": 4.103749194115936, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 217.37881806690712, "val/val_tokens_per_second": 188426.82265110535, "val/loss_avg_len_2048": 4.103749194115936, "val/perplexity_len_2048": 60.566939664380236, "val/loss_avg_len_1024": 4.151865726310574, "val/perplexity_len_1024": 63.55246125496358, "val/loss_avg_len_512": 4.225990942367353, "val/perplexity_len_512": 68.44229232280817}
28
+ {"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 8908.472957947059, "val/train_update_time": 4056.885919503169, "val/loss": 4.073220567722759, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 188.46028788900003, "val/val_tokens_per_second": 217340.2177127351, "val/loss_avg_len_2048": 4.073220567722759, "val/perplexity_len_2048": 58.745853259887475, "val/loss_avg_len_1024": 4.122964682882419, "val/perplexity_len_1024": 61.74201669646727, "val/loss_avg_len_512": 4.199377771161869, "val/perplexity_len_512": 66.64484978939613}
29
+ {"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 9270.280366463005, "val/train_update_time": 4229.695196601329, "val/loss": 4.050425243469537, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 163.08659516496118, "val/val_tokens_per_second": 251154.91532930214, "val/loss_avg_len_2048": 4.050425243469537, "val/perplexity_len_2048": 57.4218701296072, "val/loss_avg_len_1024": 4.102036605390021, "val/perplexity_len_1024": 60.4633021758754, "val/loss_avg_len_512": 4.180863835545815, "val/perplexity_len_512": 65.42234295690989}
30
+ {"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 9624.041267768014, "val/train_update_time": 4419.823843153426, "val/loss": 4.027506289458648, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 165.3425285939593, "val/val_tokens_per_second": 247728.15771184748, "val/loss_avg_len_2048": 4.027506289458648, "val/perplexity_len_2048": 56.12078760619935, "val/loss_avg_len_1024": 4.080373891314026, "val/perplexity_len_1024": 59.167587961950446, "val/loss_avg_len_512": 4.1605846759307195, "val/perplexity_len_512": 64.10899463047505}
31
+ {"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 9960.90240954503, "val/train_update_time": 4590.094068991486, "val/loss": 4.009245940525528, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 190.97150057496037, "val/val_tokens_per_second": 214482.26503264203, "val/loss_avg_len_2048": 4.009245940525528, "val/perplexity_len_2048": 55.10530221770008, "val/loss_avg_len_1024": 4.063811174258451, "val/perplexity_len_1024": 58.19568285625148, "val/loss_avg_len_512": 4.145707397240494, "val/perplexity_len_512": 63.162286929400935}
32
+ {"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 10284.48041301698, "val/train_update_time": 4722.194782613544, "val/loss": 3.9909103549039924, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 218.06762579700444, "val/val_tokens_per_second": 187831.64098887832, "val/loss_avg_len_2048": 3.9909103549039924, "val/perplexity_len_2048": 54.10412090262984, "val/loss_avg_len_1024": 4.045858511323854, "val/perplexity_len_1024": 57.16023768676083, "val/loss_avg_len_512": 4.128349488141108, "val/perplexity_len_512": 62.07538218123893}
33
+ {"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 10632.616937980056, "val/train_update_time": 4851.760493494687, "val/loss": 3.975969088495243, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 208.42480857099872, "val/val_tokens_per_second": 196521.71102293328, "val/loss_avg_len_2048": 3.975969088495243, "val/perplexity_len_2048": 53.30174599393647, "val/loss_avg_len_1024": 4.03235973079307, "val/perplexity_len_1024": 56.39382861476925, "val/loss_avg_len_512": 4.116566236323026, "val/perplexity_len_512": 61.34822487362218}
34
+ {"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 10987.034303960972, "val/train_update_time": 4997.368006910896, "val/loss": 3.961669144834392, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 180.25853680295404, "val/val_tokens_per_second": 227229.18274197794, "val/loss_avg_len_2048": 3.961669144834392, "val/perplexity_len_2048": 52.54495793862879, "val/loss_avg_len_1024": 4.0187124839128, "val/perplexity_len_1024": 55.62943590495806, "val/loss_avg_len_512": 4.103610272248183, "val/perplexity_len_512": 60.558526176420216}
35
+ {"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 11352.440668291994, "val/train_update_time": 5181.94938044867, "val/loss": 3.949665504408349, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 154.89441621606238, "val/val_tokens_per_second": 264438.19603454816, "val/loss_avg_len_2048": 3.949665504408349, "val/perplexity_len_2048": 51.91799758870448, "val/loss_avg_len_1024": 4.007118317455333, "val/perplexity_len_1024": 54.988183541790946, "val/loss_avg_len_512": 4.0924119246510795, "val/perplexity_len_512": 59.884153726227304}
36
+ {"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 11698.40989001107, "val/train_update_time": 5371.777714943513, "val/loss": 3.9387186932430605, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 172.71028752799612, "val/val_tokens_per_second": 237160.1633363064, "val/loss_avg_len_2048": 3.9387186932430605, "val/perplexity_len_2048": 51.35276048939409, "val/loss_avg_len_1024": 3.9968486371898093, "val/perplexity_len_1024": 54.426362278565, "val/loss_avg_len_512": 4.0829927696230826, "val/perplexity_len_512": 59.32274375305282}
37
+ {"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 12029.831404851982, "val/train_update_time": 5529.962416259223, "val/loss": 3.928844289263571, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 199.5733669550391, "val/val_tokens_per_second": 205237.80615089627, "val/loss_avg_len_2048": 3.928844289263571, "val/perplexity_len_2048": 50.848177912849586, "val/loss_avg_len_1024": 3.98748852977464, "val/perplexity_len_1024": 53.91930245067018, "val/loss_avg_len_512": 4.0740872424433014, "val/perplexity_len_512": 58.796788874961265}
38
+ {"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 12359.526009666035, "val/train_update_time": 5659.566051078378, "val/loss": 3.920881012235559, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 219.83646175300237, "val/val_tokens_per_second": 186320.32044811873, "val/loss_avg_len_2048": 3.920881012235559, "val/perplexity_len_2048": 50.44486775231113, "val/loss_avg_len_1024": 3.9801359154624865, "val/perplexity_len_1024": 53.52430851428192, "val/loss_avg_len_512": 4.067399822572339, "val/perplexity_len_512": 58.40490187761954}
39
+ {"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 12709.197320922045, "val/train_update_time": 5789.02060880058, "val/loss": 3.9135911652927984, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 199.4685426549986, "val/val_tokens_per_second": 205345.66230247414, "val/loss_avg_len_2048": 3.9135911652927984, "val/perplexity_len_2048": 50.078469503515095, "val/loss_avg_len_1024": 3.9731324667307084, "val/perplexity_len_1024": 53.15076334273877, "val/loss_avg_len_512": 4.06081120250253, "val/perplexity_len_512": 58.02135906533112}
40
+ {"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 13067.235377178993, "val/train_update_time": 5947.199882825487, "val/loss": 3.9075045806031676, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 171.969670202001, "val/val_tokens_per_second": 238181.53487116125, "val/loss_avg_len_2048": 3.9075045806031676, "val/perplexity_len_2048": 49.774588394950335, "val/loss_avg_len_1024": 3.967167426846549, "val/perplexity_len_1024": 52.83466063943094, "val/loss_avg_len_512": 4.054981480225362, "val/perplexity_len_512": 57.684094689836876}
41
+ {"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 13430.399093109067, "val/train_update_time": 6137.126117701177, "val/loss": 3.9023538076302042, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 156.54619255301077, "val/val_tokens_per_second": 261648.01156776672, "val/loss_avg_len_2048": 3.9023538076302042, "val/perplexity_len_2048": 49.5188699295459, "val/loss_avg_len_1024": 3.962441469050851, "val/perplexity_len_1024": 52.585555357264404, "val/loss_avg_len_512": 4.050545121595823, "val/perplexity_len_512": 57.42875416842078}
42
+ {"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 13771.155967495986, "val/train_update_time": 6320.664257825236, "val/loss": 3.8977976139097947, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 181.48743707104586, "val/val_tokens_per_second": 225690.5528065043, "val/loss_avg_len_2048": 3.8977976139097947, "val/perplexity_len_2048": 49.29376556428864, "val/loss_avg_len_1024": 3.9583502489584506, "val/perplexity_len_1024": 52.37085576772552, "val/loss_avg_len_512": 4.046992695208081, "val/perplexity_len_512": 57.22510468576255}
43
+ {"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 14099.15276560001, "val/train_update_time": 6466.659589443123, "val/loss": 3.894296748715709, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 208.2782687600702, "val/val_tokens_per_second": 196659.97918959367, "val/loss_avg_len_2048": 3.894296748715709, "val/perplexity_len_2048": 49.12149645754408, "val/loss_avg_len_1024": 3.954908422886068, "val/perplexity_len_1024": 52.19091423228003, "val/loss_avg_len_512": 4.043593764402159, "val/perplexity_len_512": 57.03093069349264}
44
+ {"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 14437.422251214972, "val/train_update_time": 6596.149574242183, "val/loss": 3.8914951809667055, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 217.73421279306058, "val/val_tokens_per_second": 188119.26465102338, "val/loss_avg_len_2048": 3.8914951809667055, "val/perplexity_len_2048": 48.9840718493456, "val/loss_avg_len_1024": 3.9522341961787086, "val/perplexity_len_1024": 52.05153035067479, "val/loss_avg_len_512": 4.041106043229532, "val/perplexity_len_512": 56.889229968737666}
45
+ {"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 14787.651788089075, "val/train_update_time": 6728.263910608133, "val/loss": 3.889307251223922, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 190.86575583100785, "val/val_tokens_per_second": 214601.09395561717, "val/loss_avg_len_2048": 3.889307251223922, "val/perplexity_len_2048": 48.8770153004345, "val/loss_avg_len_1024": 3.9500711962865664, "val/perplexity_len_1024": 51.93906457172255, "val/loss_avg_len_512": 4.0389501331874165, "val/perplexity_len_512": 56.766714020695176}
46
+ {"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 15149.746909548994, "val/train_update_time": 6898.238976031076, "val/loss": 3.8877973514226962, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 163.65899614198133, "val/val_tokens_per_second": 250276.49542995734, "val/loss_avg_len_2048": 3.8877973514226962, "val/perplexity_len_2048": 48.8032715915636, "val/loss_avg_len_1024": 3.9486364330179993, "val/perplexity_len_1024": 51.86459774358062, "val/loss_avg_len_512": 4.037612269980088, "val/perplexity_len_512": 56.69081870270302}
47
+ {"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 15503.805718103074, "val/train_update_time": 7088.071415620041, "val/loss": 3.8867285964663605, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 164.7503122780472, "val/val_tokens_per_second": 248618.64863037274, "val/loss_avg_len_2048": 3.8867285964663605, "val/perplexity_len_2048": 48.75114071569285, "val/loss_avg_len_1024": 3.947595182389999, "val/perplexity_len_1024": 51.81062180472598, "val/loss_avg_len_512": 4.036548712050356, "val/perplexity_len_512": 56.63055678462964}
48
+ {"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 15840.40012625407, "val/train_update_time": 7259.235320164589, "val/loss": 3.886128832803201, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 190.49678600998595, "val/val_tokens_per_second": 215016.75098000263, "val/loss_avg_len_2048": 3.886128832803201, "val/perplexity_len_2048": 48.72191031949497, "val/loss_avg_len_1024": 3.947077749578096, "val/perplexity_len_1024": 51.78382022360586, "val/loss_avg_len_512": 4.036122298393399, "val/perplexity_len_512": 56.60641388961146}
49
+ {"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 16164.016768118017, "val/train_update_time": 7391.847917348379, "val/loss": 3.885858454372711, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 217.6126034669578, "val/val_tokens_per_second": 188224.39209601821, "val/loss_avg_len_2048": 3.885858454372711, "val/perplexity_len_2048": 48.70873874658715, "val/loss_avg_len_1024": 3.9468075397948734, "val/perplexity_len_1024": 51.76982961905303, "val/loss_avg_len_512": 4.035868617015797, "val/perplexity_len_512": 56.592055717832274}
metrics/jsonlines/val_data_info.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 0, "val_data_info/vocab_size": 50277, "val_data_info/global_tokens_per_batch": 2048, "val_data_info/local_tokens_per_batch": 2048, "val_data_info/batch_len": 2048, "val_data_info/seq_len": 2048, "val_data_info/total_tokens": 2147483648, "val_data_info/global_batch_size": 1, "val_data_info/local_batch_size": 1}
metrics/npz/train_eval/step-000000104857600.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e566b34ec86cdedc44a9dbd0ad5cfd2c5336e504ba317a47ed63e4f63c2f0606
3
+ size 20540
metrics/npz/train_eval/step-000000209715200.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cb0288f59c1d0619384eef47849178b7e84c75bbbf547f5b050dc0d1b4ce582
3
+ size 20540