Lanni-ni committed on
Commit
efc5e67
·
verified ·
1 Parent(s): 7c81fd0

add remote code + model files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .hydra/config.yaml +102 -0
  2. .hydra/hydra.yaml +146 -0
  3. .hydra/overrides.yaml +16 -0
  4. __init__.py +1 -0
  5. __pycache__/__init__.cpython-310.pyc +0 -0
  6. __pycache__/configuration_transformer.cpython-310.pyc +0 -0
  7. __pycache__/modeling_transformer.cpython-310.pyc +0 -0
  8. checkpoints/step-000000209715200.pt +3 -0
  9. checkpoints/step-000000209715200.pt.done +0 -0
  10. checkpoints/step-000000209715200.pt.keep +0 -0
  11. checkpoints/step-000000419430400.pt +3 -0
  12. checkpoints/step-000000419430400.pt.done +0 -0
  13. checkpoints/step-000000419430400.pt.keep +0 -0
  14. checkpoints/step-000000629145600.pt +3 -0
  15. checkpoints/step-000000629145600.pt.done +0 -0
  16. checkpoints/step-000000629145600.pt.keep +0 -0
  17. checkpoints/step-000000838860800.pt +3 -0
  18. checkpoints/step-000000838860800.pt.done +0 -0
  19. checkpoints/step-000000838860800.pt.keep +0 -0
  20. checkpoints/step-000001048576000.pt +3 -0
  21. checkpoints/step-000001048576000.pt.done +0 -0
  22. checkpoints/step-000001048576000.pt.keep +0 -0
  23. checkpoints/step-000001258291200.pt +3 -0
  24. checkpoints/step-000001258291200.pt.done +0 -0
  25. checkpoints/step-000001258291200.pt.keep +0 -0
  26. checkpoints/step-000001468006400.pt +3 -0
  27. checkpoints/step-000001468006400.pt.done +0 -0
  28. checkpoints/step-000001468006400.pt.keep +0 -0
  29. checkpoints/step-000001677721600.pt +3 -0
  30. checkpoints/step-000001677721600.pt.done +0 -0
  31. checkpoints/step-000001677721600.pt.keep +0 -0
  32. checkpoints/step-000001887436800.pt +3 -0
  33. checkpoints/step-000001887436800.pt.done +0 -0
  34. checkpoints/step-000001887436800.pt.keep +0 -0
  35. config.yaml +102 -0
  36. configuration_transformer.py +67 -0
  37. decay_params.txt +44 -0
  38. logs/2025-10-17_13-25-22.log +258 -0
  39. metrics/jsonlines/checkpoint.jsonl +9 -0
  40. metrics/jsonlines/model_info.jsonl +1 -0
  41. metrics/jsonlines/norm.jsonl +0 -0
  42. metrics/jsonlines/resume.jsonl +1 -0
  43. metrics/jsonlines/throughput.jsonl +0 -0
  44. metrics/jsonlines/train.jsonl +98 -0
  45. metrics/jsonlines/train_data_info.jsonl +1 -0
  46. metrics/jsonlines/train_eval.jsonl +19 -0
  47. metrics/jsonlines/val.jsonl +49 -0
  48. metrics/jsonlines/val_data_info.jsonl +1 -0
  49. metrics/npz/train_eval/step-000000104857600.npz +3 -0
  50. metrics/npz/train_eval/step-000000209715200.npz +3 -0
.hydra/config.yaml ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ _target_: forgetting_transformer.model.forgetting_transformer.modeling_forgetting_transformer.ForgettingTransformerForCausalLM
3
+ config:
4
+ _target_: forgetting_transformer.model.forgetting_transformer.configuration_forgetting_transformer.ForgettingTransformerConfig
5
+ vocab_size: ???
6
+ hidden_size: 512
7
+ hidden_ratio: 4.0
8
+ intermediate_size: null
9
+ num_hidden_layers: 6
10
+ num_heads: 8
11
+ num_kv_heads: null
12
+ hidden_act: swish
13
+ window_size: null
14
+ max_position_embeddings: null
15
+ initializer_range: 0.02
16
+ elementwise_affine: true
17
+ norm_eps: 1.0e-06
18
+ use_cache: true
19
+ pad_token_id: null
20
+ bos_token_id: null
21
+ eos_token_id: null
22
+ tie_word_embeddings: false
23
+ attention_bias: false
24
+ fuse_norm: true
25
+ fuse_cross_entropy: true
26
+ rope_base: 500000.0
27
+ use_rope: false
28
+ use_output_gate: false
29
+ ogate_act: sigmoid
30
+ fgate_type: full
31
+ fgate_bias_init: false
32
+ decay_time_min: null
33
+ decay_time_max: null
34
+ use_output_norm: false
35
+ qk_norm: false
36
+ qk_norm_share_param_across_head: false
37
+ use_k_shift: false
38
+ use_v_shift: false
39
+ optimizer:
40
+ _target_: torch.optim.AdamW
41
+ lr: 0.001
42
+ betas:
43
+ - 0.9
44
+ - 0.95
45
+ weight_decay: 0.1
46
+ schedule:
47
+ _target_: forgetting_transformer.schedule.warmup_cosine_decay_schedule
48
+ init_value: 0.0
49
+ peak_value: ${optimizer.lr}
50
+ warmup_steps: 20971520
51
+ decay_steps: ${train.max_tokens}
52
+ end_value: 0.0
53
+ datamodule:
54
+ _target_: forgetting_transformer.datamodule.npy.NpyDataModule
55
+ data_path: ${data_dir}
56
+ rank: ???
57
+ world_size: ???
58
+ train_batch_len: 2048
59
+ train_batch_size: 1024
60
+ train_num_workers: 0
61
+ eval_tokens: 2147483648
62
+ eval_batch_len: 2048
63
+ eval_local_batch_size: 1
64
+ eval_num_workers: 0
65
+ strategy:
66
+ _target_: lightning.fabric.strategies.SingleDeviceStrategy
67
+ device: cuda:0
68
+ exp: forgetting_gate_6_8_512
69
+ tag: forgetting_gate_6_8_512
70
+ seed: 42
71
+ hf_load_dir: null
72
+ hf_save_dir: null
73
+ hf_load_step: null
74
+ output_dir: ./forgetting_gate_6_8_512/
75
+ data_dir: /workspace/forgetting-transformer/data
76
+ resume: false
77
+ fork_dir: null
78
+ fork_step: null
79
+ log_interval: 20971520
80
+ eval_interval: 41943040
81
+ final_eval: true
82
+ skip_eval: false
83
+ checkpoint_interval: 209715200
84
+ train_eval_interval: 104857600
85
+ checkpoint_keep_interval: 209715200
86
+ fabric:
87
+ devices: 1
88
+ precision: 16-mixed
89
+ train:
90
+ max_tokens: 2097152000
91
+ grad_acc_tokens: 32768
92
+ max_grad_norm: 1.0
93
+ gradient_checkpointing: true
94
+ bias_weight_decay: false
95
+ normalization_weight_decay: false
96
+ conv_weight_decay: true
97
+ eval:
98
+ min_val_length: 512
99
+ wandb:
100
+ project: forgetting-transformer
101
+ mode: online
102
+ log_dir: ./output/wandb
.hydra/hydra.yaml ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ${output_dir}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ root: null
71
+ disable_existing_loggers: false
72
+ job_logging:
73
+ version: 1
74
+ root: null
75
+ disable_existing_loggers: false
76
+ env: {}
77
+ mode: RUN
78
+ searchpath: []
79
+ callbacks: {}
80
+ output_subdir: .hydra
81
+ overrides:
82
+ hydra:
83
+ - hydra.mode=RUN
84
+ task:
85
+ - +experiment/pile/forgetting_transformer=forgetting_gate_6_8_512
86
+ - strategy=single_device
87
+ - datamodule=npy
88
+ - schedule=warmup_cosine
89
+ - optimizer=adamw
90
+ - model=forgetting_transformer
91
+ - data_dir=/workspace/forgetting-transformer/data
92
+ - fabric.devices=1
93
+ - fabric.precision=16-mixed
94
+ - seed=42
95
+ - exp=forgetting_gate_6_8_512
96
+ - tag=forgetting_gate_6_8_512
97
+ - output_dir=./forgetting_gate_6_8_512/
98
+ - wandb.log_dir=./output/wandb
99
+ - wandb.mode=online
100
+ - resume=false
101
+ job:
102
+ name: train
103
+ chdir: null
104
+ override_dirname: +experiment/pile/forgetting_transformer=forgetting_gate_6_8_512,data_dir=/workspace/forgetting-transformer/data,datamodule=npy,exp=forgetting_gate_6_8_512,fabric.devices=1,fabric.precision=16-mixed,model=forgetting_transformer,optimizer=adamw,output_dir=./forgetting_gate_6_8_512/,resume=false,schedule=warmup_cosine,seed=42,strategy=single_device,tag=forgetting_gate_6_8_512,wandb.log_dir=./output/wandb,wandb.mode=online
105
+ id: ???
106
+ num: ???
107
+ config_name: config
108
+ env_set: {}
109
+ env_copy: []
110
+ config:
111
+ override_dirname:
112
+ kv_sep: '='
113
+ item_sep: ','
114
+ exclude_keys: []
115
+ runtime:
116
+ version: 1.3.2
117
+ version_base: '1.3'
118
+ cwd: /workspace/forgetting-transformer
119
+ config_sources:
120
+ - path: hydra.conf
121
+ schema: pkg
122
+ provider: hydra
123
+ - path: /workspace/forgetting-transformer/configs
124
+ schema: file
125
+ provider: main
126
+ - path: ''
127
+ schema: structured
128
+ provider: schema
129
+ output_dir: /workspace/forgetting-transformer/forgetting_gate_6_8_512
130
+ choices:
131
+ experiment/pile/forgetting_transformer: forgetting_gate_6_8_512
132
+ strategy: single_device
133
+ datamodule: npy
134
+ schedule: warmup_cosine
135
+ optimizer: adamw
136
+ model: forgetting_transformer
137
+ hydra/env: default
138
+ hydra/callbacks: null
139
+ hydra/job_logging: none
140
+ hydra/hydra_logging: none
141
+ hydra/hydra_help: default
142
+ hydra/help: default
143
+ hydra/sweeper: basic
144
+ hydra/launcher: basic
145
+ hydra/output: default
146
+ verbose: false
.hydra/overrides.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - +experiment/pile/forgetting_transformer=forgetting_gate_6_8_512
2
+ - strategy=single_device
3
+ - datamodule=npy
4
+ - schedule=warmup_cosine
5
+ - optimizer=adamw
6
+ - model=forgetting_transformer
7
+ - data_dir=/workspace/forgetting-transformer/data
8
+ - fabric.devices=1
9
+ - fabric.precision=16-mixed
10
+ - seed=42
11
+ - exp=forgetting_gate_6_8_512
12
+ - tag=forgetting_gate_6_8_512
13
+ - output_dir=./forgetting_gate_6_8_512/
14
+ - wandb.log_dir=./output/wandb
15
+ - wandb.mode=online
16
+ - resume=false
__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # for HF remote code
__pycache__/__init__.cpython-310.pyc ADDED
Binary file (549 Bytes). View file
 
__pycache__/configuration_transformer.cpython-310.pyc ADDED
Binary file (1.99 kB). View file
 
__pycache__/modeling_transformer.cpython-310.pyc ADDED
Binary file (15.2 kB). View file
 
checkpoints/step-000000209715200.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:697d70565b110ccd3ada207ed7b21e56e216b17e2fffb9928d903bedfdb564c6
3
+ size 863640826
checkpoints/step-000000209715200.pt.done ADDED
File without changes
checkpoints/step-000000209715200.pt.keep ADDED
File without changes
checkpoints/step-000000419430400.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0340f67aa43135d0ab0d57b9d019c367b6967eb07129fedc32d254a565e59c02
3
+ size 863640826
checkpoints/step-000000419430400.pt.done ADDED
File without changes
checkpoints/step-000000419430400.pt.keep ADDED
File without changes
checkpoints/step-000000629145600.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:261e4977576ea9d2abd1543a336220ddba150493734daacaddc8b6552f9d42fc
3
+ size 863640826
checkpoints/step-000000629145600.pt.done ADDED
File without changes
checkpoints/step-000000629145600.pt.keep ADDED
File without changes
checkpoints/step-000000838860800.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15d72a3b7ca85ef8d4110165a0a139a62c9be846d383bc3ecf9a8f13ee9dcbde
3
+ size 863640826
checkpoints/step-000000838860800.pt.done ADDED
File without changes
checkpoints/step-000000838860800.pt.keep ADDED
File without changes
checkpoints/step-000001048576000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22830c44c675725d69e055da50e077fc4ee100810317b13d8be6c31716258ef1
3
+ size 863640826
checkpoints/step-000001048576000.pt.done ADDED
File without changes
checkpoints/step-000001048576000.pt.keep ADDED
File without changes
checkpoints/step-000001258291200.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:182e39f4302da45a9b8a6cd54bebdbcf1e186faf1e0060b4f76a496a5d1f5ed4
3
+ size 863640826
checkpoints/step-000001258291200.pt.done ADDED
File without changes
checkpoints/step-000001258291200.pt.keep ADDED
File without changes
checkpoints/step-000001468006400.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25f47ea6d55c9d7f7161e5b704fdc34772ffa72fbe5aafbf232a77486f120bd6
3
+ size 863640826
checkpoints/step-000001468006400.pt.done ADDED
File without changes
checkpoints/step-000001468006400.pt.keep ADDED
File without changes
checkpoints/step-000001677721600.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c659e61f20c730f11df6bbb3fde2a910c06fb9cc4d539ad608f410e39f60e8e
3
+ size 863640826
checkpoints/step-000001677721600.pt.done ADDED
File without changes
checkpoints/step-000001677721600.pt.keep ADDED
File without changes
checkpoints/step-000001887436800.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72801b56885f8ac587582bc288ab8ad35f5268c1b0ec2224bdff1b679f2e63a6
3
+ size 863640826
checkpoints/step-000001887436800.pt.done ADDED
File without changes
checkpoints/step-000001887436800.pt.keep ADDED
File without changes
config.yaml ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ _target_: forgetting_transformer.model.forgetting_transformer.modeling_forgetting_transformer.ForgettingTransformerForCausalLM
3
+ config:
4
+ _target_: forgetting_transformer.model.forgetting_transformer.configuration_forgetting_transformer.ForgettingTransformerConfig
5
+ vocab_size: ???
6
+ hidden_size: 512
7
+ hidden_ratio: 4.0
8
+ intermediate_size: null
9
+ num_hidden_layers: 6
10
+ num_heads: 8
11
+ num_kv_heads: null
12
+ hidden_act: swish
13
+ window_size: null
14
+ max_position_embeddings: null
15
+ initializer_range: 0.02
16
+ elementwise_affine: true
17
+ norm_eps: 1.0e-06
18
+ use_cache: true
19
+ pad_token_id: null
20
+ bos_token_id: null
21
+ eos_token_id: null
22
+ tie_word_embeddings: false
23
+ attention_bias: false
24
+ fuse_norm: true
25
+ fuse_cross_entropy: true
26
+ rope_base: 500000.0
27
+ use_rope: false
28
+ use_output_gate: false
29
+ ogate_act: sigmoid
30
+ fgate_type: full
31
+ fgate_bias_init: false
32
+ decay_time_min: null
33
+ decay_time_max: null
34
+ use_output_norm: false
35
+ qk_norm: false
36
+ qk_norm_share_param_across_head: false
37
+ use_k_shift: false
38
+ use_v_shift: false
39
+ optimizer:
40
+ _target_: torch.optim.AdamW
41
+ lr: 0.001
42
+ betas:
43
+ - 0.9
44
+ - 0.95
45
+ weight_decay: 0.1
46
+ schedule:
47
+ _target_: forgetting_transformer.schedule.warmup_cosine_decay_schedule
48
+ init_value: 0.0
49
+ peak_value: 0.001
50
+ warmup_steps: 20971520
51
+ decay_steps: 2097152000
52
+ end_value: 0.0
53
+ datamodule:
54
+ _target_: forgetting_transformer.datamodule.npy.NpyDataModule
55
+ data_path: /workspace/forgetting-transformer/data
56
+ rank: ???
57
+ world_size: ???
58
+ train_batch_len: 2048
59
+ train_batch_size: 1024
60
+ train_num_workers: 0
61
+ eval_tokens: 2147483648
62
+ eval_batch_len: 2048
63
+ eval_local_batch_size: 1
64
+ eval_num_workers: 0
65
+ strategy:
66
+ _target_: lightning.fabric.strategies.SingleDeviceStrategy
67
+ device: cuda:0
68
+ exp: forgetting_gate_6_8_512
69
+ tag: forgetting_gate_6_8_512
70
+ seed: 42
71
+ hf_load_dir: null
72
+ hf_save_dir: null
73
+ hf_load_step: null
74
+ output_dir: /workspace/forgetting-transformer/forgetting_gate_6_8_512
75
+ data_dir: /workspace/forgetting-transformer/data
76
+ resume: false
77
+ fork_dir: null
78
+ fork_step: null
79
+ log_interval: 20971520
80
+ eval_interval: 41943040
81
+ final_eval: true
82
+ skip_eval: false
83
+ checkpoint_interval: 209715200
84
+ train_eval_interval: 104857600
85
+ checkpoint_keep_interval: 209715200
86
+ fabric:
87
+ devices: 1
88
+ precision: 16-mixed
89
+ train:
90
+ max_tokens: 2097152000
91
+ grad_acc_tokens: 32768
92
+ max_grad_norm: 1.0
93
+ gradient_checkpointing: true
94
+ bias_weight_decay: false
95
+ normalization_weight_decay: false
96
+ conv_weight_decay: true
97
+ eval:
98
+ min_val_length: 512
99
+ wandb:
100
+ project: forgetting-transformer
101
+ mode: online
102
+ log_dir: ./output/wandb
configuration_transformer.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# -*- coding: utf-8 -*-
"""HF remote-code configuration for the softmax-attention Transformer baseline
(FoX / forgetting-transformer project)."""

from typing import Optional

from transformers.configuration_utils import PretrainedConfig


class TransformerConfig(PretrainedConfig):
    """Stores the hyperparameters used to build the Transformer causal LM.

    Defaults mirror the original remote-code file; the values actually used
    by a given checkpoint come from the accompanying ``config.yaml`` (e.g.
    this run uses ``hidden_size=512``, ``num_hidden_layers=6``,
    ``num_heads=8``, ``use_rope=false``).
    """

    model_type = 'transformer-project_fox'
    # past_key_values is transient decoding state, not a config field.
    keys_to_ignore_at_inference = ['past_key_values']

    def __init__(
        self,
        vocab_size: int = 32000,
        hidden_size: int = 2048,
        hidden_ratio: Optional[int] = 4,
        intermediate_size: Optional[int] = None,
        num_hidden_layers: int = 24,
        num_heads: int = 32,
        # Was annotated `int = None`; None presumably means "no separate KV
        # heads" (i.e. MHA rather than GQA) — TODO confirm in modeling code.
        num_kv_heads: Optional[int] = None,
        hidden_act: str = "swish",
        window_size: Optional[int] = None,
        # The training config passes null here, so None must be accepted.
        max_position_embeddings: Optional[int] = 2048,
        initializer_range: float = 0.02,
        elementwise_affine: Optional[bool] = True,
        norm_eps: float = 1e-6,
        use_cache: bool = True,
        # Was annotated `int = None`; None is a valid value per config.yaml.
        pad_token_id: Optional[int] = None,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        tie_word_embeddings: bool = False,
        attention_bias: bool = False,
        fuse_norm: bool = True,
        fuse_cross_entropy: bool = True,
        rope_base: float = 500000.0,
        use_rope: bool = True,
        **kwargs,
    ):
        # Model geometry.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.hidden_ratio = hidden_ratio
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.window_size = window_size
        self.max_position_embeddings = max_position_embeddings

        # Activation / normalization / runtime options.
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.elementwise_affine = elementwise_affine
        self.norm_eps = norm_eps
        self.use_cache = use_cache
        self.attention_bias = attention_bias
        self.fuse_cross_entropy = fuse_cross_entropy
        self.fuse_norm = fuse_norm
        self.rope_base = rope_base
        self.use_rope = use_rope

        # Token ids and embedding tying are handled by the HF base class.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
decay_params.txt ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _forward_module.model.embeddings.weight
2
+ _forward_module.model.layers.0.attn.q_proj.weight
3
+ _forward_module.model.layers.0.attn.k_proj.weight
4
+ _forward_module.model.layers.0.attn.v_proj.weight
5
+ _forward_module.model.layers.0.attn.o_proj.weight
6
+ _forward_module.model.layers.0.attn.fgate_proj.weight
7
+ _forward_module.model.layers.0.mlp.gate_proj.weight
8
+ _forward_module.model.layers.0.mlp.down_proj.weight
9
+ _forward_module.model.layers.1.attn.q_proj.weight
10
+ _forward_module.model.layers.1.attn.k_proj.weight
11
+ _forward_module.model.layers.1.attn.v_proj.weight
12
+ _forward_module.model.layers.1.attn.o_proj.weight
13
+ _forward_module.model.layers.1.attn.fgate_proj.weight
14
+ _forward_module.model.layers.1.mlp.gate_proj.weight
15
+ _forward_module.model.layers.1.mlp.down_proj.weight
16
+ _forward_module.model.layers.2.attn.q_proj.weight
17
+ _forward_module.model.layers.2.attn.k_proj.weight
18
+ _forward_module.model.layers.2.attn.v_proj.weight
19
+ _forward_module.model.layers.2.attn.o_proj.weight
20
+ _forward_module.model.layers.2.attn.fgate_proj.weight
21
+ _forward_module.model.layers.2.mlp.gate_proj.weight
22
+ _forward_module.model.layers.2.mlp.down_proj.weight
23
+ _forward_module.model.layers.3.attn.q_proj.weight
24
+ _forward_module.model.layers.3.attn.k_proj.weight
25
+ _forward_module.model.layers.3.attn.v_proj.weight
26
+ _forward_module.model.layers.3.attn.o_proj.weight
27
+ _forward_module.model.layers.3.attn.fgate_proj.weight
28
+ _forward_module.model.layers.3.mlp.gate_proj.weight
29
+ _forward_module.model.layers.3.mlp.down_proj.weight
30
+ _forward_module.model.layers.4.attn.q_proj.weight
31
+ _forward_module.model.layers.4.attn.k_proj.weight
32
+ _forward_module.model.layers.4.attn.v_proj.weight
33
+ _forward_module.model.layers.4.attn.o_proj.weight
34
+ _forward_module.model.layers.4.attn.fgate_proj.weight
35
+ _forward_module.model.layers.4.mlp.gate_proj.weight
36
+ _forward_module.model.layers.4.mlp.down_proj.weight
37
+ _forward_module.model.layers.5.attn.q_proj.weight
38
+ _forward_module.model.layers.5.attn.k_proj.weight
39
+ _forward_module.model.layers.5.attn.v_proj.weight
40
+ _forward_module.model.layers.5.attn.o_proj.weight
41
+ _forward_module.model.layers.5.attn.fgate_proj.weight
42
+ _forward_module.model.layers.5.mlp.gate_proj.weight
43
+ _forward_module.model.layers.5.mlp.down_proj.weight
44
+ _forward_module.lm_head.weight
logs/2025-10-17_13-25-22.log ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2025-10-17 13:25:22][train:372][INFO] All outputs will be saved to `/workspace/forgetting-transformer/forgetting_gate_6_8_512`
2
+ [2025-10-17 13:25:22][train:375][INFO] Configuration:
3
+ [2025-10-17 13:25:22][train:380][INFO] Configuration saved to /workspace/forgetting-transformer/forgetting_gate_6_8_512/config.yaml.
4
+ [2025-10-17 13:25:22][train:387][INFO] creating datamodule
5
+ [2025-10-17 13:25:22][train:419][INFO] creating model
6
+ [2025-10-17 13:25:22][train:440][INFO] creating optimizer
7
+ [2025-10-17 13:25:22][checkpoint:39][INFO] Not resuming. Deleting existing checkpoints...
8
+ [2025-10-17 13:25:22][logger:256][INFO] Setting up wandb logger...
9
+ [2025-10-17 13:25:22][logger:272][INFO] Not resuming. Creating a new wandb run.
10
+ [2025-10-17 13:25:23][logger:288][INFO] wandb initialized. Run id: y8zione3
11
+ [2025-10-17 13:25:23][logger:186][INFO] Setting up jsonlines logger...
12
+ [2025-10-17 13:25:23][logger:113][INFO] Setting up npz logger...
13
+ [2025-10-17 13:25:23][logger:171][INFO] [step: 0] [train_data_info/vocab_size: 50277] [train_data_info/global_tokens_per_batch: 2097152] [train_data_info/local_tokens_per_batch: 2097152] [train_data_info/batch_len: 2048] [train_data_info/seq_len: 2048] [train_data_info/total_tokens: 2055208960] [train_data_info/global_batch_size: 1024] [train_data_info/local_batch_size: 1024]
14
+ [2025-10-17 13:25:23][logger:171][INFO] [step: 0] [val_data_info/vocab_size: 50277] [val_data_info/global_tokens_per_batch: 2048] [val_data_info/local_tokens_per_batch: 2048] [val_data_info/batch_len: 2048] [val_data_info/seq_len: 2048] [val_data_info/total_tokens: 2147483648] [val_data_info/global_batch_size: 1] [val_data_info/local_batch_size: 1]
15
+ [2025-10-17 13:25:23][logger:171][INFO] [step: 0] [model_info/total_params: 71962160] [model_info/trainable_params: 71962160] [model_info/embedding_params: 25741824] [model_info/flops_per_token: 0] [model_info/non_embedding_params: 46220336]
16
+ [2025-10-17 13:27:00][utils:57][INFO] [P: 1.00%] [S: 20971520/2097152000] [T: 0:01:37] [ETA: 2:40:22] [loss: 8.766] [tokens/s: 224397.037] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
17
+ [2025-10-17 13:28:34][utils:57][INFO] [P: 2.00%] [S: 41943040/2097152000] [T: 0:03:10] [ETA: 2:35:38] [loss: 7.474] [tokens/s: 224495.115] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
18
+ [2025-10-17 13:28:34][train:194][INFO] Running validation...
19
+ [2025-10-17 13:31:24][logger:171][INFO] [step: 41943040] [val/train_token_count: 41943040] [val/train_batch_count: 20] [val/train_flop_count: 0] [val/train_total_time: 190.580] [val/train_update_time: 190.214] [val/loss: 7.451] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 170.258] [val/val_tokens_per_second: 240575.655] [val/loss_avg_len_2048: 7.451] [val/perplexity_len_2048: 1721.369] [val/loss_avg_len_1024: 7.449] [val/perplexity_len_1024: 1718.729] [val/loss_avg_len_512: 7.450] [val/perplexity_len_512: 1719.848]
20
+ [2025-10-17 13:32:57][utils:57][INFO] [P: 3.00%] [S: 62914560/2097152000] [T: 0:07:34] [ETA: 4:04:46] [loss: 7.090] [tokens/s: 137862.779] [batches/s: 0.066] [MFU: 0.000] [TFLOPS: 0.000]
21
+ [2025-10-17 13:34:31][utils:57][INFO] [P: 4.00%] [S: 83886080/2097152000] [T: 0:09:07] [ETA: 3:39:02] [loss: 6.745] [tokens/s: 153012.985] [batches/s: 0.073] [MFU: 0.000] [TFLOPS: 0.000]
22
+ [2025-10-17 13:34:31][train:194][INFO] Running validation...
23
+ [2025-10-17 13:37:21][logger:171][INFO] [step: 83886080] [val/train_token_count: 83886080] [val/train_batch_count: 40] [val/train_flop_count: 0] [val/train_total_time: 547.612] [val/train_update_time: 376.676] [val/loss: 6.732] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 170.363] [val/val_tokens_per_second: 240427.489] [val/loss_avg_len_2048: 6.732] [val/perplexity_len_2048: 839.103] [val/loss_avg_len_1024: 6.732] [val/perplexity_len_1024: 838.523] [val/loss_avg_len_512: 6.734] [val/perplexity_len_512: 840.603]
24
+ [2025-10-17 13:38:54][utils:57][INFO] [P: 5.00%] [S: 104857600/2097152000] [T: 0:13:31] [ETA: 4:16:54] [loss: 6.485] [tokens/s: 128739.931] [batches/s: 0.061] [MFU: 0.000] [TFLOPS: 0.000]
25
+ [2025-10-17 13:38:54][logger:171][INFO] [step: 104857600] [train_eval/train_token_count: 104857600] [train_eval/train_batch_count: 50] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 811.291] [train_eval/train_update_time: 469.829] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 7.719] [train_eval/perplexity_len_2048: 2250.922] [train_eval/loss_avg_len_1024: 7.720] [train_eval/perplexity_len_1024: 2251.843] [train_eval/loss_avg_len_512: 7.719] [train_eval/perplexity_len_512: 2251.557]
26
+ [2025-10-17 13:40:28][utils:57][INFO] [P: 6.00%] [S: 125829120/2097152000] [T: 0:15:04] [ETA: 3:56:12] [loss: 6.212] [tokens/s: 138789.400] [batches/s: 0.066] [MFU: 0.000] [TFLOPS: 0.000]
27
+ [2025-10-17 13:40:28][train:194][INFO] Running validation...
28
+ [2025-10-17 13:43:17][logger:171][INFO] [step: 125829120] [val/train_token_count: 125829120] [val/train_batch_count: 60] [val/train_flop_count: 0] [val/train_total_time: 904.598] [val/train_update_time: 562.983] [val/loss: 6.203] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 168.996] [val/val_tokens_per_second: 242372.209] [val/loss_avg_len_2048: 6.203] [val/perplexity_len_2048: 494.013] [val/loss_avg_len_1024: 6.204] [val/perplexity_len_1024: 494.586] [val/loss_avg_len_512: 6.209] [val/perplexity_len_512: 497.257]
29
+ [2025-10-17 13:44:50][utils:57][INFO] [P: 7.00%] [S: 146800640/2097152000] [T: 0:19:27] [ETA: 4:18:24] [loss: 6.016] [tokens/s: 125399.720] [batches/s: 0.060] [MFU: 0.000] [TFLOPS: 0.000]
30
+ [2025-10-17 13:46:24][utils:57][INFO] [P: 8.00%] [S: 167772160/2097152000] [T: 0:21:00] [ETA: 4:01:34] [loss: 5.953] [tokens/s: 132828.780] [batches/s: 0.063] [MFU: 0.000] [TFLOPS: 0.000]
31
+ [2025-10-17 13:46:24][train:194][INFO] Running validation...
32
+ [2025-10-17 13:49:12][logger:171][INFO] [step: 167772160] [val/train_token_count: 167772160] [val/train_batch_count: 80] [val/train_flop_count: 0] [val/train_total_time: 1260.372] [val/train_update_time: 749.470] [val/loss: 5.855] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 168.650] [val/val_tokens_per_second: 242870.421] [val/loss_avg_len_2048: 5.855] [val/perplexity_len_2048: 348.901] [val/loss_avg_len_1024: 5.857] [val/perplexity_len_1024: 349.847] [val/loss_avg_len_512: 5.866] [val/perplexity_len_512: 352.803]
33
+ [2025-10-17 13:50:46][utils:57][INFO] [P: 9.00%] [S: 188743680/2097152000] [T: 0:25:22] [ETA: 4:16:33] [loss: 5.676] [tokens/s: 123657.351] [batches/s: 0.059] [MFU: 0.000] [TFLOPS: 0.000]
34
+ [2025-10-17 13:52:19][utils:57][INFO] [P: 10.00%] [S: 209715200/2097152000] [T: 0:26:55] [ETA: 4:02:22] [loss: 5.544] [tokens/s: 129541.235] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000]
35
+ [2025-10-17 13:52:19][logger:171][INFO] [step: 209715200] [train_eval/train_token_count: 209715200] [train_eval/train_batch_count: 100] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1615.807] [train_eval/train_update_time: 935.960] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.958] [train_eval/perplexity_len_2048: 386.812] [train_eval/loss_avg_len_1024: 5.962] [train_eval/perplexity_len_1024: 388.314] [train_eval/loss_avg_len_512: 5.967] [train_eval/perplexity_len_512: 390.486]
36
+ [2025-10-17 13:52:19][train:194][INFO] Running validation...
37
+ [2025-10-17 13:55:09][logger:171][INFO] [step: 209715200] [val/train_token_count: 209715200] [val/train_batch_count: 100] [val/train_flop_count: 0] [val/train_total_time: 1615.807] [val/train_update_time: 935.960] [val/loss: 5.550] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 169.848] [val/val_tokens_per_second: 241156.333] [val/loss_avg_len_2048: 5.550] [val/perplexity_len_2048: 257.167] [val/loss_avg_len_1024: 5.555] [val/perplexity_len_1024: 258.430] [val/loss_avg_len_512: 5.566] [val/perplexity_len_512: 261.382]
38
+ [2025-10-17 13:55:09][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000000209715200.pt...
39
+ [2025-10-17 13:55:10][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000000209715200.pt.
40
+ [2025-10-17 13:55:10][logger:171][INFO] [step: 209715200] [checkpoint/checkpoint_time: 1.537]
41
+ [2025-10-17 13:56:44][utils:57][INFO] [P: 11.00%] [S: 230686720/2097152000] [T: 0:31:20] [ETA: 4:13:34] [loss: 5.430] [tokens/s: 117039.835] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000]
42
+ [2025-10-17 13:58:17][utils:57][INFO] [P: 12.00%] [S: 251658240/2097152000] [T: 0:32:53] [ETA: 4:01:15] [loss: 5.281] [tokens/s: 129451.197] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000]
43
+ [2025-10-17 13:58:17][train:194][INFO] Running validation...
44
+ [2025-10-17 14:01:07][logger:171][INFO] [step: 251658240] [val/train_token_count: 251658240] [val/train_batch_count: 120] [val/train_flop_count: 0] [val/train_total_time: 1973.999] [val/train_update_time: 1122.469] [val/loss: 5.291] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 169.914] [val/val_tokens_per_second: 241063.779] [val/loss_avg_len_2048: 5.291] [val/perplexity_len_2048: 198.547] [val/loss_avg_len_1024: 5.298] [val/perplexity_len_1024: 199.923] [val/loss_avg_len_512: 5.312] [val/perplexity_len_512: 202.815]
45
+ [2025-10-17 14:02:40][utils:57][INFO] [P: 13.00%] [S: 272629760/2097152000] [T: 0:37:17] [ETA: 4:09:32] [loss: 5.194] [tokens/s: 117053.423] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000]
46
+ [2025-10-17 14:04:14][utils:57][INFO] [P: 14.00%] [S: 293601280/2097152000] [T: 0:38:50] [ETA: 3:58:38] [loss: 5.120] [tokens/s: 129470.825] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000]
47
+ [2025-10-17 14:04:14][train:194][INFO] Running validation...
48
+ [2025-10-17 14:07:03][logger:171][INFO] [step: 293601280] [val/train_token_count: 293601280] [val/train_batch_count: 140] [val/train_flop_count: 0] [val/train_total_time: 2330.883] [val/train_update_time: 1309.140] [val/loss: 5.083] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 168.715] [val/val_tokens_per_second: 242776.387] [val/loss_avg_len_2048: 5.083] [val/perplexity_len_2048: 161.221] [val/loss_avg_len_1024: 5.092] [val/perplexity_len_1024: 162.644] [val/loss_avg_len_512: 5.109] [val/perplexity_len_512: 165.552]
49
+ [2025-10-17 14:08:36][utils:57][INFO] [P: 15.00%] [S: 314572800/2097152000] [T: 0:43:13] [ETA: 4:04:54] [loss: 4.975] [tokens/s: 117131.159] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000]
50
+ [2025-10-17 14:08:36][logger:171][INFO] [step: 314572800] [train_eval/train_token_count: 314572800] [train_eval/train_batch_count: 150] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2593.153] [train_eval/train_update_time: 1402.553] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.251] [train_eval/perplexity_len_2048: 190.849] [train_eval/loss_avg_len_1024: 5.258] [train_eval/perplexity_len_1024: 192.163] [train_eval/loss_avg_len_512: 5.272] [train_eval/perplexity_len_512: 194.825]
51
+ [2025-10-17 14:10:10][utils:57][INFO] [P: 16.00%] [S: 335544320/2097152000] [T: 0:44:46] [ETA: 3:55:06] [loss: 4.917] [tokens/s: 129440.330] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000]
52
+ [2025-10-17 14:10:10][train:194][INFO] Running validation...
53
+ [2025-10-17 14:12:59][logger:171][INFO] [step: 335544320] [val/train_token_count: 335544320] [val/train_batch_count: 160] [val/train_flop_count: 0] [val/train_total_time: 2686.886] [val/train_update_time: 1496.139] [val/loss: 4.900] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 168.974] [val/val_tokens_per_second: 242404.252] [val/loss_avg_len_2048: 4.900] [val/perplexity_len_2048: 134.324] [val/loss_avg_len_1024: 4.911] [val/perplexity_len_1024: 135.834] [val/loss_avg_len_512: 4.933] [val/perplexity_len_512: 138.750]
54
+ [2025-10-17 14:14:33][utils:57][INFO] [P: 17.00%] [S: 356515840/2097152000] [T: 0:49:09] [ETA: 3:59:59] [loss: 4.807] [tokens/s: 117100.911] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000]
55
+ [2025-10-17 14:16:06][utils:57][INFO] [P: 18.00%] [S: 377487360/2097152000] [T: 0:50:42] [ETA: 3:51:02] [loss: 4.759] [tokens/s: 129387.893] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000]
56
+ [2025-10-17 14:16:06][train:194][INFO] Running validation...
57
+ [2025-10-17 14:18:55][logger:171][INFO] [step: 377487360] [val/train_token_count: 377487360] [val/train_batch_count: 180] [val/train_flop_count: 0] [val/train_total_time: 3042.959] [val/train_update_time: 1682.947] [val/loss: 4.740] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 168.961] [val/val_tokens_per_second: 242422.916] [val/loss_avg_len_2048: 4.740] [val/perplexity_len_2048: 114.410] [val/loss_avg_len_1024: 4.754] [val/perplexity_len_1024: 116.060] [val/loss_avg_len_512: 4.781] [val/perplexity_len_512: 119.175]
58
+ [2025-10-17 14:20:29][utils:57][INFO] [P: 19.00%] [S: 398458880/2097152000] [T: 0:55:05] [ETA: 3:54:51] [loss: 4.722] [tokens/s: 117061.568] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000]
59
+ [2025-10-17 14:22:02][utils:57][INFO] [P: 20.00%] [S: 419430400/2097152000] [T: 0:56:39] [ETA: 3:46:36] [loss: 4.612] [tokens/s: 129558.046] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000]
60
+ [2025-10-17 14:22:02][logger:171][INFO] [step: 419430400] [train_eval/train_token_count: 419430400] [train_eval/train_batch_count: 200] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3399.030] [train_eval/train_update_time: 1869.779] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.790] [train_eval/perplexity_len_2048: 120.269] [train_eval/loss_avg_len_1024: 4.804] [train_eval/perplexity_len_1024: 121.974] [train_eval/loss_avg_len_512: 4.828] [train_eval/perplexity_len_512: 125.018]
61
+ [2025-10-17 14:22:02][train:194][INFO] Running validation...
62
+ [2025-10-17 14:24:52][logger:171][INFO] [step: 419430400] [val/train_token_count: 419430400] [val/train_batch_count: 200] [val/train_flop_count: 0] [val/train_total_time: 3399.030] [val/train_update_time: 1869.779] [val/loss: 4.606] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 169.336] [val/val_tokens_per_second: 241886.560] [val/loss_avg_len_2048: 4.606] [val/perplexity_len_2048: 100.052] [val/loss_avg_len_1024: 4.625] [val/perplexity_len_1024: 102.018] [val/loss_avg_len_512: 4.660] [val/perplexity_len_512: 105.621]
63
+ [2025-10-17 14:24:52][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000000419430400.pt...
64
+ [2025-10-17 14:24:53][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000000419430400.pt.
65
+ [2025-10-17 14:24:53][logger:171][INFO] [step: 419430400] [checkpoint/checkpoint_time: 1.573]
66
+ [2025-10-17 14:26:27][utils:57][INFO] [P: 21.00%] [S: 440401920/2097152000] [T: 1:01:03] [ETA: 3:49:41] [loss: 4.519] [tokens/s: 117061.899] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000]
67
+ [2025-10-17 14:28:00][utils:57][INFO] [P: 22.00%] [S: 461373440/2097152000] [T: 1:02:36] [ETA: 3:41:59] [loss: 4.450] [tokens/s: 129465.568] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000]
68
+ [2025-10-17 14:28:00][train:194][INFO] Running validation...
69
+ [2025-10-17 14:30:49][logger:171][INFO] [step: 461373440] [val/train_token_count: 461373440] [val/train_batch_count: 220] [val/train_flop_count: 0] [val/train_total_time: 3756.890] [val/train_update_time: 2056.446] [val/loss: 4.443] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 169.172] [val/val_tokens_per_second: 242121.101] [val/loss_avg_len_2048: 4.443] [val/perplexity_len_2048: 85.004] [val/loss_avg_len_1024: 4.471] [val/perplexity_len_1024: 87.458] [val/loss_avg_len_512: 4.518] [val/perplexity_len_512: 91.669]
70
+ [2025-10-17 14:32:23][utils:57][INFO] [P: 23.00%] [S: 482344960/2097152000] [T: 1:06:59] [ETA: 3:44:16] [loss: 4.342] [tokens/s: 117110.986] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000]
71
+ [2025-10-17 14:33:56][utils:57][INFO] [P: 24.00%] [S: 503316480/2097152000] [T: 1:08:32] [ETA: 3:37:04] [loss: 4.284] [tokens/s: 129434.957] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000]
72
+ [2025-10-17 14:33:56][train:194][INFO] Running validation...
73
+ [2025-10-17 14:36:46][logger:171][INFO] [step: 503316480] [val/train_token_count: 503316480] [val/train_batch_count: 240] [val/train_flop_count: 0] [val/train_total_time: 4112.965] [val/train_update_time: 2243.070] [val/loss: 4.284] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 169.933] [val/val_tokens_per_second: 241036.785] [val/loss_avg_len_2048: 4.284] [val/perplexity_len_2048: 72.514] [val/loss_avg_len_1024: 4.324] [val/perplexity_len_1024: 75.494] [val/loss_avg_len_512: 4.386] [val/perplexity_len_512: 80.310]
74
+ [2025-10-17 14:38:20][utils:57][INFO] [P: 25.00%] [S: 524288000/2097152000] [T: 1:12:56] [ETA: 3:38:49] [loss: 4.202] [tokens/s: 117042.739] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000]
75
+ [2025-10-17 14:38:20][logger:171][INFO] [step: 524288000] [train_eval/train_token_count: 524288000] [train_eval/train_batch_count: 250] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4376.394] [train_eval/train_update_time: 2336.412] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.402] [train_eval/perplexity_len_2048: 81.624] [train_eval/loss_avg_len_1024: 4.431] [train_eval/perplexity_len_1024: 84.024] [train_eval/loss_avg_len_512: 4.480] [train_eval/perplexity_len_512: 88.247]
76
+ [2025-10-17 14:39:53][utils:57][INFO] [P: 26.00%] [S: 545259520/2097152000] [T: 1:14:30] [ETA: 3:32:02] [loss: 4.142] [tokens/s: 129367.533] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000]
77
+ [2025-10-17 14:39:53][train:194][INFO] Running validation...
78
+ [2025-10-17 14:42:44][logger:171][INFO] [step: 545259520] [val/train_token_count: 545259520] [val/train_batch_count: 260] [val/train_flop_count: 0] [val/train_total_time: 4470.064] [val/train_update_time: 2429.937] [val/loss: 4.142] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 170.504] [val/val_tokens_per_second: 240229.366] [val/loss_avg_len_2048: 4.142] [val/perplexity_len_2048: 62.948] [val/loss_avg_len_1024: 4.191] [val/perplexity_len_1024: 66.065] [val/loss_avg_len_512: 4.262] [val/perplexity_len_512: 70.971]
79
+ [2025-10-17 14:44:17][utils:57][INFO] [P: 27.00%] [S: 566231040/2097152000] [T: 1:18:54] [ETA: 3:33:19] [loss: 4.100] [tokens/s: 116945.670] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000]
80
+ [2025-10-17 14:45:51][utils:57][INFO] [P: 28.00%] [S: 587202560/2097152000] [T: 1:20:27] [ETA: 3:26:54] [loss: 4.009] [tokens/s: 129229.881] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000]
81
+ [2025-10-17 14:45:51][train:194][INFO] Running validation...
82
+ [2025-10-17 14:48:41][logger:171][INFO] [step: 587202560] [val/train_token_count: 587202560] [val/train_batch_count: 280] [val/train_flop_count: 0] [val/train_total_time: 4827.831] [val/train_update_time: 2616.888] [val/loss: 4.030] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 169.519] [val/val_tokens_per_second: 241625.372] [val/loss_avg_len_2048: 4.030] [val/perplexity_len_2048: 56.278] [val/loss_avg_len_1024: 4.085] [val/perplexity_len_1024: 59.439] [val/loss_avg_len_512: 4.164] [val/perplexity_len_512: 64.319]
83
+ [2025-10-17 14:50:14][utils:57][INFO] [P: 29.00%] [S: 608174080/2097152000] [T: 1:24:50] [ETA: 3:27:44] [loss: 3.976] [tokens/s: 116893.693] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000]
84
+ [2025-10-17 14:51:48][utils:57][INFO] [P: 30.00%] [S: 629145600/2097152000] [T: 1:26:24] [ETA: 3:21:37] [loss: 3.923] [tokens/s: 129324.004] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000]
85
+ [2025-10-17 14:51:48][logger:171][INFO] [step: 629145600] [train_eval/train_token_count: 629145600] [train_eval/train_batch_count: 300] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5184.687] [train_eval/train_update_time: 2803.937] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.067] [train_eval/perplexity_len_2048: 58.370] [train_eval/loss_avg_len_1024: 4.115] [train_eval/perplexity_len_1024: 61.243] [train_eval/loss_avg_len_512: 4.189] [train_eval/perplexity_len_512: 65.979]
86
+ [2025-10-17 14:51:48][train:194][INFO] Running validation...
87
+ [2025-10-17 14:54:39][logger:171][INFO] [step: 629145600] [val/train_token_count: 629145600] [val/train_batch_count: 300] [val/train_flop_count: 0] [val/train_total_time: 5184.687] [val/train_update_time: 2803.937] [val/loss: 3.937] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 170.900] [val/val_tokens_per_second: 239671.834] [val/loss_avg_len_2048: 3.937] [val/perplexity_len_2048: 51.259] [val/loss_avg_len_1024: 3.996] [val/perplexity_len_1024: 54.360] [val/loss_avg_len_512: 4.079] [val/perplexity_len_512: 59.094]
88
+ [2025-10-17 14:54:39][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000000629145600.pt...
89
+ [2025-10-17 14:54:40][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000000629145600.pt.
90
+ [2025-10-17 14:54:40][logger:171][INFO] [step: 629145600] [checkpoint/checkpoint_time: 1.536]
91
+ [2025-10-17 14:56:14][utils:57][INFO] [P: 31.00%] [S: 650117120/2097152000] [T: 1:30:50] [ETA: 3:22:12] [loss: 3.941] [tokens/s: 116777.554] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000]
92
+ [2025-10-17 14:57:47][utils:57][INFO] [P: 32.00%] [S: 671088640/2097152000] [T: 1:32:23] [ETA: 3:16:20] [loss: 3.851] [tokens/s: 129069.442] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000]
93
+ [2025-10-17 14:57:47][train:194][INFO] Running validation...
94
+ [2025-10-17 15:00:36][logger:171][INFO] [step: 671088640] [val/train_token_count: 671088640] [val/train_batch_count: 320] [val/train_flop_count: 0] [val/train_total_time: 5543.976] [val/train_update_time: 2990.484] [val/loss: 3.873] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 168.915] [val/val_tokens_per_second: 242488.330] [val/loss_avg_len_2048: 3.873] [val/perplexity_len_2048: 48.090] [val/loss_avg_len_1024: 3.936] [val/perplexity_len_1024: 51.188] [val/loss_avg_len_512: 4.023] [val/perplexity_len_512: 55.892]
95
+ [2025-10-17 15:02:09][utils:57][INFO] [P: 33.00%] [S: 692060160/2097152000] [T: 1:36:46] [ETA: 3:16:28] [loss: 3.870] [tokens/s: 116818.422] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000]
96
+ [2025-10-17 15:03:43][utils:57][INFO] [P: 34.00%] [S: 713031680/2097152000] [T: 1:38:19] [ETA: 3:10:52] [loss: 3.811] [tokens/s: 129163.988] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000]
97
+ [2025-10-17 15:03:43][train:194][INFO] Running validation...
98
+ [2025-10-17 15:06:32][logger:171][INFO] [step: 713031680] [val/train_token_count: 713031680] [val/train_batch_count: 340] [val/train_flop_count: 0] [val/train_total_time: 5899.682] [val/train_update_time: 3176.984] [val/loss: 3.807] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 168.984] [val/val_tokens_per_second: 242390.207] [val/loss_avg_len_2048: 3.807] [val/perplexity_len_2048: 44.993] [val/loss_avg_len_1024: 3.871] [val/perplexity_len_1024: 47.998] [val/loss_avg_len_512: 3.962] [val/perplexity_len_512: 52.542]
99
+ [2025-10-17 15:08:05][utils:57][INFO] [P: 35.00%] [S: 734003200/2097152000] [T: 1:42:42] [ETA: 3:10:44] [loss: 3.776] [tokens/s: 116870.243] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000]
100
+ [2025-10-17 15:08:05][logger:171][INFO] [step: 734003200] [train_eval/train_token_count: 734003200] [train_eval/train_batch_count: 350] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6162.256] [train_eval/train_update_time: 3270.439] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.853] [train_eval/perplexity_len_2048: 47.149] [train_eval/loss_avg_len_1024: 3.912] [train_eval/perplexity_len_1024: 50.006] [train_eval/loss_avg_len_512: 3.999] [train_eval/perplexity_len_512: 54.532]
101
+ [2025-10-17 15:09:39][utils:57][INFO] [P: 36.00%] [S: 754974720/2097152000] [T: 1:44:15] [ETA: 3:05:21] [loss: 3.728] [tokens/s: 129273.407] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000]
102
+ [2025-10-17 15:09:39][train:194][INFO] Running validation...
103
+ [2025-10-17 15:12:28][logger:171][INFO] [step: 754974720] [val/train_token_count: 754974720] [val/train_batch_count: 360] [val/train_flop_count: 0] [val/train_total_time: 6255.944] [val/train_update_time: 3363.994] [val/loss: 3.765] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 169.211] [val/val_tokens_per_second: 242064.373] [val/loss_avg_len_2048: 3.765] [val/perplexity_len_2048: 43.158] [val/loss_avg_len_1024: 3.831] [val/perplexity_len_1024: 46.130] [val/loss_avg_len_512: 3.923] [val/perplexity_len_512: 50.564]
104
+ [2025-10-17 15:14:02][utils:57][INFO] [P: 37.00%] [S: 775946240/2097152000] [T: 1:48:38] [ETA: 3:04:59] [loss: 3.727] [tokens/s: 116957.665] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000]
105
+ [2025-10-17 15:15:35][utils:57][INFO] [P: 38.00%] [S: 796917760/2097152000] [T: 1:50:12] [ETA: 2:59:48] [loss: 3.700] [tokens/s: 129322.966] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000]
106
+ [2025-10-17 15:15:35][train:194][INFO] Running validation...
107
+ [2025-10-17 15:18:24][logger:171][INFO] [step: 796917760] [val/train_token_count: 796917760] [val/train_batch_count: 380] [val/train_flop_count: 0] [val/train_total_time: 6612.111] [val/train_update_time: 3550.668] [val/loss: 3.712] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 169.164] [val/val_tokens_per_second: 242131.208] [val/loss_avg_len_2048: 3.712] [val/perplexity_len_2048: 40.924] [val/loss_avg_len_1024: 3.780] [val/perplexity_len_1024: 43.797] [val/loss_avg_len_512: 3.874] [val/perplexity_len_512: 48.126]
108
+ [2025-10-17 15:19:58][utils:57][INFO] [P: 39.00%] [S: 817889280/2097152000] [T: 1:54:34] [ETA: 2:59:12] [loss: 3.699] [tokens/s: 117012.521] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000]
109
+ [2025-10-17 15:21:31][utils:57][INFO] [P: 40.00%] [S: 838860800/2097152000] [T: 1:56:08] [ETA: 2:54:12] [loss: 3.581] [tokens/s: 129623.687] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000]
110
+ [2025-10-17 15:21:31][logger:171][INFO] [step: 838860800] [train_eval/train_token_count: 838860800] [train_eval/train_batch_count: 400] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6968.172] [train_eval/train_update_time: 3737.291] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.723] [train_eval/perplexity_len_2048: 41.377] [train_eval/loss_avg_len_1024: 3.785] [train_eval/perplexity_len_1024: 44.040] [train_eval/loss_avg_len_512: 3.879] [train_eval/perplexity_len_512: 48.372]
111
+ [2025-10-17 15:21:31][train:194][INFO] Running validation...
112
+ [2025-10-17 15:24:21][logger:171][INFO] [step: 838860800] [val/train_token_count: 838860800] [val/train_batch_count: 400] [val/train_flop_count: 0] [val/train_total_time: 6968.172] [val/train_update_time: 3737.291] [val/loss: 3.682] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 170.108] [val/val_tokens_per_second: 240788.507] [val/loss_avg_len_2048: 3.682] [val/perplexity_len_2048: 39.729] [val/loss_avg_len_1024: 3.751] [val/perplexity_len_1024: 42.563] [val/loss_avg_len_512: 3.847] [val/perplexity_len_512: 46.837]
113
+ [2025-10-17 15:24:21][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000000838860800.pt...
114
+ [2025-10-17 15:24:23][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000000838860800.pt.
115
+ [2025-10-17 15:24:23][logger:171][INFO] [step: 838860800] [checkpoint/checkpoint_time: 1.590]
116
+ [2025-10-17 15:25:56][utils:57][INFO] [P: 41.00%] [S: 859832320/2097152000] [T: 2:00:33] [ETA: 2:53:28] [loss: 3.623] [tokens/s: 117076.347] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000]
117
+ [2025-10-17 15:27:30][utils:57][INFO] [P: 42.00%] [S: 880803840/2097152000] [T: 2:02:06] [ETA: 2:48:38] [loss: 3.624] [tokens/s: 129380.861] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000]
118
+ [2025-10-17 15:27:30][train:194][INFO] Running validation...
119
+ [2025-10-17 15:30:21][logger:171][INFO] [step: 880803840] [val/train_token_count: 880803840] [val/train_batch_count: 420] [val/train_flop_count: 0] [val/train_total_time: 7326.923] [val/train_update_time: 3924.072] [val/loss: 3.641] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 170.913] [val/val_tokens_per_second: 239654.332] [val/loss_avg_len_2048: 3.641] [val/perplexity_len_2048: 38.130] [val/loss_avg_len_1024: 3.710] [val/perplexity_len_1024: 40.872] [val/loss_avg_len_512: 3.807] [val/perplexity_len_512: 45.009]
120
+ [2025-10-17 15:31:54][utils:57][INFO] [P: 43.00%] [S: 901775360/2097152000] [T: 2:06:31] [ETA: 2:47:42] [loss: 3.623] [tokens/s: 116917.429] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000]
121
+ [2025-10-17 15:33:28][utils:57][INFO] [P: 44.00%] [S: 922746880/2097152000] [T: 2:08:04] [ETA: 2:43:00] [loss: 3.657] [tokens/s: 129213.526] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000]
122
+ [2025-10-17 15:33:28][train:194][INFO] Running validation...
123
+ [2025-10-17 15:36:18][logger:171][INFO] [step: 922746880] [val/train_token_count: 922746880] [val/train_batch_count: 440] [val/train_flop_count: 0] [val/train_total_time: 7684.845] [val/train_update_time: 4110.783] [val/loss: 3.620] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 170.392] [val/val_tokens_per_second: 240386.300] [val/loss_avg_len_2048: 3.620] [val/perplexity_len_2048: 37.324] [val/loss_avg_len_1024: 3.689] [val/perplexity_len_1024: 40.005] [val/loss_avg_len_512: 3.786] [val/perplexity_len_512: 44.090]
124
+ [2025-10-17 15:37:52][utils:57][INFO] [P: 45.00%] [S: 943718400/2097152000] [T: 2:12:28] [ETA: 2:41:55] [loss: 3.605] [tokens/s: 116818.084] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000]
125
+ [2025-10-17 15:37:52][logger:171][INFO] [step: 943718400] [train_eval/train_token_count: 943718400] [train_eval/train_batch_count: 450] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7948.908] [train_eval/train_update_time: 4204.297] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.631] [train_eval/perplexity_len_2048: 37.762] [train_eval/loss_avg_len_1024: 3.699] [train_eval/perplexity_len_1024: 40.392] [train_eval/loss_avg_len_512: 3.795] [train_eval/perplexity_len_512: 44.500]
126
+ [2025-10-17 15:39:26][utils:57][INFO] [P: 46.00%] [S: 964689920/2097152000] [T: 2:14:02] [ETA: 2:37:21] [loss: 3.573] [tokens/s: 129107.341] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000]
127
+ [2025-10-17 15:39:26][train:194][INFO] Running validation...
128
+ [2025-10-17 15:42:15][logger:171][INFO] [step: 964689920] [val/train_token_count: 964689920] [val/train_batch_count: 460] [val/train_flop_count: 0] [val/train_total_time: 8042.599] [val/train_update_time: 4297.832] [val/loss: 3.585] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 169.110] [val/val_tokens_per_second: 242208.947] [val/loss_avg_len_2048: 3.585] [val/perplexity_len_2048: 36.038] [val/loss_avg_len_1024: 3.656] [val/perplexity_len_1024: 38.693] [val/loss_avg_len_512: 3.753] [val/perplexity_len_512: 42.667]
129
+ [2025-10-17 15:43:48][utils:57][INFO] [P: 47.00%] [S: 985661440/2097152000] [T: 2:18:25] [ETA: 2:36:05] [loss: 3.553] [tokens/s: 116819.368] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000]
130
+ [2025-10-17 15:45:22][utils:57][INFO] [P: 48.00%] [S: 1006632960/2097152000] [T: 2:19:58] [ETA: 2:31:38] [loss: 3.564] [tokens/s: 129099.142] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000]
131
+ [2025-10-17 15:45:22][train:194][INFO] Running validation...
132
+ [2025-10-17 15:48:12][logger:171][INFO] [step: 1006632960] [val/train_token_count: 1006632960] [val/train_batch_count: 480] [val/train_flop_count: 0] [val/train_total_time: 8398.813] [val/train_update_time: 4484.626] [val/loss: 3.566] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 170.013] [val/val_tokens_per_second: 240922.848] [val/loss_avg_len_2048: 3.566] [val/perplexity_len_2048: 35.358] [val/loss_avg_len_1024: 3.637] [val/perplexity_len_1024: 37.974] [val/loss_avg_len_512: 3.736] [val/perplexity_len_512: 41.931]
133
+ [2025-10-17 15:49:45][utils:57][INFO] [P: 49.00%] [S: 1027604480/2097152000] [T: 2:24:22] [ETA: 2:30:15] [loss: 3.555] [tokens/s: 116751.372] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000]
134
+ [2025-10-17 15:51:19][utils:57][INFO] [P: 50.00%] [S: 1048576000/2097152000] [T: 2:25:55] [ETA: 2:25:55] [loss: 3.536] [tokens/s: 129215.693] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000]
135
+ [2025-10-17 15:51:19][logger:171][INFO] [step: 1048576000] [train_eval/train_token_count: 1048576000] [train_eval/train_batch_count: 500] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 8755.968] [train_eval/train_update_time: 4671.454] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.567] [train_eval/perplexity_len_2048: 35.405] [train_eval/loss_avg_len_1024: 3.633] [train_eval/perplexity_len_1024: 37.841] [train_eval/loss_avg_len_512: 3.731] [train_eval/perplexity_len_512: 41.730]
136
+ [2025-10-17 15:51:19][train:194][INFO] Running validation...
137
+ [2025-10-17 15:54:09][logger:171][INFO] [step: 1048576000] [val/train_token_count: 1048576000] [val/train_batch_count: 500] [val/train_flop_count: 0] [val/train_total_time: 8755.968] [val/train_update_time: 4671.454] [val/loss: 3.541] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 170.222] [val/val_tokens_per_second: 240627.488] [val/loss_avg_len_2048: 3.541] [val/perplexity_len_2048: 34.498] [val/loss_avg_len_1024: 3.612] [val/perplexity_len_1024: 37.051] [val/loss_avg_len_512: 3.711] [val/perplexity_len_512: 40.912]
138
+ [2025-10-17 15:54:09][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000001048576000.pt...
139
+ [2025-10-17 15:54:11][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000001048576000.pt.
140
+ [2025-10-17 15:54:11][logger:171][INFO] [step: 1048576000] [checkpoint/checkpoint_time: 1.610]
141
+ [2025-10-17 15:55:44][utils:57][INFO] [P: 51.00%] [S: 1069547520/2097152000] [T: 2:30:21] [ETA: 2:24:27] [loss: 3.505] [tokens/s: 116730.679] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000]
142
+ [2025-10-17 15:57:18][utils:57][INFO] [P: 52.00%] [S: 1090519040/2097152000] [T: 2:31:54] [ETA: 2:20:13] [loss: 3.520] [tokens/s: 129159.943] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000]
143
+ [2025-10-17 15:57:18][train:194][INFO] Running validation...
144
+ [2025-10-17 16:00:08][logger:171][INFO] [step: 1090519040] [val/train_token_count: 1090519040] [val/train_batch_count: 520] [val/train_flop_count: 0] [val/train_total_time: 9114.626] [val/train_update_time: 4857.963] [val/loss: 3.524] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 170.630] [val/val_tokens_per_second: 240051.300] [val/loss_avg_len_2048: 3.524] [val/perplexity_len_2048: 33.919] [val/loss_avg_len_1024: 3.595] [val/perplexity_len_1024: 36.430] [val/loss_avg_len_512: 3.695] [val/perplexity_len_512: 40.238]
145
+ [2025-10-17 16:02:02][utils:57][INFO] [P: 53.00%] [S: 1111490560/2097152000] [T: 2:36:38] [ETA: 2:18:54] [loss: 3.498] [tokens/s: 115509.315] [batches/s: 0.055] [MFU: 0.000] [TFLOPS: 0.000]
146
+ [2025-10-17 16:05:29][utils:57][INFO] [P: 54.00%] [S: 1132462080/2097152000] [T: 2:40:06] [ETA: 2:16:22] [loss: 3.512] [tokens/s: 119216.261] [batches/s: 0.057] [MFU: 0.000] [TFLOPS: 0.000]
147
+ [2025-10-17 16:05:29][train:194][INFO] Running validation...
148
+ [2025-10-17 16:09:02][logger:171][INFO] [step: 1132462080] [val/train_token_count: 1132462080] [val/train_batch_count: 540] [val/train_flop_count: 0] [val/train_total_time: 9606.105] [val/train_update_time: 5178.119] [val/loss: 3.503] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 213.205] [val/val_tokens_per_second: 192115.546] [val/loss_avg_len_2048: 3.503] [val/perplexity_len_2048: 33.229] [val/loss_avg_len_1024: 3.576] [val/perplexity_len_1024: 35.717] [val/loss_avg_len_512: 3.676] [val/perplexity_len_512: 39.474]
149
+ [2025-10-17 16:12:07][utils:57][INFO] [P: 55.00%] [S: 1153433600/2097152000] [T: 2:46:43] [ETA: 2:16:24] [loss: 3.468] [tokens/s: 101518.231] [batches/s: 0.048] [MFU: 0.000] [TFLOPS: 0.000]
150
+ [2025-10-17 16:12:07][logger:171][INFO] [step: 1153433600] [train_eval/train_token_count: 1153433600] [train_eval/train_batch_count: 550] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 10003.422] [train_eval/train_update_time: 5362.020] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.510] [train_eval/perplexity_len_2048: 33.445] [train_eval/loss_avg_len_1024: 3.577] [train_eval/perplexity_len_1024: 35.773] [train_eval/loss_avg_len_512: 3.674] [train_eval/perplexity_len_512: 39.420]
151
+ [2025-10-17 16:14:34][utils:57][INFO] [P: 56.00%] [S: 1174405120/2097152000] [T: 2:49:10] [ETA: 2:12:55] [loss: 3.506] [tokens/s: 107603.473] [batches/s: 0.051] [MFU: 0.000] [TFLOPS: 0.000]
152
+ [2025-10-17 16:14:34][train:194][INFO] Running validation...
153
+ [2025-10-17 16:18:40][logger:171][INFO] [step: 1174405120] [val/train_token_count: 1174405120] [val/train_batch_count: 560] [val/train_flop_count: 0] [val/train_total_time: 10150.525] [val/train_update_time: 5508.942] [val/loss: 3.486] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 246.225] [val/val_tokens_per_second: 166351.655] [val/loss_avg_len_2048: 3.486] [val/perplexity_len_2048: 32.655] [val/loss_avg_len_1024: 3.559] [val/perplexity_len_1024: 35.114] [val/loss_avg_len_512: 3.659] [val/perplexity_len_512: 38.821]
154
+ [2025-10-17 16:21:30][utils:57][INFO] [P: 57.00%] [S: 1195376640/2097152000] [T: 2:56:06] [ETA: 2:12:50] [loss: 3.430] [tokens/s: 92210.553] [batches/s: 0.044] [MFU: 0.000] [TFLOPS: 0.000]
155
+ [2025-10-17 16:24:44][utils:57][INFO] [P: 58.00%] [S: 1216348160/2097152000] [T: 2:59:20] [ETA: 2:09:52] [loss: 3.490] [tokens/s: 95132.421] [batches/s: 0.045] [MFU: 0.000] [TFLOPS: 0.000]
156
+ [2025-10-17 16:24:44][train:194][INFO] Running validation...
157
+ [2025-10-17 16:28:32][logger:171][INFO] [step: 1216348160] [val/train_token_count: 1216348160] [val/train_batch_count: 580] [val/train_flop_count: 0] [val/train_total_time: 10760.647] [val/train_update_time: 5872.135] [val/loss: 3.472] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 227.817] [val/val_tokens_per_second: 179793.158] [val/loss_avg_len_2048: 3.472] [val/perplexity_len_2048: 32.191] [val/loss_avg_len_1024: 3.544] [val/perplexity_len_1024: 34.622] [val/loss_avg_len_512: 3.645] [val/perplexity_len_512: 38.292]
158
+ [2025-10-17 16:31:19][utils:57][INFO] [P: 59.00%] [S: 1237319680/2097152000] [T: 3:05:56] [ETA: 2:09:12] [loss: 3.506] [tokens/s: 83562.533] [batches/s: 0.040] [MFU: 0.000] [TFLOPS: 0.000]
159
+ [2025-10-17 16:33:54][utils:57][INFO] [P: 60.00%] [S: 1258291200/2097152000] [T: 3:08:30] [ETA: 2:05:40] [loss: 3.509] [tokens/s: 87483.728] [batches/s: 0.042] [MFU: 0.000] [TFLOPS: 0.000]
160
+ [2025-10-17 16:33:54][logger:171][INFO] [step: 1258291200] [train_eval/train_token_count: 1258291200] [train_eval/train_batch_count: 600] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 11310.366] [train_eval/train_update_time: 6193.654] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.469] [train_eval/perplexity_len_2048: 32.111] [train_eval/loss_avg_len_1024: 3.535] [train_eval/perplexity_len_1024: 34.294] [train_eval/loss_avg_len_512: 3.635] [train_eval/perplexity_len_512: 37.889]
161
+ [2025-10-17 16:33:54][train:194][INFO] Running validation...
162
+ [2025-10-17 16:37:52][logger:171][INFO] [step: 1258291200] [val/train_token_count: 1258291200] [val/train_batch_count: 600] [val/train_flop_count: 0] [val/train_total_time: 11310.366] [val/train_update_time: 6193.654] [val/loss: 3.458] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 238.131] [val/val_tokens_per_second: 172006.003] [val/loss_avg_len_2048: 3.458] [val/perplexity_len_2048: 31.754] [val/loss_avg_len_1024: 3.531] [val/perplexity_len_1024: 34.160] [val/loss_avg_len_512: 3.632] [val/perplexity_len_512: 37.792]
163
+ [2025-10-17 16:37:52][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000001258291200.pt...
164
+ [2025-10-17 16:37:54][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000001258291200.pt.
165
+ [2025-10-17 16:37:54][logger:171][INFO] [step: 1258291200] [checkpoint/checkpoint_time: 1.537]
166
+ [2025-10-17 16:41:00][utils:57][INFO] [P: 61.00%] [S: 1279262720/2097152000] [T: 3:15:36] [ETA: 2:05:03] [loss: 3.467] [tokens/s: 76720.992] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000]
167
+ [2025-10-17 16:43:57][utils:57][INFO] [P: 62.00%] [S: 1300234240/2097152000] [T: 3:18:34] [ETA: 2:01:42] [loss: 3.441] [tokens/s: 79259.061] [batches/s: 0.038] [MFU: 0.000] [TFLOPS: 0.000]
168
+ [2025-10-17 16:43:57][train:194][INFO] Running validation...
169
+ [2025-10-17 16:48:04][logger:171][INFO] [step: 1300234240] [val/train_token_count: 1300234240] [val/train_batch_count: 620] [val/train_flop_count: 0] [val/train_total_time: 11914.081] [val/train_update_time: 6556.691] [val/loss: 3.444] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 246.788] [val/val_tokens_per_second: 165972.114] [val/loss_avg_len_2048: 3.444] [val/perplexity_len_2048: 31.324] [val/loss_avg_len_1024: 3.517] [val/perplexity_len_1024: 33.696] [val/loss_avg_len_512: 3.619] [val/perplexity_len_512: 37.285]
170
+ [2025-10-17 16:50:31][utils:57][INFO] [P: 63.00%] [S: 1321205760/2097152000] [T: 3:25:08] [ETA: 2:00:28] [loss: 3.455] [tokens/s: 71931.548] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000]
171
+ [2025-10-17 16:53:27][utils:57][INFO] [P: 64.00%] [S: 1342177280/2097152000] [T: 3:28:03] [ETA: 1:57:02] [loss: 3.447] [tokens/s: 78608.237] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000]
172
+ [2025-10-17 16:53:27][train:194][INFO] Running validation...
173
+ [2025-10-17 16:57:08][logger:171][INFO] [step: 1342177280] [val/train_token_count: 1342177280] [val/train_batch_count: 640] [val/train_flop_count: 0] [val/train_total_time: 12483.778] [val/train_update_time: 6879.229] [val/loss: 3.434] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 220.318] [val/val_tokens_per_second: 185912.978] [val/loss_avg_len_2048: 3.434] [val/perplexity_len_2048: 30.985] [val/loss_avg_len_1024: 3.507] [val/perplexity_len_1024: 33.349] [val/loss_avg_len_512: 3.608] [val/perplexity_len_512: 36.904]
174
+ [2025-10-17 17:00:29][utils:57][INFO] [P: 65.00%] [S: 1363148800/2097152000] [T: 3:35:05] [ETA: 1:55:49] [loss: 3.410] [tokens/s: 71902.990] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000]
175
+ [2025-10-17 17:00:29][logger:171][INFO] [step: 1363148800] [train_eval/train_token_count: 1363148800] [train_eval/train_batch_count: 650] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 12905.641] [train_eval/train_update_time: 7080.244] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.438] [train_eval/perplexity_len_2048: 31.119] [train_eval/loss_avg_len_1024: 3.509] [train_eval/perplexity_len_1024: 33.403] [train_eval/loss_avg_len_512: 3.608] [train_eval/perplexity_len_512: 36.877]
176
+ [2025-10-17 17:03:12][utils:57][INFO] [P: 66.00%] [S: 1384120320/2097152000] [T: 3:37:48] [ETA: 1:52:12] [loss: 3.453] [tokens/s: 78132.660] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000]
177
+ [2025-10-17 17:03:12][train:194][INFO] Running validation...
178
+ [2025-10-17 17:07:17][logger:171][INFO] [step: 1384120320] [val/train_token_count: 1384120320] [val/train_batch_count: 660] [val/train_flop_count: 0] [val/train_total_time: 13068.739] [val/train_update_time: 7242.851] [val/loss: 3.424] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 245.131] [val/val_tokens_per_second: 167094.665] [val/loss_avg_len_2048: 3.424] [val/perplexity_len_2048: 30.689] [val/loss_avg_len_1024: 3.498] [val/perplexity_len_1024: 33.044] [val/loss_avg_len_512: 3.599] [val/perplexity_len_512: 36.570]
179
+ [2025-10-17 17:09:44][utils:57][INFO] [P: 67.00%] [S: 1405091840/2097152000] [T: 3:44:21] [ETA: 1:50:30] [loss: 3.382] [tokens/s: 72309.301] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000]
180
+ [2025-10-17 17:12:55][utils:57][INFO] [P: 68.00%] [S: 1426063360/2097152000] [T: 3:47:31] [ETA: 1:47:04] [loss: 3.413] [tokens/s: 78647.123] [batches/s: 0.038] [MFU: 0.000] [TFLOPS: 0.000]
181
+ [2025-10-17 17:12:55][train:194][INFO] Running validation...
182
+ [2025-10-17 17:16:21][logger:171][INFO] [step: 1426063360] [val/train_token_count: 1426063360] [val/train_batch_count: 680] [val/train_flop_count: 0] [val/train_total_time: 13651.783] [val/train_update_time: 7580.372] [val/loss: 3.414] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 205.477] [val/val_tokens_per_second: 199341.303] [val/loss_avg_len_2048: 3.414] [val/perplexity_len_2048: 30.372] [val/loss_avg_len_1024: 3.487] [val/perplexity_len_1024: 32.694] [val/loss_avg_len_512: 3.589] [val/perplexity_len_512: 36.202]
183
+ [2025-10-17 17:19:55][utils:57][INFO] [P: 69.00%] [S: 1447034880/2097152000] [T: 3:54:31] [ETA: 1:45:21] [loss: 3.416] [tokens/s: 71588.559] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000]
184
+ [2025-10-17 17:22:25][utils:57][INFO] [P: 70.00%] [S: 1468006400/2097152000] [T: 3:57:01] [ETA: 1:41:34] [loss: 3.415] [tokens/s: 78156.638] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000]
185
+ [2025-10-17 17:22:25][logger:171][INFO] [step: 1468006400] [train_eval/train_token_count: 1468006400] [train_eval/train_batch_count: 700] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 14221.495] [train_eval/train_update_time: 7943.600] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.410] [train_eval/perplexity_len_2048: 30.263] [train_eval/loss_avg_len_1024: 3.481] [train_eval/perplexity_len_1024: 32.496] [train_eval/loss_avg_len_512: 3.582] [train_eval/perplexity_len_512: 35.944]
186
+ [2025-10-17 17:22:25][train:194][INFO] Running validation...
187
+ [2025-10-17 17:26:31][logger:171][INFO] [step: 1468006400] [val/train_token_count: 1468006400] [val/train_batch_count: 700] [val/train_flop_count: 0] [val/train_total_time: 14221.495] [val/train_update_time: 7943.600] [val/loss: 3.405] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 246.159] [val/val_tokens_per_second: 166396.303] [val/loss_avg_len_2048: 3.405] [val/perplexity_len_2048: 30.103] [val/loss_avg_len_1024: 3.478] [val/perplexity_len_1024: 32.408] [val/loss_avg_len_512: 3.581] [val/perplexity_len_512: 35.893]
188
+ [2025-10-17 17:26:31][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000001468006400.pt...
189
+ [2025-10-17 17:26:32][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000001468006400.pt.
190
+ [2025-10-17 17:26:32][logger:171][INFO] [step: 1468006400] [checkpoint/checkpoint_time: 1.549]
191
+ [2025-10-17 17:28:59][utils:57][INFO] [P: 71.00%] [S: 1488977920/2097152000] [T: 4:03:36] [ETA: 1:39:29] [loss: 3.425] [tokens/s: 72699.850] [batches/s: 0.035] [MFU: 0.000] [TFLOPS: 0.000]
192
+ [2025-10-17 17:32:25][utils:57][INFO] [P: 72.00%] [S: 1509949440/2097152000] [T: 4:07:01] [ETA: 1:36:03] [loss: 3.394] [tokens/s: 78465.256] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000]
193
+ [2025-10-17 17:32:25][train:194][INFO] Running validation...
194
+ [2025-10-17 17:35:39][logger:171][INFO] [step: 1509949440] [val/train_token_count: 1509949440] [val/train_batch_count: 720] [val/train_flop_count: 0] [val/train_total_time: 14821.624] [val/train_update_time: 8295.639] [val/loss: 3.396] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 194.166] [val/val_tokens_per_second: 210954.038] [val/loss_avg_len_2048: 3.396] [val/perplexity_len_2048: 29.849] [val/loss_avg_len_1024: 3.470] [val/perplexity_len_1024: 32.135] [val/loss_avg_len_512: 3.572] [val/perplexity_len_512: 35.591]
195
+ [2025-10-17 17:39:16][utils:57][INFO] [P: 73.00%] [S: 1530920960/2097152000] [T: 4:13:52] [ETA: 1:33:54] [loss: 3.416] [tokens/s: 71341.749] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000]
196
+ [2025-10-17 17:41:43][utils:57][INFO] [P: 74.00%] [S: 1551892480/2097152000] [T: 4:16:20] [ETA: 1:30:03] [loss: 3.377] [tokens/s: 78020.639] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000]
197
+ [2025-10-17 17:41:43][train:194][INFO] Running validation...
198
+ [2025-10-17 17:45:50][logger:171][INFO] [step: 1551892480] [val/train_token_count: 1551892480] [val/train_batch_count: 740] [val/train_flop_count: 0] [val/train_total_time: 15380.239] [val/train_update_time: 8659.388] [val/loss: 3.390] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 246.372] [val/val_tokens_per_second: 166252.603] [val/loss_avg_len_2048: 3.390] [val/perplexity_len_2048: 29.654] [val/loss_avg_len_1024: 3.464] [val/perplexity_len_1024: 31.933] [val/loss_avg_len_512: 3.566] [val/perplexity_len_512: 35.362]
199
+ [2025-10-17 17:48:17][utils:57][INFO] [P: 75.00%] [S: 1572864000/2097152000] [T: 4:22:53] [ETA: 1:27:37] [loss: 3.388] [tokens/s: 72991.067] [batches/s: 0.035] [MFU: 0.000] [TFLOPS: 0.000]
200
+ [2025-10-17 17:48:17][logger:171][INFO] [step: 1572864000] [train_eval/train_token_count: 1572864000] [train_eval/train_batch_count: 750] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 15773.814] [train_eval/train_update_time: 8806.402] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.390] [train_eval/perplexity_len_2048: 29.661] [train_eval/loss_avg_len_1024: 3.460] [train_eval/perplexity_len_1024: 31.831] [train_eval/loss_avg_len_512: 3.562] [train_eval/perplexity_len_512: 35.234]
201
+ [2025-10-17 17:51:52][utils:57][INFO] [P: 76.00%] [S: 1593835520/2097152000] [T: 4:26:28] [ETA: 1:24:08] [loss: 3.337] [tokens/s: 78063.028] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000]
202
+ [2025-10-17 17:51:52][train:194][INFO] Running validation...
203
+ [2025-10-17 17:54:59][logger:171][INFO] [step: 1593835520] [val/train_token_count: 1593835520] [val/train_batch_count: 760] [val/train_flop_count: 0] [val/train_total_time: 15988.253] [val/train_update_time: 9020.619] [val/loss: 3.383] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 186.935] [val/val_tokens_per_second: 219113.538] [val/loss_avg_len_2048: 3.383] [val/perplexity_len_2048: 29.460] [val/loss_avg_len_1024: 3.457] [val/perplexity_len_1024: 31.724] [val/loss_avg_len_512: 3.559] [val/perplexity_len_512: 35.144]
204
+ [2025-10-17 17:58:35][utils:57][INFO] [P: 77.00%] [S: 1614807040/2097152000] [T: 4:33:12] [ETA: 1:21:36] [loss: 3.408] [tokens/s: 71192.916] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000]
205
+ [2025-10-17 18:01:03][utils:57][INFO] [P: 78.00%] [S: 1635778560/2097152000] [T: 4:35:39] [ETA: 1:17:44] [loss: 3.354] [tokens/s: 77841.964] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000]
206
+ [2025-10-17 18:01:03][train:194][INFO] Running validation...
207
+ [2025-10-17 18:05:09][logger:171][INFO] [step: 1635778560] [val/train_token_count: 1635778560] [val/train_batch_count: 780] [val/train_flop_count: 0] [val/train_total_time: 16539.340] [val/train_update_time: 9384.053] [val/loss: 3.378] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 246.441] [val/val_tokens_per_second: 166206.356] [val/loss_avg_len_2048: 3.378] [val/perplexity_len_2048: 29.302] [val/loss_avg_len_1024: 3.452] [val/perplexity_len_1024: 31.559] [val/loss_avg_len_512: 3.554] [val/perplexity_len_512: 34.959]
208
+ [2025-10-17 18:07:36][utils:57][INFO] [P: 79.00%] [S: 1656750080/2097152000] [T: 4:42:12] [ETA: 1:15:01] [loss: 3.369] [tokens/s: 73013.636] [batches/s: 0.035] [MFU: 0.000] [TFLOPS: 0.000]
209
+ [2025-10-17 18:11:13][utils:57][INFO] [P: 80.00%] [S: 1677721600/2097152000] [T: 4:45:49] [ETA: 1:11:27] [loss: 3.356] [tokens/s: 77890.611] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000]
210
+ [2025-10-17 18:11:13][logger:171][INFO] [step: 1677721600] [train_eval/train_token_count: 1677721600] [train_eval/train_batch_count: 800] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 17149.438] [train_eval/train_update_time: 9747.288] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.374] [train_eval/perplexity_len_2048: 29.198] [train_eval/loss_avg_len_1024: 3.447] [train_eval/perplexity_len_1024: 31.396] [train_eval/loss_avg_len_512: 3.547] [train_eval/perplexity_len_512: 34.724]
211
+ [2025-10-17 18:11:13][train:194][INFO] Running validation...
212
+ [2025-10-17 18:14:24][logger:171][INFO] [step: 1677721600] [val/train_token_count: 1677721600] [val/train_batch_count: 800] [val/train_flop_count: 0] [val/train_total_time: 17149.438] [val/train_update_time: 9747.288] [val/loss: 3.372] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 191.129] [val/val_tokens_per_second: 214305.318] [val/loss_avg_len_2048: 3.372] [val/perplexity_len_2048: 29.146] [val/loss_avg_len_1024: 3.446] [val/perplexity_len_1024: 31.388] [val/loss_avg_len_512: 3.549] [val/perplexity_len_512: 34.773]
213
+ [2025-10-17 18:14:24][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000001677721600.pt...
214
+ [2025-10-17 18:14:25][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000001677721600.pt.
215
+ [2025-10-17 18:14:25][logger:171][INFO] [step: 1677721600] [checkpoint/checkpoint_time: 1.572]
216
+ [2025-10-17 18:17:52][utils:57][INFO] [P: 81.00%] [S: 1698693120/2097152000] [T: 4:52:29] [ETA: 1:08:36] [loss: 3.336] [tokens/s: 71141.408] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000]
217
+ [2025-10-17 18:20:20][utils:57][INFO] [P: 82.00%] [S: 1719664640/2097152000] [T: 4:54:56] [ETA: 1:04:44] [loss: 3.345] [tokens/s: 77883.412] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000]
218
+ [2025-10-17 18:20:20][train:194][INFO] Running validation...
219
+ [2025-10-17 18:24:26][logger:171][INFO] [step: 1719664640] [val/train_token_count: 1719664640] [val/train_batch_count: 820] [val/train_flop_count: 0] [val/train_total_time: 17696.359] [val/train_update_time: 10101.101] [val/loss: 3.368] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 246.331] [val/val_tokens_per_second: 166280.514] [val/loss_avg_len_2048: 3.368] [val/perplexity_len_2048: 29.029] [val/loss_avg_len_1024: 3.442] [val/perplexity_len_1024: 31.262] [val/loss_avg_len_512: 3.545] [val/perplexity_len_512: 34.640]
220
+ [2025-10-17 18:26:53][utils:57][INFO] [P: 83.00%] [S: 1740636160/2097152000] [T: 5:01:29] [ETA: 1:01:45] [loss: 3.380] [tokens/s: 73049.408] [batches/s: 0.035] [MFU: 0.000] [TFLOPS: 0.000]
221
+ [2025-10-17 18:30:30][utils:57][INFO] [P: 84.00%] [S: 1761607680/2097152000] [T: 5:05:06] [ETA: 0:58:07] [loss: 3.310] [tokens/s: 77887.981] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000]
222
+ [2025-10-17 18:30:30][train:194][INFO] Running validation...
223
+ [2025-10-17 18:33:52][logger:171][INFO] [step: 1761607680] [val/train_token_count: 1761607680] [val/train_batch_count: 840] [val/train_flop_count: 0] [val/train_total_time: 18306.999] [val/train_update_time: 10465.017] [val/loss: 3.365] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 201.660] [val/val_tokens_per_second: 203114.557] [val/loss_avg_len_2048: 3.365] [val/perplexity_len_2048: 28.921] [val/loss_avg_len_1024: 3.439] [val/perplexity_len_1024: 31.146] [val/loss_avg_len_512: 3.541] [val/perplexity_len_512: 34.510]
224
+ [2025-10-17 18:37:09][utils:57][INFO] [P: 85.00%] [S: 1782579200/2097152000] [T: 5:11:46] [ETA: 0:55:01] [loss: 3.388] [tokens/s: 71159.479] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000]
225
+ [2025-10-17 18:37:09][logger:171][INFO] [step: 1782579200] [train_eval/train_token_count: 1782579200] [train_eval/train_batch_count: 850] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 18706.207] [train_eval/train_update_time: 10662.364] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.357] [train_eval/perplexity_len_2048: 28.697] [train_eval/loss_avg_len_1024: 3.422] [train_eval/perplexity_len_1024: 30.643] [train_eval/loss_avg_len_512: 3.524] [train_eval/perplexity_len_512: 33.909]
226
+ [2025-10-17 18:39:37][utils:57][INFO] [P: 86.00%] [S: 1803550720/2097152000] [T: 5:14:13] [ETA: 0:51:09] [loss: 3.372] [tokens/s: 78216.845] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000]
227
+ [2025-10-17 18:39:37][train:194][INFO] Running validation...
228
+ [2025-10-17 18:43:41][logger:171][INFO] [step: 1803550720] [val/train_token_count: 1803550720] [val/train_batch_count: 860] [val/train_flop_count: 0] [val/train_total_time: 18853.344] [val/train_update_time: 10809.319] [val/loss: 3.361] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 244.710] [val/val_tokens_per_second: 167382.117] [val/loss_avg_len_2048: 3.361] [val/perplexity_len_2048: 28.831] [val/loss_avg_len_1024: 3.436] [val/perplexity_len_1024: 31.054] [val/loss_avg_len_512: 3.538] [val/perplexity_len_512: 34.410]
229
+ [2025-10-17 18:46:18][utils:57][INFO] [P: 87.00%] [S: 1824522240/2097152000] [T: 5:20:54] [ETA: 0:47:57] [loss: 3.314] [tokens/s: 72916.286] [batches/s: 0.035] [MFU: 0.000] [TFLOPS: 0.000]
230
+ [2025-10-17 18:49:45][utils:57][INFO] [P: 88.00%] [S: 1845493760/2097152000] [T: 5:24:22] [ETA: 0:44:13] [loss: 3.329] [tokens/s: 78002.026] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000]
231
+ [2025-10-17 18:49:45][train:194][INFO] Running validation...
232
+ [2025-10-17 18:53:17][logger:171][INFO] [step: 1845493760] [val/train_token_count: 1845493760] [val/train_batch_count: 880] [val/train_flop_count: 0] [val/train_total_time: 19462.212] [val/train_update_time: 11172.773] [val/loss: 3.359] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 212.012] [val/val_tokens_per_second: 193196.951] [val/loss_avg_len_2048: 3.359] [val/perplexity_len_2048: 28.760] [val/loss_avg_len_1024: 3.433] [val/perplexity_len_1024: 30.976] [val/loss_avg_len_512: 3.536] [val/perplexity_len_512: 34.325]
233
+ [2025-10-17 18:56:21][utils:57][INFO] [P: 89.00%] [S: 1866465280/2097152000] [T: 5:30:58] [ETA: 0:40:54] [loss: 3.398] [tokens/s: 71332.528] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000]
234
+ [2025-10-17 18:58:48][utils:57][INFO] [P: 90.00%] [S: 1887436800/2097152000] [T: 5:33:24] [ETA: 0:37:02] [loss: 3.310] [tokens/s: 78663.114] [batches/s: 0.038] [MFU: 0.000] [TFLOPS: 0.000]
235
+ [2025-10-17 18:58:48][logger:171][INFO] [step: 1887436800] [train_eval/train_token_count: 1887436800] [train_eval/train_batch_count: 900] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 20004.831] [train_eval/train_update_time: 11502.972] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.352] [train_eval/perplexity_len_2048: 28.574] [train_eval/loss_avg_len_1024: 3.421] [train_eval/perplexity_len_1024: 30.612] [train_eval/loss_avg_len_512: 3.523] [train_eval/perplexity_len_512: 33.884]
236
+ [2025-10-17 18:58:48][train:194][INFO] Running validation...
237
+ [2025-10-17 19:02:54][logger:171][INFO] [step: 1887436800] [val/train_token_count: 1887436800] [val/train_batch_count: 900] [val/train_flop_count: 0] [val/train_total_time: 20004.831] [val/train_update_time: 11502.972] [val/loss: 3.357] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 246.356] [val/val_tokens_per_second: 166263.403] [val/loss_avg_len_2048: 3.357] [val/perplexity_len_2048: 28.702] [val/loss_avg_len_1024: 3.431] [val/perplexity_len_1024: 30.916] [val/loss_avg_len_512: 3.534] [val/perplexity_len_512: 34.260]
238
+ [2025-10-17 19:02:54][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000001887436800.pt...
239
+ [2025-10-17 19:02:56][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000001887436800.pt.
240
+ [2025-10-17 19:02:56][logger:171][INFO] [step: 1887436800] [checkpoint/checkpoint_time: 1.561]
241
+ [2025-10-17 19:05:47][utils:57][INFO] [P: 91.00%] [S: 1908408320/2097152000] [T: 5:40:24] [ETA: 0:33:39] [loss: 3.360] [tokens/s: 72590.713] [batches/s: 0.035] [MFU: 0.000] [TFLOPS: 0.000]
242
+ [2025-10-17 19:09:00][utils:57][INFO] [P: 92.00%] [S: 1929379840/2097152000] [T: 5:43:36] [ETA: 0:29:52] [loss: 3.372] [tokens/s: 78076.895] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000]
243
+ [2025-10-17 19:09:00][train:194][INFO] Running validation...
244
+ [2025-10-17 19:12:48][logger:171][INFO] [step: 1929379840] [val/train_token_count: 1929379840] [val/train_batch_count: 920] [val/train_flop_count: 0] [val/train_total_time: 20616.579] [val/train_update_time: 11866.318] [val/loss: 3.355] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 228.491] [val/val_tokens_per_second: 179262.736] [val/loss_avg_len_2048: 3.355] [val/perplexity_len_2048: 28.659] [val/loss_avg_len_1024: 3.430] [val/perplexity_len_1024: 30.869] [val/loss_avg_len_512: 3.532] [val/perplexity_len_512: 34.208]
245
+ [2025-10-17 19:15:34][utils:57][INFO] [P: 93.00%] [S: 1950351360/2097152000] [T: 5:50:10] [ETA: 0:26:21] [loss: 3.365] [tokens/s: 71631.164] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000]
246
+ [2025-10-17 19:18:10][utils:57][INFO] [P: 94.00%] [S: 1971322880/2097152000] [T: 5:52:46] [ETA: 0:22:31] [loss: 3.320] [tokens/s: 78793.199] [batches/s: 0.038] [MFU: 0.000] [TFLOPS: 0.000]
247
+ [2025-10-17 19:18:10][train:194][INFO] Running validation...
248
+ [2025-10-17 19:22:07][logger:171][INFO] [step: 1971322880] [val/train_token_count: 1971322880] [val/train_batch_count: 940] [val/train_flop_count: 0] [val/train_total_time: 21166.987] [val/train_update_time: 12187.850] [val/loss: 3.354] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 236.168] [val/val_tokens_per_second: 173436.197] [val/loss_avg_len_2048: 3.354] [val/perplexity_len_2048: 28.631] [val/loss_avg_len_1024: 3.429] [val/perplexity_len_1024: 30.840] [val/loss_avg_len_512: 3.531] [val/perplexity_len_512: 34.175]
249
+ [2025-10-17 19:25:13][utils:57][INFO] [P: 95.00%] [S: 1992294400/2097152000] [T: 5:59:49] [ETA: 0:18:56] [loss: 3.327] [tokens/s: 72375.858] [batches/s: 0.035] [MFU: 0.000] [TFLOPS: 0.000]
250
+ [2025-10-17 19:25:13][logger:171][INFO] [step: 1992294400] [train_eval/train_token_count: 1992294400] [train_eval/train_batch_count: 950] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 21589.576] [train_eval/train_update_time: 12373.759] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.342] [train_eval/perplexity_len_2048: 28.287] [train_eval/loss_avg_len_1024: 3.414] [train_eval/perplexity_len_1024: 30.397] [train_eval/loss_avg_len_512: 3.514] [train_eval/perplexity_len_512: 33.585]
251
+ [2025-10-17 19:28:11][utils:57][INFO] [P: 96.00%] [S: 2013265920/2097152000] [T: 6:02:47] [ETA: 0:15:06] [loss: 3.331] [tokens/s: 78202.226] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000]
252
+ [2025-10-17 19:28:11][train:194][INFO] Running validation...
253
+ [2025-10-17 19:32:16][logger:171][INFO] [step: 2013265920] [val/train_token_count: 2013265920] [val/train_batch_count: 960] [val/train_flop_count: 0] [val/train_total_time: 21767.690] [val/train_update_time: 12551.348] [val/loss: 3.354] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 244.702] [val/val_tokens_per_second: 167386.972] [val/loss_avg_len_2048: 3.354] [val/perplexity_len_2048: 28.614] [val/loss_avg_len_1024: 3.428] [val/perplexity_len_1024: 30.821] [val/loss_avg_len_512: 3.531] [val/perplexity_len_512: 34.155]
254
+ [2025-10-17 19:34:43][utils:57][INFO] [P: 97.00%] [S: 2034237440/2097152000] [T: 6:09:19] [ETA: 0:11:25] [loss: 3.359] [tokens/s: 72048.637] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000]
255
+ [2025-10-17 19:37:39][utils:57][INFO] [P: 98.00%] [S: 2055208960/2097152000] [T: 6:12:15] [ETA: 0:07:35] [loss: 3.334] [tokens/s: 78695.239] [batches/s: 0.038] [MFU: 0.000] [TFLOPS: 0.000]
256
+ [2025-10-17 19:37:39][train:194][INFO] Running validation...
257
+ [2025-10-17 19:41:19][logger:171][INFO] [step: 2055208960] [val/train_token_count: 2055208960] [val/train_batch_count: 980] [val/train_flop_count: 0] [val/train_total_time: 22335.749] [val/train_update_time: 12874.330] [val/loss: 3.354] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 220.122] [val/val_tokens_per_second: 186078.880] [val/loss_avg_len_2048: 3.354] [val/perplexity_len_2048: 28.606] [val/loss_avg_len_1024: 3.428] [val/perplexity_len_1024: 30.812] [val/loss_avg_len_512: 3.531] [val/perplexity_len_512: 34.145]
258
+ [2025-10-17 19:41:19][train:854][INFO] Training finished with 2055208960 tokens!
metrics/jsonlines/checkpoint.jsonl ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {"step": 209715200, "checkpoint/checkpoint_time": 1.5373147319769487}
2
+ {"step": 419430400, "checkpoint/checkpoint_time": 1.572724198922515}
3
+ {"step": 629145600, "checkpoint/checkpoint_time": 1.5360865560360253}
4
+ {"step": 838860800, "checkpoint/checkpoint_time": 1.5895927330711856}
5
+ {"step": 1048576000, "checkpoint/checkpoint_time": 1.6095300159649923}
6
+ {"step": 1258291200, "checkpoint/checkpoint_time": 1.5371675649657845}
7
+ {"step": 1468006400, "checkpoint/checkpoint_time": 1.5488120779627934}
8
+ {"step": 1677721600, "checkpoint/checkpoint_time": 1.5724514800822362}
9
+ {"step": 1887436800, "checkpoint/checkpoint_time": 1.5613262739498168}
metrics/jsonlines/model_info.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 0, "model_info/total_params": 71962160, "model_info/trainable_params": 71962160, "model_info/embedding_params": 25741824, "model_info/flops_per_token": 0, "model_info/non_embedding_params": 46220336}
metrics/jsonlines/norm.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
metrics/jsonlines/resume.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 0, "resume/resume_step": 0}
metrics/jsonlines/throughput.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
metrics/jsonlines/train.jsonl ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 97.20068372006062, "train/update_time": 96.98365108307917, "train/lr": 0.0009000000000000001, "train/loss": 8.76564884185791, "train/global_grad_norm": 1.4130078554153442}
2
+ {"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 190.58031271304935, "train/update_time": 190.21444558817893, "train/lr": 0.0009997960964140947, "train/loss": 7.474191188812256, "train/global_grad_norm": 0.8227528929710388}
3
+ {"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 454.2337091190275, "train/update_time": 283.44298765144777, "train/lr": 0.0009990914580222257, "train/loss": 7.089620590209961, "train/global_grad_norm": 0.43369758129119873}
4
+ {"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 547.6119900090853, "train/update_time": 376.67628215253353, "train/lr": 0.0009978842768382998, "train/loss": 6.745384216308594, "train/global_grad_norm": 0.30941468477249146}
5
+ {"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 811.2910252050497, "train/update_time": 469.82893142174, "train/lr": 0.0009961757683914405, "train/loss": 6.485077857971191, "train/global_grad_norm": 0.3194890022277832}
6
+ {"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 904.5979733880376, "train/update_time": 562.9825718456414, "train/lr": 0.00099396765300483, "train/loss": 6.212056636810303, "train/global_grad_norm": 0.42947375774383545}
7
+ {"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 1167.0270900100004, "train/update_time": 656.2661910566967, "train/lr": 0.0009912621540634887, "train/loss": 6.0160932540893555, "train/global_grad_norm": 0.40116411447525024}
8
+ {"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 1260.3716635780875, "train/update_time": 749.4701656188117, "train/lr": 0.000988061995775515, "train/loss": 5.953179359436035, "train/global_grad_norm": 1.0869561433792114}
9
+ {"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 1522.4740127840778, "train/update_time": 842.772364483797, "train/lr": 0.0009843704004290394, "train/loss": 5.676019191741943, "train/global_grad_norm": 0.280754029750824}
10
+ {"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 1615.8070381759899, "train/update_time": 935.9601778858341, "train/lr": 0.0009801910851476522, "train/loss": 5.544191837310791, "train/global_grad_norm": 0.46917492151260376}
11
+ {"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 1880.450380234979, "train/update_time": 1029.0689218619373, "train/lr": 0.0009755282581475768, "train/loss": 5.429840087890625, "train/global_grad_norm": 0.7274503111839294}
12
+ {"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 1973.9993880910333, "train/update_time": 1122.4686564019648, "train/lr": 0.0009703866145003512, "train/loss": 5.2810444831848145, "train/global_grad_norm": 0.2969799339771271}
13
+ {"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 2237.285542534082, "train/update_time": 1215.6973058142466, "train/lr": 0.0009647713314052896, "train/loss": 5.193550109863281, "train/global_grad_norm": 0.3795139491558075}
14
+ {"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 2330.883187837084, "train/update_time": 1309.140196379507, "train/lr": 0.0009586880629764817, "train/loss": 5.119898319244385, "train/global_grad_norm": 0.43129095435142517}
15
+ {"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 2593.152709970018, "train/update_time": 1402.5526540756691, "train/lr": 0.0009521429345495787, "train/loss": 4.974904537200928, "train/global_grad_norm": 0.3256843686103821}
16
+ {"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 2686.8860215520253, "train/update_time": 1496.1389014086453, "train/lr": 0.0009451425365140996, "train/loss": 4.9173712730407715, "train/global_grad_norm": 0.39894500374794006}
17
+ {"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 2949.368387905997, "train/update_time": 1589.4967352375388, "train/lr": 0.000937693917677468, "train/loss": 4.806858539581299, "train/global_grad_norm": 0.3854577839374542}
18
+ {"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 3042.959401597036, "train/update_time": 1682.9467192575103, "train/lr": 0.0009298045781674596, "train/loss": 4.758651256561279, "train/global_grad_norm": 0.3088115155696869}
19
+ {"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 3305.444267996005, "train/update_time": 1776.3336466513574, "train/lr": 0.0009214824618802108, "train/loss": 4.72238826751709, "train/global_grad_norm": 0.37908124923706055}
20
+ {"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 3399.0298356669955, "train/update_time": 1869.7792980262311, "train/lr": 0.000912735948481387, "train/loss": 4.6122002601623535, "train/global_grad_norm": 0.4956414997577667}
21
+ {"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 3663.3731433160137, "train/update_time": 1963.0694206270855, "train/lr": 0.0009035738449685707, "train/loss": 4.518575191497803, "train/global_grad_norm": 0.33904722332954407}
22
+ {"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 3756.8895379800815, "train/update_time": 2056.446317301248, "train/lr": 0.0008940053768033609, "train/loss": 4.449559211730957, "train/global_grad_norm": 0.5069319605827332}
23
+ {"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 4019.5361906640464, "train/update_time": 2149.77923363226, "train/lr": 0.0008840401786221159, "train/loss": 4.342103004455566, "train/global_grad_norm": 0.29293933510780334}
24
+ {"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 4112.964794773026, "train/update_time": 2243.0702600192744, "train/lr": 0.0008736882845346905, "train/loss": 4.284090518951416, "train/global_grad_norm": 0.36110353469848633}
25
+ {"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 4376.393655605032, "train/update_time": 2336.412492537289, "train/lr": 0.0008629601180209381, "train/loss": 4.201655387878418, "train/global_grad_norm": 0.3496641218662262}
26
+ {"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 4470.06449480704, "train/update_time": 2429.936827432248, "train/lr": 0.0008518664814351503, "train/loss": 4.142204761505127, "train/global_grad_norm": 0.39031898975372314}
27
+ {"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 4734.132506006979, "train/update_time": 2523.346581262187, "train/lr": 0.0008404185451290017, "train/loss": 4.099599838256836, "train/global_grad_norm": 0.37471985816955566}
28
+ {"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 4827.830736390082, "train/update_time": 2616.8884120163275, "train/lr": 0.0008286278362039527, "train/loss": 4.009082317352295, "train/global_grad_norm": 0.34484514594078064}
29
+ {"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 5090.937317650998, "train/update_time": 2710.3318849493517, "train/lr": 0.0008165062269044352, "train/loss": 3.97619891166687, "train/global_grad_norm": 0.40576431155204773}
30
+ {"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 5184.687426269054, "train/update_time": 2803.9371257049497, "train/lr": 0.0008040659226635089, "train/loss": 3.9234979152679443, "train/global_grad_norm": 0.44408538937568665}
31
+ {"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 5450.616348118056, "train/update_time": 2897.275741666206, "train/lr": 0.0007913194498130252, "train/loss": 3.9410295486450195, "train/global_grad_norm": 0.43244609236717224}
32
+ {"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 5543.975930526038, "train/update_time": 2990.483508925303, "train/lr": 0.000778279642970672, "train/loss": 3.8507468700408936, "train/global_grad_norm": 0.48972949385643005}
33
+ {"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 5806.160792221082, "train/update_time": 3083.611473838333, "train/lr": 0.0007649596321166025, "train/loss": 3.8703322410583496, "train/global_grad_norm": 0.3615987300872803}
34
+ {"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 5899.68232584605, "train/update_time": 3176.9836701630848, "train/lr": 0.0007513728293726579, "train/loss": 3.8107964992523193, "train/global_grad_norm": 0.36133530735969543}
35
+ {"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 6162.256411690032, "train/update_time": 3270.438694642042, "train/lr": 0.0007375329154974975, "train/loss": 3.7763936519622803, "train/global_grad_norm": 0.40389642119407654}
36
+ {"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 6255.94410856301, "train/update_time": 3363.9935755479382, "train/lr": 0.0007234538261112341, "train/loss": 3.7278685569763184, "train/global_grad_norm": 0.42608577013015747}
37
+ {"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 6518.670226996997, "train/update_time": 3457.366311661084, "train/lr": 0.0007091497376634464, "train/loss": 3.7271111011505127, "train/global_grad_norm": 0.27613717317581177}
38
+ {"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 6612.111398222041, "train/update_time": 3550.66807099001, "train/lr": 0.0006946350531586958, "train/loss": 3.699812412261963, "train/global_grad_norm": 0.3713103234767914}
39
+ {"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 6874.636505312985, "train/update_time": 3643.886076678871, "train/lr": 0.0006799243876539214, "train/loss": 3.6990554332733154, "train/global_grad_norm": 0.2907034456729889}
40
+ {"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 6968.171832425054, "train/update_time": 3737.2908036899753, "train/lr": 0.0006650325535423166, "train/loss": 3.581249475479126, "train/global_grad_norm": 0.4145627021789551}
41
+ {"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 7233.323964387993, "train/update_time": 3830.6057550550904, "train/lr": 0.0006499745456385053, "train/loss": 3.623004913330078, "train/global_grad_norm": 0.3520248532295227}
42
+ {"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 7326.922928820015, "train/update_time": 3924.0719714582665, "train/lr": 0.0006347655260800339, "train/loss": 3.6238791942596436, "train/global_grad_norm": 0.379281610250473}
43
+ {"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 7591.2802984340815, "train/update_time": 4017.3633328623837, "train/lr": 0.0006194208090603844, "train/loss": 3.62347412109375, "train/global_grad_norm": 0.36047083139419556}
44
+ {"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 7684.845341357053, "train/update_time": 4110.783207958448, "train/lr": 0.0006039558454088796, "train/loss": 3.6568057537078857, "train/global_grad_norm": 0.49488669633865356}
45
+ {"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 7948.907636292046, "train/update_time": 4204.296549194376, "train/lr": 0.0005883862070330078, "train/loss": 3.6046764850616455, "train/global_grad_norm": 0.3740111291408539}
46
+ {"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 8042.599143097992, "train/update_time": 4297.83159168635, "train/lr": 0.0005727275712388317, "train/loss": 3.572643280029297, "train/global_grad_norm": 0.3081468641757965}
47
+ {"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 8305.275031784084, "train/update_time": 4391.248921588529, "train/lr": 0.0005569957049452703, "train/loss": 3.553377389907837, "train/global_grad_norm": 0.33262893557548523}
48
+ {"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 8398.81305668701, "train/update_time": 4484.626386589487, "train/lr": 0.0005412064488081482, "train/loss": 3.564133644104004, "train/global_grad_norm": 0.4224238991737366}
49
+ {"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 8662.347046800074, "train/update_time": 4577.979232876445, "train/lr": 0.0005253757012699972, "train/loss": 3.5546345710754395, "train/global_grad_norm": 0.2766055762767792}
50
+ {"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 8755.968361919047, "train/update_time": 4671.454424570431, "train/lr": 0.0005095194025516734, "train/loss": 3.536339044570923, "train/global_grad_norm": 0.3986065685749054}
51
+ {"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 9021.309424316045, "train/update_time": 4764.801301084226, "train/lr": 0.0004936535186019053, "train/loss": 3.5047459602355957, "train/global_grad_norm": 0.244685560464859}
52
+ {"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 9114.625705051003, "train/update_time": 4857.963408218231, "train/lr": 0.00047779402502093696, "train/loss": 3.520270586013794, "train/global_grad_norm": 0.4062435030937195}
53
+ {"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 9398.054264329025, "train/update_time": 4970.6049944133265, "train/lr": 0.0004619568909744525, "train/loss": 3.4980483055114746, "train/global_grad_norm": 0.3901892602443695}
54
+ {"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 9606.105255214032, "train/update_time": 5178.118606777163, "train/lr": 0.00044615806311398067, "train/loss": 3.511883497238159, "train/global_grad_norm": 0.34147942066192627}
55
+ {"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 10003.42233235808, "train/update_time": 5362.020449607167, "train/lr": 0.0004304134495199673, "train/loss": 3.468095064163208, "train/global_grad_norm": 0.3075481653213501}
56
+ {"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 10150.524502051994, "train/update_time": 5508.941767138196, "train/lr": 0.0004147389036836882, "train/loss": 3.5062997341156006, "train/global_grad_norm": 0.2764611542224884}
57
+ {"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 10566.198982721078, "train/update_time": 5678.196010601241, "train/lr": 0.0003991502085441259, "train/loss": 3.4300529956817627, "train/global_grad_norm": 0.29730668663978577}
58
+ {"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 10760.646769667044, "train/update_time": 5872.135236821254, "train/lr": 0.0003836630605958888, "train/loss": 3.489516258239746, "train/global_grad_norm": 0.32831132411956787}
59
+ {"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 11156.295638620039, "train/update_time": 6039.766134388163, "train/lr": 0.00036829305408417155, "train/loss": 3.5055065155029297, "train/global_grad_norm": 0.3446820080280304}
60
+ {"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 11310.365665178979, "train/update_time": 6193.65441437799, "train/lr": 0.000353055665302672, "train/loss": 3.5088326930999756, "train/global_grad_norm": 0.2981908321380615}
61
+ {"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 11736.789658116992, "train/update_time": 6379.89381142694, "train/lr": 0.0003379662370102746, "train/loss": 3.466792345046997, "train/global_grad_norm": 0.33163660764694214}
62
+ {"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 11914.080722624087, "train/update_time": 6556.690899457899, "train/lr": 0.00032303996298219405, "train/loss": 3.4406988620758057, "train/global_grad_norm": 0.28660744428634644}
63
+ {"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 12308.019568020012, "train/update_time": 6703.663481310825, "train/lr": 0.00030829187271113034, "train/loss": 3.4547548294067383, "train/global_grad_norm": 0.29186901450157166}
64
+ {"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 12483.778021455975, "train/update_time": 6879.228759005899, "train/lr": 0.0002937368162738445, "train/loss": 3.4471235275268555, "train/global_grad_norm": 0.27723199129104614}
65
+ {"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 12905.641336273053, "train/update_time": 7080.2444989720825, "train/lr": 0.0002793894493783894, "train/loss": 3.409696578979492, "train/global_grad_norm": 0.2166973203420639}
66
+ {"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 13068.73947558098, "train/update_time": 7242.851195934112, "train/lr": 0.00026526421860705474, "train/loss": 3.4526355266571045, "train/global_grad_norm": 0.3342931270599365}
67
+ {"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 13461.124407541007, "train/update_time": 7389.911005128175, "train/lr": 0.0002513753468698824, "train/loss": 3.3823962211608887, "train/global_grad_norm": 0.24517573416233063}
68
+ {"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 13651.783316303976, "train/update_time": 7580.372131183161, "train/lr": 0.00023773681908340283, "train/loss": 3.412515163421631, "train/global_grad_norm": 0.3234269320964813}
69
+ {"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 14071.216295699007, "train/update_time": 7793.8012771270005, "train/lr": 0.00022436236808900823, "train/loss": 3.4163782596588135, "train/global_grad_norm": 0.22176125645637512}
70
+ {"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 14221.494554364006, "train/update_time": 7943.600337238051, "train/lr": 0.00021126546082514682, "train/loss": 3.4145960807800293, "train/global_grad_norm": 0.24960394203662872}
71
+ {"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 14616.199775510002, "train/update_time": 8090.416964606033, "train/lr": 0.00019845928476725522, "train/loss": 3.4254729747772217, "train/global_grad_norm": 0.21741226315498352}
72
+ {"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 14821.623726570979, "train/update_time": 8295.638736715424, "train/lr": 0.0001859567346490913, "train/loss": 3.3944356441497803, "train/global_grad_norm": 0.2173576056957245}
73
+ {"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 15232.941850902047, "train/update_time": 8512.26620015048, "train/lr": 0.00017377039947882782, "train/loss": 3.4161322116851807, "train/global_grad_norm": 0.19619792699813843}
74
+ {"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 15380.238736488041, "train/update_time": 8659.387788099353, "train/lr": 0.00016191254986299043, "train/loss": 3.377074718475342, "train/global_grad_norm": 0.18498826026916504}
75
+ {"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 15773.814038521028, "train/update_time": 8806.402406750247, "train/lr": 0.00015039512565099468, "train/loss": 3.3878912925720215, "train/global_grad_norm": 0.23870347440242767}
76
+ {"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 15988.252661015023, "train/update_time": 9020.618778260308, "train/lr": 0.00013922972391273224, "train/loss": 3.3374383449554443, "train/global_grad_norm": 0.19240827858448029}
77
+ {"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 16392.140212937025, "train/update_time": 9237.03149983345, "train/lr": 0.00012842758726130281, "train/loss": 3.408134937286377, "train/global_grad_norm": 0.20149269700050354}
78
+ {"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 16539.33981846203, "train/update_time": 9384.053210339276, "train/lr": 0.00011799959253265679, "train/loss": 3.354210615158081, "train/global_grad_norm": 0.18948209285736084}
79
+ {"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 16932.73472051299, "train/update_time": 9530.820382265258, "train/lr": 0.00010795623983354214, "train/loss": 3.369267225265503, "train/global_grad_norm": 0.1685154139995575}
80
+ {"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 17149.437690322986, "train/update_time": 9747.28814456216, "train/lr": 9.830764196878872e-05, "train/loss": 3.3556768894195557, "train/global_grad_norm": 0.1737906038761139}
81
+ {"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 17549.297566386987, "train/update_time": 9954.213539150194, "train/lr": 8.906351425856951e-05, "train/loss": 3.3356785774230957, "train/global_grad_norm": 0.1702912300825119}
82
+ {"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 17696.358517175075, "train/update_time": 10101.101109507843, "train/lr": 8.02331647558977e-05, "train/loss": 3.3448057174682617, "train/global_grad_norm": 0.16285409033298492}
83
+ {"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 18089.859054057975, "train/update_time": 10248.098395384499, "train/lr": 7.182548487420554e-05, "train/loss": 3.379516363143921, "train/global_grad_norm": 0.1631559580564499}
84
+ {"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 18306.998820198, "train/update_time": 10465.017247352516, "train/lr": 6.384894043444556e-05, "train/loss": 3.309666872024536, "train/global_grad_norm": 0.15072140097618103}
85
+ {"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 18706.20748156903, "train/update_time": 10662.364285671501, "train/lr": 5.6311563140726166e-05, "train/loss": 3.3882648944854736, "train/global_grad_norm": 0.15730835497379303}
86
+ {"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 18853.344159234082, "train/update_time": 10809.319413250545, "train/lr": 4.922094249306547e-05, "train/loss": 3.371917724609375, "train/global_grad_norm": 0.1638113260269165}
87
+ {"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 19254.14253187005, "train/update_time": 10965.228872205596, "train/lr": 4.2584218145409916e-05, "train/loss": 3.314046621322632, "train/global_grad_norm": 0.14649039506912231}
88
+ {"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 19462.212121990975, "train/update_time": 11172.7729124584, "train/lr": 3.6408072716606236e-05, "train/loss": 3.328913688659668, "train/global_grad_norm": 0.14682640135288239}
89
+ {"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 19858.031892596046, "train/update_time": 11356.353174015298, "train/lr": 3.069872506157217e-05, "train/loss": 3.39823842048645, "train/global_grad_norm": 0.14888012409210205}
90
+ {"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 20004.831071814988, "train/update_time": 11502.97223422851, "train/lr": 2.5461924009435368e-05, "train/loss": 3.309504508972168, "train/global_grad_norm": 0.13238337635993958}
91
+ {"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 20424.197218824993, "train/update_time": 11674.226087162388, "train/lr": 2.0702942574950812e-05, "train/loss": 3.359825849533081, "train/global_grad_norm": 0.14266924560070038}
92
+ {"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 20616.578735099058, "train/update_time": 11866.318235529237, "train/lr": 1.642657264902142e-05, "train/loss": 3.3724734783172607, "train/global_grad_norm": 0.1331755369901657}
93
+ {"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 21010.326182788005, "train/update_time": 12031.375137096155, "train/lr": 1.2637120173670358e-05, "train/loss": 3.365408182144165, "train/global_grad_norm": 0.12957783043384552}
94
+ {"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 21166.986645585042, "train/update_time": 12187.849889185163, "train/lr": 9.338400806321978e-06, "train/loss": 3.319882869720459, "train/global_grad_norm": 0.12174921482801437}
95
+ {"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 21589.57625599904, "train/update_time": 12373.759245545021, "train/lr": 6.533736077758867e-06, "train/loss": 3.326748847961426, "train/global_grad_norm": 0.120763398706913}
96
+ {"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 21767.68956994405, "train/update_time": 12551.347988351132, "train/lr": 4.2259500476214406e-06, "train/loss": 3.3314108848571777, "train/global_grad_norm": 0.12533509731292725}
97
+ {"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 22159.499484215048, "train/update_time": 12698.275234204368, "train/lr": 2.417366460819359e-06, "train/loss": 3.3585853576660156, "train/global_grad_norm": 0.11902791261672974}
98
+ {"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 22335.74923630804, "train/update_time": 12874.329680579598, "train/lr": 1.1098064077174619e-06, "train/loss": 3.3340089321136475, "train/global_grad_norm": 0.1221228837966919}
metrics/jsonlines/train_data_info.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 0, "train_data_info/vocab_size": 50277, "train_data_info/global_tokens_per_batch": 2097152, "train_data_info/local_tokens_per_batch": 2097152, "train_data_info/batch_len": 2048, "train_data_info/seq_len": 2048, "train_data_info/total_tokens": 2055208960, "train_data_info/global_batch_size": 1024, "train_data_info/local_batch_size": 1024}
metrics/jsonlines/train_eval.jsonl ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 811.2910252050497, "train_eval/train_update_time": 469.82893142174, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 7.719095360425272, "train_eval/perplexity_len_2048": 2250.922385774329, "train_eval/loss_avg_len_1024": 7.719504372100928, "train_eval/perplexity_len_1024": 2251.843227615802, "train_eval/loss_avg_len_512": 7.719377458653034, "train_eval/perplexity_len_512": 2251.5574565621464}
2
+ {"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1615.8070381759899, "train_eval/train_update_time": 935.9601778858341, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.957939318534446, "train_eval/perplexity_len_2048": 386.81220564928583, "train_eval/loss_avg_len_1024": 5.961814671365428, "train_eval/perplexity_len_1024": 388.31414782371087, "train_eval/loss_avg_len_512": 5.967393189926952, "train_eval/perplexity_len_512": 390.4864188989083}
3
+ {"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2593.152709970018, "train_eval/train_update_time": 1402.5526540756691, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.251482137483999, "train_eval/perplexity_len_2048": 190.8489232832105, "train_eval/loss_avg_len_1024": 5.2583443090137365, "train_eval/perplexity_len_1024": 192.16306510751045, "train_eval/loss_avg_len_512": 5.272100609642803, "train_eval/perplexity_len_512": 194.824783727455}
4
+ {"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3399.0298356669955, "train_eval/train_update_time": 1869.7792980262311, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.78972842358995, "train_eval/perplexity_len_2048": 120.26870208532655, "train_eval/loss_avg_len_1024": 4.803808954130364, "train_eval/perplexity_len_1024": 121.974127690181, "train_eval/loss_avg_len_512": 4.828457922573944, "train_eval/perplexity_len_512": 125.0180244583548}
5
+ {"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4376.393655605032, "train_eval/train_update_time": 2336.412492537289, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.40212876008818, "train_eval/perplexity_len_2048": 81.6244427065281, "train_eval/loss_avg_len_1024": 4.431100291744224, "train_eval/perplexity_len_1024": 84.02381677946093, "train_eval/loss_avg_len_512": 4.4801360191248385, "train_eval/perplexity_len_512": 88.24667509487}
6
+ {"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5184.687426269054, "train_eval/train_update_time": 2803.9371257049497, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.066799368759794, "train_eval/perplexity_len_2048": 58.36984295830003, "train_eval/loss_avg_len_1024": 4.114848925766128, "train_eval/perplexity_len_1024": 61.2429613303667, "train_eval/loss_avg_len_512": 4.189334867373182, "train_eval/perplexity_len_512": 65.97889164907912}
7
+ {"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6162.256411690032, "train_eval/train_update_time": 3270.438694642042, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.8533093321500744, "train_eval/perplexity_len_2048": 47.148836497135065, "train_eval/loss_avg_len_1024": 3.9121336809778766, "train_eval/perplexity_len_1024": 50.00553408372476, "train_eval/loss_avg_len_512": 3.9987834947431473, "train_eval/perplexity_len_512": 54.531771479730125}
8
+ {"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6968.171832425054, "train_eval/train_update_time": 3737.2908036899753, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.722734505148255, "train_eval/perplexity_len_2048": 41.37738622509416, "train_eval/loss_avg_len_1024": 3.785098307309199, "train_eval/perplexity_len_1024": 44.039999799825864, "train_eval/loss_avg_len_512": 3.878930976334377, "train_eval/perplexity_len_512": 48.3724760994647}
9
+ {"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7948.907636292046, "train_eval/train_update_time": 4204.296549194376, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.6312928089601515, "train_eval/perplexity_len_2048": 37.76160361375511, "train_eval/loss_avg_len_1024": 3.6986304339887464, "train_eval/perplexity_len_1024": 40.39194702318541, "train_eval/loss_avg_len_512": 3.7954792477576484, "train_eval/perplexity_len_512": 44.499557609251696}
10
+ {"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8755.968361919047, "train_eval/train_update_time": 4671.454424570431, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.566865509475556, "train_eval/perplexity_len_2048": 35.40544102059099, "train_eval/loss_avg_len_1024": 3.6334003103244683, "train_eval/perplexity_len_1024": 37.841270164084655, "train_eval/loss_avg_len_512": 3.731213201559185, "train_eval/perplexity_len_512": 41.72970400831208}
11
+ {"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 10003.42233235808, "train_eval/train_update_time": 5362.020449607167, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.5099057611342683, "train_eval/perplexity_len_2048": 33.445115805650026, "train_eval/loss_avg_len_1024": 3.5771838447961635, "train_eval/perplexity_len_1024": 35.77265750657162, "train_eval/loss_avg_len_512": 3.674278985551646, "train_eval/perplexity_len_512": 39.420224057099915}
12
+ {"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 11310.365665178979, "train_eval/train_update_time": 6193.65441437799, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.4691960758760523, "train_eval/perplexity_len_2048": 32.110917324346254, "train_eval/loss_avg_len_1024": 3.5349628742179267, "train_eval/perplexity_len_1024": 34.29374150862089, "train_eval/loss_avg_len_512": 3.634656662457237, "train_eval/perplexity_len_512": 37.88884200179451}
13
+ {"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 12905.641336273053, "train_eval/train_update_time": 7080.2444989720825, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.437814879390981, "train_eval/perplexity_len_2048": 31.118885304078464, "train_eval/loss_avg_len_1024": 3.508632168282675, "train_eval/perplexity_len_1024": 33.402547458364594, "train_eval/loss_avg_len_512": 3.6075781844702215, "train_eval/perplexity_len_512": 36.876636173050585}
14
+ {"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 14221.494554364006, "train_eval/train_update_time": 7943.600337238051, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.4099421536454972, "train_eval/perplexity_len_2048": 30.263493575987354, "train_eval/loss_avg_len_1024": 3.4811164568568342, "train_eval/perplexity_len_1024": 32.49598219280798, "train_eval/loss_avg_len_512": 3.5819485398161484, "train_eval/perplexity_len_512": 35.94351001646744}
15
+ {"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 15773.814038521028, "train_eval/train_update_time": 8806.402406750247, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.389845041899543, "train_eval/perplexity_len_2048": 29.66135564690801, "train_eval/loss_avg_len_1024": 3.4604284336998896, "train_eval/perplexity_len_1024": 31.830610900141725, "train_eval/loss_avg_len_512": 3.562013453548534, "train_eval/perplexity_len_512": 35.234067932753184}
16
+ {"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 17149.437690322986, "train_eval/train_update_time": 9747.28814456216, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.3741092623544136, "train_eval/perplexity_len_2048": 29.19826420153563, "train_eval/loss_avg_len_1024": 3.446687585645268, "train_eval/perplexity_len_1024": 31.396222578985455, "train_eval/loss_avg_len_512": 3.547421754104362, "train_eval/perplexity_len_512": 34.723675803484795}
17
+ {"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 18706.20748156903, "train_eval/train_update_time": 10662.364285671501, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.356781773958419, "train_eval/perplexity_len_2048": 28.696689680159, "train_eval/loss_avg_len_1024": 3.4224050301090756, "train_eval/perplexity_len_1024": 30.643023864854985, "train_eval/loss_avg_len_512": 3.523673592355699, "train_eval/perplexity_len_512": 33.90876692657728}
18
+ {"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 20004.831071814988, "train_eval/train_update_time": 11502.97223422851, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.3524864197691384, "train_eval/perplexity_len_2048": 28.57369158322852, "train_eval/loss_avg_len_1024": 3.4213956192670048, "train_eval/perplexity_len_1024": 30.612108070326094, "train_eval/loss_avg_len_512": 3.522944682840425, "train_eval/perplexity_len_512": 33.8840595095429}
19
+ {"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 21589.57625599904, "train_eval/train_update_time": 12373.759245545021, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.342387769561974, "train_eval/perplexity_len_2048": 28.28658798616546, "train_eval/loss_avg_len_1024": 3.414338314473171, "train_eval/perplexity_len_1024": 30.396829629582328, "train_eval/loss_avg_len_512": 3.5140870866559273, "train_eval/perplexity_len_512": 33.58525349842418}
metrics/jsonlines/val.jsonl ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 190.58031271304935, "val/train_update_time": 190.21444558817893, "val/loss": 7.450875304234563, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 170.25829177699052, "val/val_tokens_per_second": 240575.65462744483, "val/loss_avg_len_2048": 7.450875304234563, "val/perplexity_len_2048": 1721.3692079052364, "val/loss_avg_len_1024": 7.449340622551227, "val/perplexity_len_1024": 1718.7294802005856, "val/loss_avg_len_512": 7.449990987825952, "val/perplexity_len_512": 1719.8476457397608}
2
+ {"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 547.6119900090853, "val/train_update_time": 376.67628215253353, "val/loss": 6.7323331680220555, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 170.36321504006628, "val/val_tokens_per_second": 240427.48894100738, "val/loss_avg_len_2048": 6.7323331680220555, "val/perplexity_len_2048": 839.1027515359466, "val/loss_avg_len_1024": 6.731641844541114, "val/perplexity_len_1024": 838.5228605701034, "val/loss_avg_len_512": 6.734119094878901, "val/perplexity_len_512": 840.6026666462375}
3
+ {"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 904.5979733880376, "val/train_update_time": 562.9825718456414, "val/loss": 6.202562174819015, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 168.99627296999097, "val/val_tokens_per_second": 242372.20904435776, "val/loss_avg_len_2048": 6.202562174819015, "val/perplexity_len_2048": 494.01316904528653, "val/loss_avg_len_1024": 6.203721904096799, "val/perplexity_len_1024": 494.5864229264433, "val/loss_avg_len_512": 6.209107841835264, "val/perplexity_len_512": 497.25742106468925}
4
+ {"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 1260.3716635780875, "val/train_update_time": 749.4701656188117, "val/loss": 5.854788969698548, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 168.6496025499655, "val/val_tokens_per_second": 242870.42116132387, "val/loss_avg_len_2048": 5.854788969698548, "val/perplexity_len_2048": 348.9012635456685, "val/loss_avg_len_1024": 5.85749615726755, "val/perplexity_len_1024": 349.84708438871354, "val/loss_avg_len_512": 5.865908991570771, "val/perplexity_len_512": 352.8027050825446}
5
+ {"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 1615.8070381759899, "val/train_update_time": 935.9601778858341, "val/loss": 5.549726753812819, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 169.84832797700074, "val/val_tokens_per_second": 241156.33334669282, "val/loss_avg_len_2048": 5.549726753812819, "val/perplexity_len_2048": 257.16727632667454, "val/loss_avg_len_1024": 5.554625083155884, "val/perplexity_len_1024": 258.4300565742626, "val/loss_avg_len_512": 5.565984733520728, "val/perplexity_len_512": 261.38246909910544}
6
+ {"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 1973.9993880910333, "val/train_update_time": 1122.4686564019648, "val/loss": 5.291026324360166, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 169.91353944293223, "val/val_tokens_per_second": 241063.779462713, "val/loss_avg_len_2048": 5.291026324360166, "val/perplexity_len_2048": 198.5470945960036, "val/loss_avg_len_1024": 5.297933264936833, "val/perplexity_len_1024": 199.92319442927527, "val/loss_avg_len_512": 5.312294404878934, "val/perplexity_len_512": 202.81503476251478}
7
+ {"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 2330.883187837084, "val/train_update_time": 1309.140196379507, "val/loss": 5.082773989376356, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 168.71492548892274, "val/val_tokens_per_second": 242776.3867440957, "val/loss_avg_len_2048": 5.082773989376356, "val/perplexity_len_2048": 161.22066060367317, "val/loss_avg_len_1024": 5.091564861865482, "val/perplexity_len_1024": 162.64417868926273, "val/loss_avg_len_512": 5.109285918890592, "val/perplexity_len_512": 165.55209503151025}
8
+ {"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 2686.8860215520253, "val/train_update_time": 1496.1389014086453, "val/loss": 4.900254244546081, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 168.97393367695622, "val/val_tokens_per_second": 242404.25199727662, "val/loss_avg_len_2048": 4.900254244546081, "val/perplexity_len_2048": 134.3239264696477, "val/loss_avg_len_1024": 4.91143611547025, "val/perplexity_len_1024": 135.83434821009394, "val/loss_avg_len_512": 4.93267407396678, "val/perplexity_len_512": 138.7500445859649}
9
+ {"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 3042.959401597036, "val/train_update_time": 1682.9467192575103, "val/loss": 4.739786327227112, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 168.96092426101677, "val/val_tokens_per_second": 242422.91629941342, "val/loss_avg_len_2048": 4.739786327227112, "val/perplexity_len_2048": 114.40975281909341, "val/loss_avg_len_1024": 4.754103896766296, "val/perplexity_len_1024": 116.05960516499653, "val/loss_avg_len_512": 4.780596644866559, "val/perplexity_len_512": 119.17543424775661}
10
+ {"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 3399.0298356669955, "val/train_update_time": 1869.7792980262311, "val/loss": 4.605688608644693, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 169.33557621506043, "val/val_tokens_per_second": 241886.5599038667, "val/loss_avg_len_2048": 4.605688608644693, "val/perplexity_len_2048": 100.05185570608518, "val/loss_avg_len_1024": 4.625144383868621, "val/perplexity_len_1024": 102.0175017009493, "val/loss_avg_len_512": 4.659852839846723, "val/perplexity_len_512": 105.62053787839496}
11
+ {"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 3756.8895379800815, "val/train_update_time": 2056.446317301248, "val/loss": 4.442701461410919, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 169.17154206091072, "val/val_tokens_per_second": 242121.1008719908, "val/loss_avg_len_2048": 4.442701461410919, "val/perplexity_len_2048": 85.0042675253757, "val/loss_avg_len_1024": 4.471159290939104, "val/perplexity_len_1024": 87.4580535953644, "val/loss_avg_len_512": 4.518184423743747, "val/perplexity_len_512": 91.66901471505673}
12
+ {"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 4112.964794773026, "val/train_update_time": 2243.0702600192744, "val/loss": 4.2837813609120206, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 169.93256872706115, "val/val_tokens_per_second": 241036.78480720375, "val/loss_avg_len_2048": 4.2837813609120206, "val/perplexity_len_2048": 72.51412430688201, "val/loss_avg_len_1024": 4.324054979363084, "val/perplexity_len_1024": 75.4941356224282, "val/loss_avg_len_512": 4.385897227285058, "val/perplexity_len_512": 80.31024744229244}
13
+ {"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 4470.06449480704, "val/train_update_time": 2429.936827432248, "val/loss": 4.142301944319485, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 170.5037173400633, "val/val_tokens_per_second": 240229.36648534652, "val/loss_avg_len_2048": 4.142301944319485, "val/perplexity_len_2048": 62.94755656946625, "val/loss_avg_len_1024": 4.190639730950771, "val/perplexity_len_1024": 66.0650412962264, "val/loss_avg_len_512": 4.262273591935914, "val/perplexity_len_512": 70.97115961662232}
14
+ {"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 4827.830736390082, "val/train_update_time": 2616.8884120163275, "val/loss": 4.03030806598065, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 169.51862163702026, "val/val_tokens_per_second": 241625.3719175768, "val/loss_avg_len_2048": 4.03030806598065, "val/perplexity_len_2048": 56.27824598990862, "val/loss_avg_len_1024": 4.084947683577659, "val/perplexity_len_1024": 59.43882804306883, "val/loss_avg_len_512": 4.163849879008346, "val/perplexity_len_512": 64.31866563997396}
15
+ {"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 5184.687426269054, "val/train_update_time": 2803.9371257049497, "val/loss": 3.93689947860057, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 170.90034897695296, "val/val_tokens_per_second": 239671.83358720774, "val/loss_avg_len_2048": 3.93689947860057, "val/perplexity_len_2048": 51.259423721129075, "val/loss_avg_len_1024": 3.9956295500188603, "val/perplexity_len_1024": 54.36005222562956, "val/loss_avg_len_512": 4.079135575135518, "val/perplexity_len_512": 59.09436512640163}
16
+ {"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 5543.975930526038, "val/train_update_time": 2990.483508925303, "val/loss": 3.873067468292034, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 168.9153455970809, "val/val_tokens_per_second": 242488.3296139546, "val/loss_avg_len_2048": 3.873067468292034, "val/perplexity_len_2048": 48.08967361439601, "val/loss_avg_len_1024": 3.935505072252453, "val/perplexity_len_1024": 51.18799706575565, "val/loss_avg_len_512": 4.0234172149434695, "val/perplexity_len_512": 55.891774069637314}
17
+ {"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 5899.68232584605, "val/train_update_time": 3176.9836701630848, "val/loss": 3.8065103932873816, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 168.98372465907596, "val/val_tokens_per_second": 242390.2070015124, "val/loss_avg_len_2048": 3.8065103932873816, "val/perplexity_len_2048": 44.99315617874155, "val/loss_avg_len_1024": 3.8711530834252486, "val/perplexity_len_1024": 47.99769953596133, "val/loss_avg_len_512": 3.961617600578815, "val/perplexity_len_512": 52.542249617687304}
18
+ {"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 6255.94410856301, "val/train_update_time": 3363.9935755479382, "val/loss": 3.7648755942319054, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 169.21118752204347, "val/val_tokens_per_second": 242064.37292843926, "val/loss_avg_len_2048": 3.7648755942319054, "val/perplexity_len_2048": 43.158336379882776, "val/loss_avg_len_1024": 3.8314554831946737, "val/perplexity_len_1024": 46.129630298360155, "val/loss_avg_len_512": 3.923248363528401, "val/perplexity_len_512": 50.564429942215185}
19
+ {"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 6612.111398222041, "val/train_update_time": 3550.66807099001, "val/loss": 3.7117092974703993, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 169.1644801320508, "val/val_tokens_per_second": 242131.20844296852, "val/loss_avg_len_2048": 3.7117092974703993, "val/perplexity_len_2048": 40.923697550401094, "val/loss_avg_len_1024": 3.7795683319184, "val/perplexity_len_1024": 43.797131830589514, "val/loss_avg_len_512": 3.873812707895227, "val/perplexity_len_512": 48.125525301067185}
20
+ {"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 6968.171832425054, "val/train_update_time": 3737.2908036899753, "val/loss": 3.6820711012163434, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 170.1077867250424, "val/val_tokens_per_second": 240788.50703176, "val/loss_avg_len_2048": 3.6820711012163434, "val/perplexity_len_2048": 39.72859085710849, "val/loss_avg_len_1024": 3.75097957880958, "val/perplexity_len_1024": 42.56275515868964, "val/loss_avg_len_512": 3.8466630883938633, "val/perplexity_len_512": 46.836512876088385}
21
+ {"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 7326.922928820015, "val/train_update_time": 3924.0719714582665, "val/loss": 3.641011830084678, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 170.91282955405768, "val/val_tokens_per_second": 239654.33201750863, "val/loss_avg_len_2048": 3.641011830084678, "val/perplexity_len_2048": 38.13039869757022, "val/loss_avg_len_1024": 3.710436346722953, "val/perplexity_len_1024": 40.871636841405206, "val/loss_avg_len_512": 3.806864157543611, "val/perplexity_len_512": 45.009075964932165}
22
+ {"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 7684.845341357053, "val/train_update_time": 4110.783207958448, "val/loss": 3.61963528591136, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 170.39240577502642, "val/val_tokens_per_second": 240386.30016223004, "val/loss_avg_len_2048": 3.61963528591136, "val/perplexity_len_2048": 37.32395276798534, "val/loss_avg_len_1024": 3.6890140571806116, "val/perplexity_len_1024": 40.00538448504298, "val/loss_avg_len_512": 3.7862415502706543, "val/perplexity_len_512": 44.09037701082852}
23
+ {"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 8042.599143097992, "val/train_update_time": 4297.83159168635, "val/loss": 3.5845805286582095, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 169.11018539394718, "val/val_tokens_per_second": 242208.94740658268, "val/loss_avg_len_2048": 3.5845805286582095, "val/perplexity_len_2048": 36.03823753998343, "val/loss_avg_len_1024": 3.6556578606538475, "val/perplexity_len_1024": 38.69296732499727, "val/loss_avg_len_512": 3.753422931008786, "val/perplexity_len_512": 42.66687811283449}
24
+ {"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 8398.81305668701, "val/train_update_time": 4484.626386589487, "val/loss": 3.5655147499182727, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 170.01293277891818, "val/val_tokens_per_second": 240922.84822392694, "val/loss_avg_len_2048": 3.5655147499182727, "val/perplexity_len_2048": 35.35764906773771, "val/loss_avg_len_1024": 3.6369095906229223, "val/perplexity_len_1024": 37.97429906927659, "val/loss_avg_len_512": 3.736021636988502, "val/perplexity_len_512": 41.93084178698823}
25
+ {"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 8755.968361919047, "val/train_update_time": 4671.454424570431, "val/loss": 3.540912869877019, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 170.22161650797352, "val/val_tokens_per_second": 240627.48809626861, "val/loss_avg_len_2048": 3.540912869877019, "val/perplexity_len_2048": 34.498397368694555, "val/loss_avg_len_1024": 3.612297219974548, "val/perplexity_len_1024": 37.05106958345076, "val/loss_avg_len_512": 3.7114113950682803, "val/perplexity_len_512": 40.91150809832091}
26
+ {"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 9114.625705051003, "val/train_update_time": 4857.963408218231, "val/loss": 3.523965722436574, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 170.6301947310567, "val/val_tokens_per_second": 240051.29962232176, "val/loss_avg_len_2048": 3.523965722436574, "val/perplexity_len_2048": 33.91867414442962, "val/loss_avg_len_1024": 3.5953900306275113, "val/perplexity_len_1024": 36.42990599640571, "val/loss_avg_len_512": 3.694821816809941, "val/perplexity_len_512": 40.238402142307585}
27
+ {"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 9606.105255214032, "val/train_update_time": 5178.118606777163, "val/loss": 3.5034193539332135, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 213.20502558199223, "val/val_tokens_per_second": 192115.546470775, "val/loss_avg_len_2048": 3.5034193539332135, "val/perplexity_len_2048": 33.228879223062926, "val/loss_avg_len_1024": 3.575615437129885, "val/perplexity_len_1024": 35.71659537192833, "val/loss_avg_len_512": 3.6756443774067797, "val/perplexity_len_512": 39.47408487214642}
28
+ {"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 10150.524502051994, "val/train_update_time": 5508.941767138196, "val/loss": 3.485983414201392, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 246.22538287297357, "val/val_tokens_per_second": 166351.6552277271, "val/loss_avg_len_2048": 3.485983414201392, "val/perplexity_len_2048": 32.654524247766986, "val/loss_avg_len_1024": 3.558595063583925, "val/perplexity_len_1024": 35.113829776059404, "val/loss_avg_len_512": 3.6589644041204825, "val/perplexity_len_512": 38.82111905618587}
29
+ {"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 10760.646769667044, "val/train_update_time": 5872.135236821254, "val/loss": 3.4716991148468344, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 227.81734533095732, "val/val_tokens_per_second": 179793.1581570145, "val/loss_avg_len_2048": 3.4716991148468344, "val/perplexity_len_2048": 32.191392876501965, "val/loss_avg_len_1024": 3.5444762171712245, "val/perplexity_len_1024": 34.6215464207269, "val/loss_avg_len_512": 3.645251961359289, "val/perplexity_len_512": 38.29241984632693}
30
+ {"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 11310.365665178979, "val/train_update_time": 6193.65441437799, "val/loss": 3.458023446854088, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 238.1312235830119, "val/val_tokens_per_second": 172006.00317631784, "val/loss_avg_len_2048": 3.458023446854088, "val/perplexity_len_2048": 31.75415067939918, "val/loss_avg_len_1024": 3.531052290999843, "val/perplexity_len_1024": 34.15989485839664, "val/loss_avg_len_512": 3.63209585030973, "val/perplexity_len_512": 37.791939921908}
31
+ {"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 11914.080722624087, "val/train_update_time": 6556.690899457899, "val/loss": 3.4443763262484692, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 246.78844607900828, "val/val_tokens_per_second": 165972.1135684238, "val/loss_avg_len_2048": 3.4443763262484692, "val/perplexity_len_2048": 31.323741558011843, "val/loss_avg_len_1024": 3.5173761587224903, "val/perplexity_len_1024": 33.69589967943844, "val/loss_avg_len_512": 3.618587289446686, "val/perplexity_len_512": 37.28485788666368}
32
+ {"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 12483.778021455975, "val/train_update_time": 6879.228759005899, "val/loss": 3.4335085181755947, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 220.31813231599517, "val/val_tokens_per_second": 185912.97760845392, "val/loss_avg_len_2048": 3.4335085181755947, "val/perplexity_len_2048": 30.98516427551629, "val/loss_avg_len_1024": 3.5070164691339714, "val/perplexity_len_1024": 33.34862256580393, "val/loss_avg_len_512": 3.6083294949505476, "val/perplexity_len_512": 36.90435238672373}
33
+ {"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 13068.73947558098, "val/train_update_time": 7242.851195934112, "val/loss": 3.4239009072753603, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 245.130507607013, "val/val_tokens_per_second": 167094.6647965419, "val/loss_avg_len_2048": 3.4239009072753603, "val/perplexity_len_2048": 30.68889636581984, "val/loss_avg_len_1024": 3.4978476569967345, "val/perplexity_len_1024": 33.044252797363725, "val/loss_avg_len_512": 3.5992340820400974, "val/perplexity_len_512": 36.570213930693726}
34
+ {"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 13651.783316303976, "val/train_update_time": 7580.372131183161, "val/loss": 3.4135220782420834, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 205.47673456801567, "val/val_tokens_per_second": 199341.30297579587, "val/loss_avg_len_2048": 3.4135220782420834, "val/perplexity_len_2048": 30.372028758989988, "val/loss_avg_len_1024": 3.4871821906892118, "val/perplexity_len_1024": 32.69369319637278, "val/loss_avg_len_512": 3.5891118032899687, "val/perplexity_len_512": 36.201907227628595}
35
+ {"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 14221.494554364006, "val/train_update_time": 7943.600337238051, "val/loss": 3.40461270494815, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 246.15931547700893, "val/val_tokens_per_second": 166396.30281969009, "val/loss_avg_len_2048": 3.40461270494815, "val/perplexity_len_2048": 30.102634864436293, "val/loss_avg_len_1024": 3.4784065833980686, "val/perplexity_len_1024": 32.40804140117123, "val/loss_avg_len_512": 3.5805338707463816, "val/perplexity_len_512": 35.89269779430199}
36
+ {"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 14821.623726570979, "val/train_update_time": 8295.638736715424, "val/loss": 3.3961672243999317, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 194.16551748092752, "val/val_tokens_per_second": 210954.0382422611, "val/loss_avg_len_2048": 3.3961672243999317, "val/perplexity_len_2048": 29.849474183786675, "val/loss_avg_len_1024": 3.469946994170826, "val/perplexity_len_1024": 32.1350390552181, "val/loss_avg_len_512": 3.5720793008117004, "val/perplexity_len_512": 35.590519663690706}
37
+ {"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 15380.238736488041, "val/train_update_time": 8659.387788099353, "val/loss": 3.3896002479799794, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 246.37208192201797, "val/val_tokens_per_second": 166252.60330009597, "val/loss_avg_len_2048": 3.3896002479799794, "val/perplexity_len_2048": 29.654095616041587, "val/loss_avg_len_1024": 3.463640005337913, "val/perplexity_len_1024": 31.933001516723323, "val/loss_avg_len_512": 3.5656460781862034, "val/perplexity_len_512": 35.36229283146993}
38
+ {"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 15988.252661015023, "val/train_update_time": 9020.618778260308, "val/loss": 3.3830379756949847, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 186.93504923197906, "val/val_tokens_per_second": 219113.5379281937, "val/loss_avg_len_2048": 3.3830379756949847, "val/perplexity_len_2048": 29.46013447520526, "val/loss_avg_len_1024": 3.4570769355883826, "val/perplexity_len_1024": 31.724109237572595, "val/loss_avg_len_512": 3.5594564177378083, "val/perplexity_len_512": 35.144088248955605}
39
+ {"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 16539.33981846203, "val/train_update_time": 9384.053210339276, "val/loss": 3.377662815920031, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 246.44063618697692, "val/val_tokens_per_second": 166206.35554975295, "val/loss_avg_len_2048": 3.377662815920031, "val/perplexity_len_2048": 29.302206370051053, "val/loss_avg_len_1024": 3.451863897304051, "val/perplexity_len_1024": 31.55916055702456, "val/loss_avg_len_512": 3.5541689533909784, "val/perplexity_len_512": 34.95875553717333}
40
+ {"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 17149.437690322986, "val/train_update_time": 9747.28814456216, "val/loss": 3.3723305154115426, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 191.12918155093212, "val/val_tokens_per_second": 214305.3178359631, "val/loss_avg_len_2048": 3.3723305154115426, "val/perplexity_len_2048": 29.14637404176299, "val/loss_avg_len_1024": 3.446441471570078, "val/perplexity_len_1024": 31.388496477491135, "val/loss_avg_len_512": 3.548855001350306, "val/perplexity_len_512": 34.773479097890984}
41
+ {"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 17696.358517175075, "val/train_update_time": 10101.101109507843, "val/loss": 3.368293362279772, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 246.3307274680119, "val/val_tokens_per_second": 166280.5140918483, "val/loss_avg_len_2048": 3.368293362279772, "val/perplexity_len_2048": 29.02894286982928, "val/loss_avg_len_1024": 3.4423994922754817, "val/perplexity_len_1024": 31.26188088582337, "val/loss_avg_len_512": 3.5450093415079635, "val/perplexity_len_512": 34.64000893066873}
42
+ {"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 18306.998820198, "val/train_update_time": 10465.017247352516, "val/loss": 3.3645532914318377, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 201.65959867101628, "val/val_tokens_per_second": 203114.55675770427, "val/loss_avg_len_2048": 3.3645532914318377, "val/perplexity_len_2048": 28.920575344288608, "val/loss_avg_len_1024": 3.438682607750362, "val/perplexity_len_1024": 31.14589976227984, "val/loss_avg_len_512": 3.541240479665343, "val/perplexity_len_512": 34.50970123288121}
43
+ {"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 18853.344159234082, "val/train_update_time": 10809.319413250545, "val/loss": 3.361436649635225, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 244.70953505299985, "val/val_tokens_per_second": 167382.1168886156, "val/loss_avg_len_2048": 3.361436649635225, "val/perplexity_len_2048": 28.830580583950955, "val/loss_avg_len_1024": 3.43572733581001, "val/perplexity_len_1024": 31.05399103319084, "val/loss_avg_len_512": 3.538355437440705, "val/perplexity_len_512": 34.410282769855435}
44
+ {"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 19462.212121990975, "val/train_update_time": 11172.7729124584, "val/loss": 3.3589920665140265, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 212.01162798295263, "val/val_tokens_per_second": 193196.95051487224, "val/loss_avg_len_2048": 3.3589920665140265, "val/perplexity_len_2048": 28.76018790870942, "val/loss_avg_len_1024": 3.4332265336758923, "val/perplexity_len_1024": 30.976428171250745, "val/loss_avg_len_512": 3.5358822584063745, "val/perplexity_len_512": 34.32528513045831}
45
+ {"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 20004.831071814988, "val/train_update_time": 11502.97223422851, "val/loss": 3.356962317546993, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 246.35607897106092, "val/val_tokens_per_second": 166263.40283980372, "val/loss_avg_len_2048": 3.356962317546993, "val/perplexity_len_2048": 28.70187115122067, "val/loss_avg_len_1024": 3.431286059721187, "val/perplexity_len_1024": 30.91637750139814, "val/loss_avg_len_512": 3.533992369709909, "val/perplexity_len_512": 34.26047542294408}
46
+ {"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 20616.578735099058, "val/train_update_time": 11866.318235529237, "val/loss": 3.3554599901208655, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 228.49143660697155, "val/val_tokens_per_second": 179262.7356554082, "val/loss_avg_len_2048": 3.3554599901208655, "val/perplexity_len_2048": 28.65878391668004, "val/loss_avg_len_1024": 3.429749476152519, "val/perplexity_len_1024": 30.868908383200168, "val/loss_avg_len_512": 3.5324522381311283, "val/perplexity_len_512": 34.20775039500344}
47
+ {"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 21166.986645585042, "val/train_update_time": 12187.849889185163, "val/loss": 3.354476233608229, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 236.16754041495733, "val/val_tokens_per_second": 173436.19672725294, "val/loss_avg_len_2048": 3.354476233608229, "val/perplexity_len_2048": 28.630604514465613, "val/loss_avg_len_1024": 3.4288002763013354, "val/perplexity_len_1024": 30.839621521697836, "val/loss_avg_len_512": 3.531493121845834, "val/perplexity_len_512": 34.174956913411336}
48
+ {"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 21767.68956994405, "val/train_update_time": 12551.347988351132, "val/loss": 3.3539066611163086, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 244.70243748801295, "val/val_tokens_per_second": 167386.97178693398, "val/loss_avg_len_2048": 3.3539066611163086, "val/perplexity_len_2048": 28.614301952893154, "val/loss_avg_len_1024": 3.4281986616883895, "val/perplexity_len_1024": 30.82107353466212, "val/loss_avg_len_512": 3.530901245925203, "val/perplexity_len_512": 34.154735564180854}
49
+ {"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 22335.74923630804, "val/train_update_time": 12874.329680579598, "val/loss": 3.353608620805643, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 220.12170270108618, "val/val_tokens_per_second": 186078.880443795, "val/loss_avg_len_2048": 3.353608620805643, "val/perplexity_len_2048": 28.60577500819937, "val/loss_avg_len_1024": 3.4279116332200825, "val/perplexity_len_1024": 30.812228278614764, "val/loss_avg_len_512": 3.530627423015889, "val/perplexity_len_512": 34.14538449544867}
metrics/jsonlines/val_data_info.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 0, "val_data_info/vocab_size": 50277, "val_data_info/global_tokens_per_batch": 2048, "val_data_info/local_tokens_per_batch": 2048, "val_data_info/batch_len": 2048, "val_data_info/seq_len": 2048, "val_data_info/total_tokens": 2147483648, "val_data_info/global_batch_size": 1, "val_data_info/local_batch_size": 1}
metrics/npz/train_eval/step-000000104857600.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6a2df6c4524c1fc852a0dcd21cc06ca765bbad8dfbe52ec454576a084431a9c
3
+ size 20540
metrics/npz/train_eval/step-000000209715200.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2de3934e9d3406a693ace41b171e525920e60d81785017ae65227ecb612cdee8
3
+ size 20540