add remote code + model files
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .hydra/config.yaml +102 -0
- .hydra/hydra.yaml +146 -0
- .hydra/overrides.yaml +16 -0
- __init__.py +1 -0
- __pycache__/__init__.cpython-310.pyc +0 -0
- __pycache__/configuration_transformer.cpython-310.pyc +0 -0
- __pycache__/modeling_transformer.cpython-310.pyc +0 -0
- checkpoints/step-000000209715200.pt +3 -0
- checkpoints/step-000000209715200.pt.done +0 -0
- checkpoints/step-000000209715200.pt.keep +0 -0
- checkpoints/step-000000419430400.pt +3 -0
- checkpoints/step-000000419430400.pt.done +0 -0
- checkpoints/step-000000419430400.pt.keep +0 -0
- checkpoints/step-000000629145600.pt +3 -0
- checkpoints/step-000000629145600.pt.done +0 -0
- checkpoints/step-000000629145600.pt.keep +0 -0
- checkpoints/step-000000838860800.pt +3 -0
- checkpoints/step-000000838860800.pt.done +0 -0
- checkpoints/step-000000838860800.pt.keep +0 -0
- checkpoints/step-000001048576000.pt +3 -0
- checkpoints/step-000001048576000.pt.done +0 -0
- checkpoints/step-000001048576000.pt.keep +0 -0
- checkpoints/step-000001258291200.pt +3 -0
- checkpoints/step-000001258291200.pt.done +0 -0
- checkpoints/step-000001258291200.pt.keep +0 -0
- checkpoints/step-000001468006400.pt +3 -0
- checkpoints/step-000001468006400.pt.done +0 -0
- checkpoints/step-000001468006400.pt.keep +0 -0
- checkpoints/step-000001677721600.pt +3 -0
- checkpoints/step-000001677721600.pt.done +0 -0
- checkpoints/step-000001677721600.pt.keep +0 -0
- checkpoints/step-000001887436800.pt +3 -0
- checkpoints/step-000001887436800.pt.done +0 -0
- checkpoints/step-000001887436800.pt.keep +0 -0
- config.yaml +102 -0
- configuration_transformer.py +67 -0
- decay_params.txt +16 -0
- logs/2025-10-13_04-19-28.log +258 -0
- metrics/jsonlines/checkpoint.jsonl +9 -0
- metrics/jsonlines/model_info.jsonl +1 -0
- metrics/jsonlines/norm.jsonl +0 -0
- metrics/jsonlines/resume.jsonl +1 -0
- metrics/jsonlines/throughput.jsonl +0 -0
- metrics/jsonlines/train.jsonl +98 -0
- metrics/jsonlines/train_data_info.jsonl +1 -0
- metrics/jsonlines/train_eval.jsonl +19 -0
- metrics/jsonlines/val.jsonl +49 -0
- metrics/jsonlines/val_data_info.jsonl +1 -0
- metrics/npz/train_eval/step-000000104857600.npz +3 -0
- metrics/npz/train_eval/step-000000209715200.npz +3 -0
.hydra/config.yaml
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model:
|
| 2 |
+
_target_: forgetting_transformer.model.forgetting_transformer.modeling_forgetting_transformer.ForgettingTransformerForCausalLM
|
| 3 |
+
config:
|
| 4 |
+
_target_: forgetting_transformer.model.forgetting_transformer.configuration_forgetting_transformer.ForgettingTransformerConfig
|
| 5 |
+
vocab_size: ???
|
| 6 |
+
hidden_size: 256
|
| 7 |
+
hidden_ratio: 4.0
|
| 8 |
+
intermediate_size: null
|
| 9 |
+
num_hidden_layers: 2
|
| 10 |
+
num_heads: 4
|
| 11 |
+
num_kv_heads: null
|
| 12 |
+
hidden_act: swish
|
| 13 |
+
window_size: null
|
| 14 |
+
max_position_embeddings: null
|
| 15 |
+
initializer_range: 0.02
|
| 16 |
+
elementwise_affine: true
|
| 17 |
+
norm_eps: 1.0e-06
|
| 18 |
+
use_cache: true
|
| 19 |
+
pad_token_id: null
|
| 20 |
+
bos_token_id: null
|
| 21 |
+
eos_token_id: null
|
| 22 |
+
tie_word_embeddings: false
|
| 23 |
+
attention_bias: false
|
| 24 |
+
fuse_norm: true
|
| 25 |
+
fuse_cross_entropy: true
|
| 26 |
+
rope_base: 500000.0
|
| 27 |
+
use_rope: false
|
| 28 |
+
use_output_gate: false
|
| 29 |
+
ogate_act: sigmoid
|
| 30 |
+
fgate_type: full
|
| 31 |
+
fgate_bias_init: false
|
| 32 |
+
decay_time_min: null
|
| 33 |
+
decay_time_max: null
|
| 34 |
+
use_output_norm: false
|
| 35 |
+
qk_norm: false
|
| 36 |
+
qk_norm_share_param_across_head: false
|
| 37 |
+
use_k_shift: false
|
| 38 |
+
use_v_shift: false
|
| 39 |
+
optimizer:
|
| 40 |
+
_target_: torch.optim.AdamW
|
| 41 |
+
lr: 0.001
|
| 42 |
+
betas:
|
| 43 |
+
- 0.9
|
| 44 |
+
- 0.95
|
| 45 |
+
weight_decay: 0.1
|
| 46 |
+
schedule:
|
| 47 |
+
_target_: forgetting_transformer.schedule.warmup_cosine_decay_schedule
|
| 48 |
+
init_value: 0.0
|
| 49 |
+
peak_value: ${optimizer.lr}
|
| 50 |
+
warmup_steps: 20971520
|
| 51 |
+
decay_steps: ${train.max_tokens}
|
| 52 |
+
end_value: 0.0
|
| 53 |
+
datamodule:
|
| 54 |
+
_target_: forgetting_transformer.datamodule.npy.NpyDataModule
|
| 55 |
+
data_path: ${data_dir}
|
| 56 |
+
rank: ???
|
| 57 |
+
world_size: ???
|
| 58 |
+
train_batch_len: 2048
|
| 59 |
+
train_batch_size: 1024
|
| 60 |
+
train_num_workers: 0
|
| 61 |
+
eval_tokens: 2147483648
|
| 62 |
+
eval_batch_len: 2048
|
| 63 |
+
eval_local_batch_size: 1
|
| 64 |
+
eval_num_workers: 0
|
| 65 |
+
strategy:
|
| 66 |
+
_target_: lightning.fabric.strategies.SingleDeviceStrategy
|
| 67 |
+
device: cuda:0
|
| 68 |
+
exp: forgetting_gate_2_4_256
|
| 69 |
+
tag: forgetting_gate_2_4_256
|
| 70 |
+
seed: 42
|
| 71 |
+
hf_load_dir: null
|
| 72 |
+
hf_save_dir: null
|
| 73 |
+
hf_load_step: null
|
| 74 |
+
output_dir: ./forgetting_gate_2_4_256/
|
| 75 |
+
data_dir: /workspace/forgetting-transformer/data
|
| 76 |
+
resume: false
|
| 77 |
+
fork_dir: null
|
| 78 |
+
fork_step: null
|
| 79 |
+
log_interval: 20971520
|
| 80 |
+
eval_interval: 41943040
|
| 81 |
+
final_eval: true
|
| 82 |
+
skip_eval: false
|
| 83 |
+
checkpoint_interval: 209715200
|
| 84 |
+
train_eval_interval: 104857600
|
| 85 |
+
checkpoint_keep_interval: 209715200
|
| 86 |
+
fabric:
|
| 87 |
+
devices: 1
|
| 88 |
+
precision: 16-mixed
|
| 89 |
+
train:
|
| 90 |
+
max_tokens: 2097152000
|
| 91 |
+
grad_acc_tokens: 32768
|
| 92 |
+
max_grad_norm: 1.0
|
| 93 |
+
gradient_checkpointing: true
|
| 94 |
+
bias_weight_decay: false
|
| 95 |
+
normalization_weight_decay: false
|
| 96 |
+
conv_weight_decay: true
|
| 97 |
+
eval:
|
| 98 |
+
min_val_length: 512
|
| 99 |
+
wandb:
|
| 100 |
+
project: forgetting-transformer
|
| 101 |
+
mode: online
|
| 102 |
+
log_dir: ./output/wandb
|
.hydra/hydra.yaml
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: ${output_dir}
|
| 4 |
+
sweep:
|
| 5 |
+
dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 6 |
+
subdir: ${hydra.job.num}
|
| 7 |
+
launcher:
|
| 8 |
+
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
|
| 9 |
+
sweeper:
|
| 10 |
+
_target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
|
| 11 |
+
max_batch_size: null
|
| 12 |
+
params: null
|
| 13 |
+
help:
|
| 14 |
+
app_name: ${hydra.job.name}
|
| 15 |
+
header: '${hydra.help.app_name} is powered by Hydra.
|
| 16 |
+
|
| 17 |
+
'
|
| 18 |
+
footer: 'Powered by Hydra (https://hydra.cc)
|
| 19 |
+
|
| 20 |
+
Use --hydra-help to view Hydra specific help
|
| 21 |
+
|
| 22 |
+
'
|
| 23 |
+
template: '${hydra.help.header}
|
| 24 |
+
|
| 25 |
+
== Configuration groups ==
|
| 26 |
+
|
| 27 |
+
Compose your configuration from those groups (group=option)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
$APP_CONFIG_GROUPS
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
== Config ==
|
| 34 |
+
|
| 35 |
+
Override anything in the config (foo.bar=value)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
$CONFIG
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
${hydra.help.footer}
|
| 42 |
+
|
| 43 |
+
'
|
| 44 |
+
hydra_help:
|
| 45 |
+
template: 'Hydra (${hydra.runtime.version})
|
| 46 |
+
|
| 47 |
+
See https://hydra.cc for more info.
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
== Flags ==
|
| 51 |
+
|
| 52 |
+
$FLAGS_HELP
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
== Configuration groups ==
|
| 56 |
+
|
| 57 |
+
Compose your configuration from those groups (For example, append hydra/job_logging=disabled
|
| 58 |
+
to command line)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
$HYDRA_CONFIG_GROUPS
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
Use ''--cfg hydra'' to Show the Hydra config.
|
| 65 |
+
|
| 66 |
+
'
|
| 67 |
+
hydra_help: ???
|
| 68 |
+
hydra_logging:
|
| 69 |
+
version: 1
|
| 70 |
+
root: null
|
| 71 |
+
disable_existing_loggers: false
|
| 72 |
+
job_logging:
|
| 73 |
+
version: 1
|
| 74 |
+
root: null
|
| 75 |
+
disable_existing_loggers: false
|
| 76 |
+
env: {}
|
| 77 |
+
mode: RUN
|
| 78 |
+
searchpath: []
|
| 79 |
+
callbacks: {}
|
| 80 |
+
output_subdir: .hydra
|
| 81 |
+
overrides:
|
| 82 |
+
hydra:
|
| 83 |
+
- hydra.mode=RUN
|
| 84 |
+
task:
|
| 85 |
+
- +experiment/pile/forgetting_transformer=forgetting_gate_2_4_256
|
| 86 |
+
- strategy=single_device
|
| 87 |
+
- datamodule=npy
|
| 88 |
+
- schedule=warmup_cosine
|
| 89 |
+
- optimizer=adamw
|
| 90 |
+
- model=forgetting_transformer
|
| 91 |
+
- data_dir=/workspace/forgetting-transformer/data
|
| 92 |
+
- fabric.devices=1
|
| 93 |
+
- fabric.precision=16-mixed
|
| 94 |
+
- seed=42
|
| 95 |
+
- exp=forgetting_gate_2_4_256
|
| 96 |
+
- tag=forgetting_gate_2_4_256
|
| 97 |
+
- output_dir=./forgetting_gate_2_4_256/
|
| 98 |
+
- wandb.log_dir=./output/wandb
|
| 99 |
+
- wandb.mode=online
|
| 100 |
+
- resume=false
|
| 101 |
+
job:
|
| 102 |
+
name: train
|
| 103 |
+
chdir: null
|
| 104 |
+
override_dirname: +experiment/pile/forgetting_transformer=forgetting_gate_2_4_256,data_dir=/workspace/forgetting-transformer/data,datamodule=npy,exp=forgetting_gate_2_4_256,fabric.devices=1,fabric.precision=16-mixed,model=forgetting_transformer,optimizer=adamw,output_dir=./forgetting_gate_2_4_256/,resume=false,schedule=warmup_cosine,seed=42,strategy=single_device,tag=forgetting_gate_2_4_256,wandb.log_dir=./output/wandb,wandb.mode=online
|
| 105 |
+
id: ???
|
| 106 |
+
num: ???
|
| 107 |
+
config_name: config
|
| 108 |
+
env_set: {}
|
| 109 |
+
env_copy: []
|
| 110 |
+
config:
|
| 111 |
+
override_dirname:
|
| 112 |
+
kv_sep: '='
|
| 113 |
+
item_sep: ','
|
| 114 |
+
exclude_keys: []
|
| 115 |
+
runtime:
|
| 116 |
+
version: 1.3.2
|
| 117 |
+
version_base: '1.3'
|
| 118 |
+
cwd: /workspace/forgetting-transformer
|
| 119 |
+
config_sources:
|
| 120 |
+
- path: hydra.conf
|
| 121 |
+
schema: pkg
|
| 122 |
+
provider: hydra
|
| 123 |
+
- path: /workspace/forgetting-transformer/configs
|
| 124 |
+
schema: file
|
| 125 |
+
provider: main
|
| 126 |
+
- path: ''
|
| 127 |
+
schema: structured
|
| 128 |
+
provider: schema
|
| 129 |
+
output_dir: /workspace/forgetting-transformer/forgetting_gate_2_4_256
|
| 130 |
+
choices:
|
| 131 |
+
experiment/pile/forgetting_transformer: forgetting_gate_2_4_256
|
| 132 |
+
strategy: single_device
|
| 133 |
+
datamodule: npy
|
| 134 |
+
schedule: warmup_cosine
|
| 135 |
+
optimizer: adamw
|
| 136 |
+
model: forgetting_transformer
|
| 137 |
+
hydra/env: default
|
| 138 |
+
hydra/callbacks: null
|
| 139 |
+
hydra/job_logging: none
|
| 140 |
+
hydra/hydra_logging: none
|
| 141 |
+
hydra/hydra_help: default
|
| 142 |
+
hydra/help: default
|
| 143 |
+
hydra/sweeper: basic
|
| 144 |
+
hydra/launcher: basic
|
| 145 |
+
hydra/output: default
|
| 146 |
+
verbose: false
|
.hydra/overrides.yaml
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- +experiment/pile/forgetting_transformer=forgetting_gate_2_4_256
|
| 2 |
+
- strategy=single_device
|
| 3 |
+
- datamodule=npy
|
| 4 |
+
- schedule=warmup_cosine
|
| 5 |
+
- optimizer=adamw
|
| 6 |
+
- model=forgetting_transformer
|
| 7 |
+
- data_dir=/workspace/forgetting-transformer/data
|
| 8 |
+
- fabric.devices=1
|
| 9 |
+
- fabric.precision=16-mixed
|
| 10 |
+
- seed=42
|
| 11 |
+
- exp=forgetting_gate_2_4_256
|
| 12 |
+
- tag=forgetting_gate_2_4_256
|
| 13 |
+
- output_dir=./forgetting_gate_2_4_256/
|
| 14 |
+
- wandb.log_dir=./output/wandb
|
| 15 |
+
- wandb.mode=online
|
| 16 |
+
- resume=false
|
__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# for HF remote code
|
__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (549 Bytes). View file
|
|
|
__pycache__/configuration_transformer.cpython-310.pyc
ADDED
|
Binary file (1.99 kB). View file
|
|
|
__pycache__/modeling_transformer.cpython-310.pyc
ADDED
|
Binary file (15.2 kB). View file
|
|
|
checkpoints/step-000000209715200.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:236c34edce0f811562e4cd1b530249c5cf4d9fdb512e24b0b97bf259dbc1998e
|
| 3 |
+
size 329435138
|
checkpoints/step-000000209715200.pt.done
ADDED
|
File without changes
|
checkpoints/step-000000209715200.pt.keep
ADDED
|
File without changes
|
checkpoints/step-000000419430400.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5df232ebcfcd25a99dcd36e6570530238148ed72180e864a4ead2122836f2ed
|
| 3 |
+
size 329435138
|
checkpoints/step-000000419430400.pt.done
ADDED
|
File without changes
|
checkpoints/step-000000419430400.pt.keep
ADDED
|
File without changes
|
checkpoints/step-000000629145600.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:207706e754abd9271e6622c358b1a436015fd842dce3c12e836cb277ae7c08c1
|
| 3 |
+
size 329435138
|
checkpoints/step-000000629145600.pt.done
ADDED
|
File without changes
|
checkpoints/step-000000629145600.pt.keep
ADDED
|
File without changes
|
checkpoints/step-000000838860800.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e48c509f4a5f1b180e709297ef3e0588d0ba83b6e6d386b55467f408f491dec6
|
| 3 |
+
size 329435138
|
checkpoints/step-000000838860800.pt.done
ADDED
|
File without changes
|
checkpoints/step-000000838860800.pt.keep
ADDED
|
File without changes
|
checkpoints/step-000001048576000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3ce74d91ba505dca661cfacf0585880e17a1d9490fee076fb64c72348ddd1a65
|
| 3 |
+
size 329435138
|
checkpoints/step-000001048576000.pt.done
ADDED
|
File without changes
|
checkpoints/step-000001048576000.pt.keep
ADDED
|
File without changes
|
checkpoints/step-000001258291200.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:75567783b278f9fa3d3cc02888962f3f373895ab53304e35338a83bda3aa6460
|
| 3 |
+
size 329435138
|
checkpoints/step-000001258291200.pt.done
ADDED
|
File without changes
|
checkpoints/step-000001258291200.pt.keep
ADDED
|
File without changes
|
checkpoints/step-000001468006400.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d9038cf9d54a8c8003344e57e6cf85164e59317b0a0a1411e2cac8bd6dab4169
|
| 3 |
+
size 329435138
|
checkpoints/step-000001468006400.pt.done
ADDED
|
File without changes
|
checkpoints/step-000001468006400.pt.keep
ADDED
|
File without changes
|
checkpoints/step-000001677721600.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1f044b1ff3f9e99f65fcfcda0cc2239765b4e1d71f92ba0aa50b7b0473030001
|
| 3 |
+
size 329435138
|
checkpoints/step-000001677721600.pt.done
ADDED
|
File without changes
|
checkpoints/step-000001677721600.pt.keep
ADDED
|
File without changes
|
checkpoints/step-000001887436800.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b8dde4cb869035248b40f163b7e3ad4ba9b683e1f6432ff1a706b42bf1e173c3
|
| 3 |
+
size 329435138
|
checkpoints/step-000001887436800.pt.done
ADDED
|
File without changes
|
checkpoints/step-000001887436800.pt.keep
ADDED
|
File without changes
|
config.yaml
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model:
|
| 2 |
+
_target_: forgetting_transformer.model.forgetting_transformer.modeling_forgetting_transformer.ForgettingTransformerForCausalLM
|
| 3 |
+
config:
|
| 4 |
+
_target_: forgetting_transformer.model.forgetting_transformer.configuration_forgetting_transformer.ForgettingTransformerConfig
|
| 5 |
+
vocab_size: ???
|
| 6 |
+
hidden_size: 256
|
| 7 |
+
hidden_ratio: 4.0
|
| 8 |
+
intermediate_size: null
|
| 9 |
+
num_hidden_layers: 2
|
| 10 |
+
num_heads: 4
|
| 11 |
+
num_kv_heads: null
|
| 12 |
+
hidden_act: swish
|
| 13 |
+
window_size: null
|
| 14 |
+
max_position_embeddings: null
|
| 15 |
+
initializer_range: 0.02
|
| 16 |
+
elementwise_affine: true
|
| 17 |
+
norm_eps: 1.0e-06
|
| 18 |
+
use_cache: true
|
| 19 |
+
pad_token_id: null
|
| 20 |
+
bos_token_id: null
|
| 21 |
+
eos_token_id: null
|
| 22 |
+
tie_word_embeddings: false
|
| 23 |
+
attention_bias: false
|
| 24 |
+
fuse_norm: true
|
| 25 |
+
fuse_cross_entropy: true
|
| 26 |
+
rope_base: 500000.0
|
| 27 |
+
use_rope: false
|
| 28 |
+
use_output_gate: false
|
| 29 |
+
ogate_act: sigmoid
|
| 30 |
+
fgate_type: full
|
| 31 |
+
fgate_bias_init: false
|
| 32 |
+
decay_time_min: null
|
| 33 |
+
decay_time_max: null
|
| 34 |
+
use_output_norm: false
|
| 35 |
+
qk_norm: false
|
| 36 |
+
qk_norm_share_param_across_head: false
|
| 37 |
+
use_k_shift: false
|
| 38 |
+
use_v_shift: false
|
| 39 |
+
optimizer:
|
| 40 |
+
_target_: torch.optim.AdamW
|
| 41 |
+
lr: 0.001
|
| 42 |
+
betas:
|
| 43 |
+
- 0.9
|
| 44 |
+
- 0.95
|
| 45 |
+
weight_decay: 0.1
|
| 46 |
+
schedule:
|
| 47 |
+
_target_: forgetting_transformer.schedule.warmup_cosine_decay_schedule
|
| 48 |
+
init_value: 0.0
|
| 49 |
+
peak_value: 0.001
|
| 50 |
+
warmup_steps: 20971520
|
| 51 |
+
decay_steps: 2097152000
|
| 52 |
+
end_value: 0.0
|
| 53 |
+
datamodule:
|
| 54 |
+
_target_: forgetting_transformer.datamodule.npy.NpyDataModule
|
| 55 |
+
data_path: /workspace/forgetting-transformer/data
|
| 56 |
+
rank: ???
|
| 57 |
+
world_size: ???
|
| 58 |
+
train_batch_len: 2048
|
| 59 |
+
train_batch_size: 1024
|
| 60 |
+
train_num_workers: 0
|
| 61 |
+
eval_tokens: 2147483648
|
| 62 |
+
eval_batch_len: 2048
|
| 63 |
+
eval_local_batch_size: 1
|
| 64 |
+
eval_num_workers: 0
|
| 65 |
+
strategy:
|
| 66 |
+
_target_: lightning.fabric.strategies.SingleDeviceStrategy
|
| 67 |
+
device: cuda:0
|
| 68 |
+
exp: forgetting_gate_2_4_256
|
| 69 |
+
tag: forgetting_gate_2_4_256
|
| 70 |
+
seed: 42
|
| 71 |
+
hf_load_dir: null
|
| 72 |
+
hf_save_dir: null
|
| 73 |
+
hf_load_step: null
|
| 74 |
+
output_dir: /workspace/forgetting-transformer/forgetting_gate_2_4_256
|
| 75 |
+
data_dir: /workspace/forgetting-transformer/data
|
| 76 |
+
resume: false
|
| 77 |
+
fork_dir: null
|
| 78 |
+
fork_step: null
|
| 79 |
+
log_interval: 20971520
|
| 80 |
+
eval_interval: 41943040
|
| 81 |
+
final_eval: true
|
| 82 |
+
skip_eval: false
|
| 83 |
+
checkpoint_interval: 209715200
|
| 84 |
+
train_eval_interval: 104857600
|
| 85 |
+
checkpoint_keep_interval: 209715200
|
| 86 |
+
fabric:
|
| 87 |
+
devices: 1
|
| 88 |
+
precision: 16-mixed
|
| 89 |
+
train:
|
| 90 |
+
max_tokens: 2097152000
|
| 91 |
+
grad_acc_tokens: 32768
|
| 92 |
+
max_grad_norm: 1.0
|
| 93 |
+
gradient_checkpointing: true
|
| 94 |
+
bias_weight_decay: false
|
| 95 |
+
normalization_weight_decay: false
|
| 96 |
+
conv_weight_decay: true
|
| 97 |
+
eval:
|
| 98 |
+
min_val_length: 512
|
| 99 |
+
wandb:
|
| 100 |
+
project: forgetting-transformer
|
| 101 |
+
mode: online
|
| 102 |
+
log_dir: ./output/wandb
|
configuration_transformer.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from typing import Optional
|
| 4 |
+
|
| 5 |
+
from transformers.configuration_utils import PretrainedConfig
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class TransformerConfig(PretrainedConfig):
    """Configuration for the ``transformer-project_fox`` model type.

    Every constructor argument except the token ids and
    ``tie_word_embeddings`` is stored unchanged as an attribute of the same
    name. The token ids, ``tie_word_embeddings`` and any extra ``**kwargs``
    are forwarded to ``PretrainedConfig.__init__``.

    The meaning of each hyperparameter is defined by the modeling code that
    consumes this config, which is not part of this file; the notes below
    that go beyond "stored as-is" are assumptions to confirm there.

    Args:
        vocab_size: Stored as-is (presumably the embedding table size).
        hidden_size: Stored as-is (presumably the model width).
        hidden_ratio: Stored as-is; may be ``None``.
        intermediate_size: Stored as-is; may be ``None``
            (NOTE(review): likely derived from ``hidden_ratio`` by the
            model when unset -- confirm in the modeling code).
        num_hidden_layers: Stored as-is.
        num_heads: Stored as-is.
        num_kv_heads: Stored as-is; defaults to ``None``.
        hidden_act: Stored as-is; name of the activation function.
        window_size: Stored as-is; may be ``None``.
        max_position_embeddings: Stored as-is.
        initializer_range: Stored as-is.
        elementwise_affine: Stored as-is.
        norm_eps: Stored as-is.
        use_cache: Stored as-is.
        pad_token_id: Forwarded to ``PretrainedConfig``; defaults to ``None``.
        bos_token_id: Forwarded to ``PretrainedConfig``.
        eos_token_id: Forwarded to ``PretrainedConfig``.
        tie_word_embeddings: Forwarded to ``PretrainedConfig``.
        attention_bias: Stored as-is.
        fuse_norm: Stored as-is.
        fuse_cross_entropy: Stored as-is.
        rope_base: Stored as-is.
        use_rope: Stored as-is.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = 'transformer-project_fox'
    keys_to_ignore_at_inference = ['past_key_values']

    def __init__(
        self,
        vocab_size: int = 32000,
        hidden_size: int = 2048,
        hidden_ratio: Optional[int] = 4,
        intermediate_size: Optional[int] = None,
        num_hidden_layers: int = 24,
        num_heads: int = 32,
        num_kv_heads: Optional[int] = None,
        hidden_act: str = "swish",
        window_size: Optional[int] = None,
        max_position_embeddings: int = 2048,
        initializer_range: float = 0.02,
        elementwise_affine: Optional[bool] = True,
        norm_eps: float = 1e-6,
        use_cache: bool = True,
        pad_token_id: Optional[int] = None,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        tie_word_embeddings: bool = False,
        attention_bias: bool = False,
        fuse_norm: bool = True,
        fuse_cross_entropy: bool = True,
        rope_base: float = 500000.0,
        use_rope: bool = True,
        **kwargs,
    ):
        # Model-specific attributes are assigned before the base-class
        # constructor runs, in the same order as the original file.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.hidden_ratio = hidden_ratio
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.window_size = window_size
        self.max_position_embeddings = max_position_embeddings

        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.elementwise_affine = elementwise_affine
        self.norm_eps = norm_eps
        self.use_cache = use_cache
        self.attention_bias = attention_bias
        self.fuse_cross_entropy = fuse_cross_entropy
        self.fuse_norm = fuse_norm
        self.rope_base = rope_base
        self.use_rope = use_rope

        # Token ids and embedding tying are handled by the base class.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
|
decay_params.txt
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_forward_module.model.embeddings.weight
|
| 2 |
+
_forward_module.model.layers.0.attn.q_proj.weight
|
| 3 |
+
_forward_module.model.layers.0.attn.k_proj.weight
|
| 4 |
+
_forward_module.model.layers.0.attn.v_proj.weight
|
| 5 |
+
_forward_module.model.layers.0.attn.o_proj.weight
|
| 6 |
+
_forward_module.model.layers.0.attn.fgate_proj.weight
|
| 7 |
+
_forward_module.model.layers.0.mlp.gate_proj.weight
|
| 8 |
+
_forward_module.model.layers.0.mlp.down_proj.weight
|
| 9 |
+
_forward_module.model.layers.1.attn.q_proj.weight
|
| 10 |
+
_forward_module.model.layers.1.attn.k_proj.weight
|
| 11 |
+
_forward_module.model.layers.1.attn.v_proj.weight
|
| 12 |
+
_forward_module.model.layers.1.attn.o_proj.weight
|
| 13 |
+
_forward_module.model.layers.1.attn.fgate_proj.weight
|
| 14 |
+
_forward_module.model.layers.1.mlp.gate_proj.weight
|
| 15 |
+
_forward_module.model.layers.1.mlp.down_proj.weight
|
| 16 |
+
_forward_module.lm_head.weight
|
logs/2025-10-13_04-19-28.log
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2025-10-13 04:19:29][train:372][INFO] All outputs will be saved to `/workspace/forgetting-transformer/forgetting_gate_2_4_256`
|
| 2 |
+
[2025-10-13 04:19:29][train:375][INFO] Configuration:
|
| 3 |
+
[2025-10-13 04:19:29][train:380][INFO] Configuration saved to /workspace/forgetting-transformer/forgetting_gate_2_4_256/config.yaml.
|
| 4 |
+
[2025-10-13 04:19:29][train:387][INFO] creating datamodule
|
| 5 |
+
[2025-10-13 04:19:29][train:419][INFO] creating model
|
| 6 |
+
[2025-10-13 04:19:29][train:440][INFO] creating optimizer
|
| 7 |
+
[2025-10-13 04:19:29][checkpoint:39][INFO] Not resuming. Deleting existing checkpoints...
|
| 8 |
+
[2025-10-13 04:19:29][logger:256][INFO] Setting up wandb logger...
|
| 9 |
+
[2025-10-13 04:19:29][logger:272][INFO] Not resuming. Creating a new wandb run.
|
| 10 |
+
[2025-10-13 04:19:30][logger:288][INFO] wandb initialized. Run id: 0m6wmz8p
|
| 11 |
+
[2025-10-13 04:19:30][logger:186][INFO] Setting up jsonlines logger...
|
| 12 |
+
[2025-10-13 04:19:30][logger:113][INFO] Setting up npz logger...
|
| 13 |
+
[2025-10-13 04:19:30][logger:171][INFO] [step: 0] [train_data_info/vocab_size: 50277] [train_data_info/global_tokens_per_batch: 2097152] [train_data_info/local_tokens_per_batch: 2097152] [train_data_info/batch_len: 2048] [train_data_info/seq_len: 2048] [train_data_info/total_tokens: 2055208960] [train_data_info/global_batch_size: 1024] [train_data_info/local_batch_size: 1024]
|
| 14 |
+
[2025-10-13 04:19:30][logger:171][INFO] [step: 0] [val_data_info/vocab_size: 50277] [val_data_info/global_tokens_per_batch: 2048] [val_data_info/local_tokens_per_batch: 2048] [val_data_info/batch_len: 2048] [val_data_info/seq_len: 2048] [val_data_info/total_tokens: 2147483648] [val_data_info/global_batch_size: 1] [val_data_info/local_batch_size: 1]
|
| 15 |
+
[2025-10-13 04:19:30][logger:171][INFO] [step: 0] [model_info/total_params: 27449096] [model_info/trainable_params: 27449096] [model_info/embedding_params: 12870912] [model_info/flops_per_token: 0] [model_info/non_embedding_params: 14578184]
|
| 16 |
+
[2025-10-13 04:20:12][utils:57][INFO] [P: 1.00%] [S: 20971520/2097152000] [T: 0:00:41] [ETA: 1:09:15] [loss: 9.770] [tokens/s: 591486.463] [batches/s: 0.282] [MFU: 0.000] [TFLOPS: 0.000]
|
| 17 |
+
[2025-10-13 04:20:47][utils:57][INFO] [P: 2.00%] [S: 41943040/2097152000] [T: 0:01:17] [ETA: 1:03:10] [loss: 8.079] [tokens/s: 592065.333] [batches/s: 0.282] [MFU: 0.000] [TFLOPS: 0.000]
|
| 18 |
+
[2025-10-13 04:20:47][train:194][INFO] Running validation...
|
| 19 |
+
[2025-10-13 04:22:03][logger:171][INFO] [step: 41943040] [val/train_token_count: 41943040] [val/train_batch_count: 20] [val/train_flop_count: 0] [val/train_total_time: 77.367] [val/train_update_time: 76.988] [val/loss: 7.967] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 75.470] [val/val_tokens_per_second: 542734.371] [val/loss_avg_len_2048: 7.967] [val/perplexity_len_2048: 2885.363] [val/loss_avg_len_1024: 7.966] [val/perplexity_len_1024: 2880.986] [val/loss_avg_len_512: 7.966] [val/perplexity_len_512: 2881.145]
|
| 20 |
+
[2025-10-13 04:22:38][utils:57][INFO] [P: 3.00%] [S: 62914560/2097152000] [T: 0:03:08] [ETA: 1:41:26] [loss: 7.445] [tokens/s: 341356.838] [batches/s: 0.163] [MFU: 0.000] [TFLOPS: 0.000]
|
| 21 |
+
[2025-10-13 04:23:13][utils:57][INFO] [P: 4.00%] [S: 83886080/2097152000] [T: 0:03:43] [ETA: 1:29:26] [loss: 7.106] [tokens/s: 382999.413] [batches/s: 0.183] [MFU: 0.000] [TFLOPS: 0.000]
|
| 22 |
+
[2025-10-13 04:23:13][train:194][INFO] Running validation...
|
| 23 |
+
[2025-10-13 04:24:28][logger:171][INFO] [step: 83886080] [val/train_token_count: 83886080] [val/train_batch_count: 40] [val/train_flop_count: 0] [val/train_total_time: 223.616] [val/train_update_time: 147.446] [val/loss: 7.078] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.524] [val/val_tokens_per_second: 549622.163] [val/loss_avg_len_2048: 7.078] [val/perplexity_len_2048: 1185.598] [val/loss_avg_len_1024: 7.077] [val/perplexity_len_1024: 1184.110] [val/loss_avg_len_512: 7.078] [val/perplexity_len_512: 1185.392]
|
| 24 |
+
[2025-10-13 04:25:03][utils:57][INFO] [P: 5.00%] [S: 104857600/2097152000] [T: 0:05:33] [ETA: 1:45:36] [loss: 6.857] [tokens/s: 317696.511] [batches/s: 0.151] [MFU: 0.000] [TFLOPS: 0.000]
|
| 25 |
+
[2025-10-13 04:25:03][logger:171][INFO] [step: 104857600] [train_eval/train_token_count: 104857600] [train_eval/train_batch_count: 50] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 333.522] [train_eval/train_update_time: 182.681] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 8.219] [train_eval/perplexity_len_2048: 3709.494] [train_eval/loss_avg_len_1024: 8.219] [train_eval/perplexity_len_1024: 3710.807] [train_eval/loss_avg_len_512: 8.219] [train_eval/perplexity_len_512: 3709.141]
|
| 26 |
+
[2025-10-13 04:25:39][utils:57][INFO] [P: 6.00%] [S: 125829120/2097152000] [T: 0:06:08] [ETA: 1:36:19] [loss: 6.597] [tokens/s: 344814.904] [batches/s: 0.164] [MFU: 0.000] [TFLOPS: 0.000]
|
| 27 |
+
[2025-10-13 04:25:39][train:194][INFO] Running validation...
|
| 28 |
+
[2025-10-13 04:26:53][logger:171][INFO] [step: 125829120] [val/train_token_count: 125829120] [val/train_batch_count: 60] [val/train_flop_count: 0] [val/train_total_time: 368.903] [val/train_update_time: 217.910] [val/loss: 6.593] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.573] [val/val_tokens_per_second: 549260.699] [val/loss_avg_len_2048: 6.593] [val/perplexity_len_2048: 730.224] [val/loss_avg_len_1024: 6.592] [val/perplexity_len_1024: 729.509] [val/loss_avg_len_512: 6.595] [val/perplexity_len_512: 731.117]
|
| 29 |
+
[2025-10-13 04:27:29][utils:57][INFO] [P: 7.00%] [S: 146800640/2097152000] [T: 0:07:58] [ETA: 1:46:02] [loss: 6.423] [tokens/s: 308671.490] [batches/s: 0.147] [MFU: 0.000] [TFLOPS: 0.000]
|
| 30 |
+
[2025-10-13 04:28:04][utils:57][INFO] [P: 8.00%] [S: 167772160/2097152000] [T: 0:08:34] [ETA: 1:38:33] [loss: 6.240] [tokens/s: 328605.049] [batches/s: 0.157] [MFU: 0.000] [TFLOPS: 0.000]
|
| 31 |
+
[2025-10-13 04:28:04][train:194][INFO] Running validation...
|
| 32 |
+
[2025-10-13 04:29:18][logger:171][INFO] [step: 167772160] [val/train_token_count: 167772160] [val/train_batch_count: 80] [val/train_flop_count: 0] [val/train_total_time: 514.244] [val/train_update_time: 288.362] [val/loss: 6.212] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.470] [val/val_tokens_per_second: 550023.731] [val/loss_avg_len_2048: 6.212] [val/perplexity_len_2048: 498.778] [val/loss_avg_len_1024: 6.212] [val/perplexity_len_1024: 498.754] [val/loss_avg_len_512: 6.216] [val/perplexity_len_512: 500.573]
|
| 33 |
+
[2025-10-13 04:29:54][utils:57][INFO] [P: 9.00%] [S: 188743680/2097152000] [T: 0:10:24] [ETA: 1:45:10] [loss: 6.051] [tokens/s: 303973.820] [batches/s: 0.145] [MFU: 0.000] [TFLOPS: 0.000]
|
| 34 |
+
[2025-10-13 04:30:29][utils:57][INFO] [P: 10.00%] [S: 209715200/2097152000] [T: 0:10:59] [ETA: 1:38:55] [loss: 5.919] [tokens/s: 319710.612] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000]
|
| 35 |
+
[2025-10-13 04:30:29][logger:171][INFO] [step: 209715200] [train_eval/train_token_count: 209715200] [train_eval/train_batch_count: 100] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 659.461] [train_eval/train_update_time: 358.814] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.337] [train_eval/perplexity_len_2048: 565.198] [train_eval/loss_avg_len_1024: 6.339] [train_eval/perplexity_len_1024: 566.417] [train_eval/loss_avg_len_512: 6.342] [train_eval/perplexity_len_512: 567.789]
|
| 36 |
+
[2025-10-13 04:30:29][train:194][INFO] Running validation...
|
| 37 |
+
[2025-10-13 04:31:44][logger:171][INFO] [step: 209715200] [val/train_token_count: 209715200] [val/train_batch_count: 100] [val/train_flop_count: 0] [val/train_total_time: 659.461] [val/train_update_time: 358.814] [val/loss: 5.906] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.661] [val/val_tokens_per_second: 548616.239] [val/loss_avg_len_2048: 5.906] [val/perplexity_len_2048: 367.361] [val/loss_avg_len_1024: 5.908] [val/perplexity_len_1024: 367.914] [val/loss_avg_len_512: 5.914] [val/perplexity_len_512: 370.216]
|
| 38 |
+
[2025-10-13 04:31:44][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000000209715200.pt...
|
| 39 |
+
[2025-10-13 04:31:44][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000000209715200.pt.
|
| 40 |
+
[2025-10-13 04:31:44][logger:171][INFO] [step: 209715200] [checkpoint/checkpoint_time: 0.638]
|
| 41 |
+
[2025-10-13 04:32:20][utils:57][INFO] [P: 11.00%] [S: 230686720/2097152000] [T: 0:12:50] [ETA: 1:43:51] [loss: 5.785] [tokens/s: 286519.111] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 42 |
+
[2025-10-13 04:32:55][utils:57][INFO] [P: 12.00%] [S: 251658240/2097152000] [T: 0:13:25] [ETA: 1:38:27] [loss: 5.645] [tokens/s: 319846.018] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000]
|
| 43 |
+
[2025-10-13 04:32:55][train:194][INFO] Running validation...
|
| 44 |
+
[2025-10-13 04:34:10][logger:171][INFO] [step: 251658240] [val/train_token_count: 251658240] [val/train_batch_count: 120] [val/train_flop_count: 0] [val/train_total_time: 805.506] [val/train_update_time: 429.270] [val/loss: 5.654] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 75.211] [val/val_tokens_per_second: 544602.455] [val/loss_avg_len_2048: 5.654] [val/perplexity_len_2048: 285.535] [val/loss_avg_len_1024: 5.657] [val/perplexity_len_1024: 286.418] [val/loss_avg_len_512: 5.666] [val/perplexity_len_512: 288.923]
|
| 45 |
+
[2025-10-13 04:34:46][utils:57][INFO] [P: 13.00%] [S: 272629760/2097152000] [T: 0:15:16] [ETA: 1:42:10] [loss: 5.578] [tokens/s: 286634.761] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 46 |
+
[2025-10-13 04:35:21][utils:57][INFO] [P: 14.00%] [S: 293601280/2097152000] [T: 0:15:51] [ETA: 1:37:24] [loss: 5.487] [tokens/s: 319513.522] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000]
|
| 47 |
+
[2025-10-13 04:35:21][train:194][INFO] Running validation...
|
| 48 |
+
[2025-10-13 04:36:36][logger:171][INFO] [step: 293601280] [val/train_token_count: 293601280] [val/train_batch_count: 140] [val/train_flop_count: 0] [val/train_total_time: 951.478] [val/train_update_time: 499.734] [val/loss: 5.471] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.619] [val/val_tokens_per_second: 548918.360] [val/loss_avg_len_2048: 5.471] [val/perplexity_len_2048: 237.717] [val/loss_avg_len_1024: 5.476] [val/perplexity_len_1024: 238.770] [val/loss_avg_len_512: 5.486] [val/perplexity_len_512: 241.317]
|
| 49 |
+
[2025-10-13 04:37:11][utils:57][INFO] [P: 15.00%] [S: 314572800/2097152000] [T: 0:17:41] [ETA: 1:40:15] [loss: 5.374] [tokens/s: 286597.035] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 50 |
+
[2025-10-13 04:37:11][logger:171][INFO] [step: 314572800] [train_eval/train_token_count: 314572800] [train_eval/train_batch_count: 150] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1061.491] [train_eval/train_update_time: 534.962] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.623] [train_eval/perplexity_len_2048: 276.745] [train_eval/loss_avg_len_1024: 5.627] [train_eval/perplexity_len_1024: 277.857] [train_eval/loss_avg_len_512: 5.635] [train_eval/perplexity_len_512: 280.049]
|
| 51 |
+
[2025-10-13 04:37:47][utils:57][INFO] [P: 16.00%] [S: 335544320/2097152000] [T: 0:18:16] [ETA: 1:35:58] [loss: 5.322] [tokens/s: 319485.024] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000]
|
| 52 |
+
[2025-10-13 04:37:47][train:194][INFO] Running validation...
|
| 53 |
+
[2025-10-13 04:39:01][logger:171][INFO] [step: 335544320] [val/train_token_count: 335544320] [val/train_batch_count: 160] [val/train_flop_count: 0] [val/train_total_time: 1096.876] [val/train_update_time: 570.191] [val/loss: 5.306] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.726] [val/val_tokens_per_second: 548136.181] [val/loss_avg_len_2048: 5.306] [val/perplexity_len_2048: 201.613] [val/loss_avg_len_1024: 5.312] [val/perplexity_len_1024: 202.759] [val/loss_avg_len_512: 5.324] [val/perplexity_len_512: 205.275]
|
| 54 |
+
[2025-10-13 04:39:37][utils:57][INFO] [P: 17.00%] [S: 356515840/2097152000] [T: 0:20:06] [ETA: 1:38:12] [loss: 5.217] [tokens/s: 286535.928] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 55 |
+
[2025-10-13 04:40:12][utils:57][INFO] [P: 18.00%] [S: 377487360/2097152000] [T: 0:20:42] [ETA: 1:34:19] [loss: 5.194] [tokens/s: 319360.960] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000]
|
| 56 |
+
[2025-10-13 04:40:12][train:194][INFO] Running validation...
|
| 57 |
+
[2025-10-13 04:41:27][logger:171][INFO] [step: 377487360] [val/train_token_count: 377487360] [val/train_batch_count: 180] [val/train_flop_count: 0] [val/train_total_time: 1242.364] [val/train_update_time: 640.650] [val/loss: 5.185] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 75.069] [val/val_tokens_per_second: 545629.639] [val/loss_avg_len_2048: 5.185] [val/perplexity_len_2048: 178.495] [val/loss_avg_len_1024: 5.191] [val/perplexity_len_1024: 179.712] [val/loss_avg_len_512: 5.206] [val/perplexity_len_512: 182.304]
|
| 58 |
+
[2025-10-13 04:42:03][utils:57][INFO] [P: 19.00%] [S: 398458880/2097152000] [T: 0:22:32] [ETA: 1:36:07] [loss: 5.164] [tokens/s: 286291.562] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 59 |
+
[2025-10-13 04:42:38][utils:57][INFO] [P: 20.00%] [S: 419430400/2097152000] [T: 0:23:08] [ETA: 1:32:32] [loss: 5.069] [tokens/s: 319462.979] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000]
|
| 60 |
+
[2025-10-13 04:42:38][logger:171][INFO] [step: 419430400] [train_eval/train_token_count: 419430400] [train_eval/train_batch_count: 200] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1388.207] [train_eval/train_update_time: 711.111] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.218] [train_eval/perplexity_len_2048: 184.590] [train_eval/loss_avg_len_1024: 5.225] [train_eval/perplexity_len_1024: 185.920] [train_eval/loss_avg_len_512: 5.238] [train_eval/perplexity_len_512: 188.286]
|
| 61 |
+
[2025-10-13 04:42:38][train:194][INFO] Running validation...
|
| 62 |
+
[2025-10-13 04:43:52][logger:171][INFO] [step: 419430400] [val/train_token_count: 419430400] [val/train_batch_count: 200] [val/train_flop_count: 0] [val/train_total_time: 1388.207] [val/train_update_time: 711.111] [val/loss: 5.070] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.477] [val/val_tokens_per_second: 549967.363] [val/loss_avg_len_2048: 5.070] [val/perplexity_len_2048: 159.104] [val/loss_avg_len_1024: 5.078] [val/perplexity_len_1024: 160.392] [val/loss_avg_len_512: 5.094] [val/perplexity_len_512: 163.046]
|
| 63 |
+
[2025-10-13 04:43:52][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000000419430400.pt...
|
| 64 |
+
[2025-10-13 04:43:53][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000000419430400.pt.
|
| 65 |
+
[2025-10-13 04:43:53][logger:171][INFO] [step: 419430400] [checkpoint/checkpoint_time: 0.627]
|
| 66 |
+
[2025-10-13 04:44:28][utils:57][INFO] [P: 21.00%] [S: 440401920/2097152000] [T: 0:24:58] [ETA: 1:33:57] [loss: 5.027] [tokens/s: 286360.798] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 67 |
+
[2025-10-13 04:45:04][utils:57][INFO] [P: 22.00%] [S: 461373440/2097152000] [T: 0:25:34] [ETA: 1:30:39] [loss: 4.985] [tokens/s: 319487.139] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000]
|
| 68 |
+
[2025-10-13 04:45:04][train:194][INFO] Running validation...
|
| 69 |
+
[2025-10-13 04:46:19][logger:171][INFO] [step: 461373440] [val/train_token_count: 461373440] [val/train_batch_count: 220] [val/train_flop_count: 0] [val/train_total_time: 1534.112] [val/train_update_time: 781.583] [val/loss: 4.976] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 75.624] [val/val_tokens_per_second: 541624.450] [val/loss_avg_len_2048: 4.976] [val/perplexity_len_2048: 144.828] [val/loss_avg_len_1024: 4.985] [val/perplexity_len_1024: 146.184] [val/loss_avg_len_512: 5.003] [val/perplexity_len_512: 148.868]
|
| 70 |
+
[2025-10-13 04:46:55][utils:57][INFO] [P: 23.00%] [S: 482344960/2097152000] [T: 0:27:25] [ETA: 1:31:47] [loss: 4.932] [tokens/s: 286176.366] [batches/s: 0.136] [MFU: 0.000] [TFLOPS: 0.000]
|
| 71 |
+
[2025-10-13 04:47:30][utils:57][INFO] [P: 24.00%] [S: 503316480/2097152000] [T: 0:28:00] [ETA: 1:28:41] [loss: 4.875] [tokens/s: 318991.880] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000]
|
| 72 |
+
[2025-10-13 04:47:30][train:194][INFO] Running validation...
|
| 73 |
+
[2025-10-13 04:48:47][logger:171][INFO] [step: 503316480] [val/train_token_count: 503316480] [val/train_batch_count: 240] [val/train_flop_count: 0] [val/train_total_time: 1680.500] [val/train_update_time: 852.097] [val/loss: 4.895] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 77.033] [val/val_tokens_per_second: 531719.492] [val/loss_avg_len_2048: 4.895] [val/perplexity_len_2048: 133.679] [val/loss_avg_len_1024: 4.906] [val/perplexity_len_1024: 135.103] [val/loss_avg_len_512: 4.926] [val/perplexity_len_512: 137.869]
|
| 74 |
+
[2025-10-13 04:49:23][utils:57][INFO] [P: 25.00%] [S: 524288000/2097152000] [T: 0:29:52] [ETA: 1:29:38] [loss: 4.870] [tokens/s: 285233.108] [batches/s: 0.136] [MFU: 0.000] [TFLOPS: 0.000]
|
| 75 |
+
[2025-10-13 04:49:23][logger:171][INFO] [step: 524288000] [train_eval/train_token_count: 524288000] [train_eval/train_batch_count: 250] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1792.928] [train_eval/train_update_time: 887.347] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.962] [train_eval/perplexity_len_2048: 142.942] [train_eval/loss_avg_len_1024: 4.970] [train_eval/perplexity_len_1024: 144.043] [train_eval/loss_avg_len_512: 4.987] [train_eval/perplexity_len_512: 146.503]
|
| 76 |
+
[2025-10-13 04:49:58][utils:57][INFO] [P: 26.00%] [S: 545259520/2097152000] [T: 0:30:28] [ETA: 1:26:43] [loss: 4.817] [tokens/s: 317860.781] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000]
|
| 77 |
+
[2025-10-13 04:49:58][train:194][INFO] Running validation...
|
| 78 |
+
[2025-10-13 04:51:15][logger:171][INFO] [step: 545259520] [val/train_token_count: 545259520] [val/train_batch_count: 260] [val/train_flop_count: 0] [val/train_total_time: 1828.323] [val/train_update_time: 922.602] [val/loss: 4.820] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 76.827] [val/val_tokens_per_second: 533145.204] [val/loss_avg_len_2048: 4.820] [val/perplexity_len_2048: 123.911] [val/loss_avg_len_1024: 4.831] [val/perplexity_len_1024: 125.385] [val/loss_avg_len_512: 4.854] [val/perplexity_len_512: 128.238]
|
| 79 |
+
[2025-10-13 04:51:50][utils:57][INFO] [P: 27.00%] [S: 566231040/2097152000] [T: 0:32:20] [ETA: 1:27:26] [loss: 4.806] [tokens/s: 284395.974] [batches/s: 0.136] [MFU: 0.000] [TFLOPS: 0.000]
|
| 80 |
+
[2025-10-13 04:52:26][utils:57][INFO] [P: 28.00%] [S: 587202560/2097152000] [T: 0:32:55] [ETA: 1:24:41] [loss: 4.739] [tokens/s: 316987.083] [batches/s: 0.151] [MFU: 0.000] [TFLOPS: 0.000]
|
| 81 |
+
[2025-10-13 04:52:26][train:194][INFO] Running validation...
|
| 82 |
+
[2025-10-13 04:53:42][logger:171][INFO] [step: 587202560] [val/train_token_count: 587202560] [val/train_batch_count: 280] [val/train_flop_count: 0] [val/train_total_time: 1975.955] [val/train_update_time: 993.123] [val/loss: 4.752] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 76.377] [val/val_tokens_per_second: 536286.449] [val/loss_avg_len_2048: 4.752] [val/perplexity_len_2048: 115.836] [val/loss_avg_len_1024: 4.766] [val/perplexity_len_1024: 117.399] [val/loss_avg_len_512: 4.791] [val/perplexity_len_512: 120.371]
|
| 83 |
+
[2025-10-13 04:54:17][utils:57][INFO] [P: 29.00%] [S: 608174080/2097152000] [T: 0:34:47] [ETA: 1:25:11] [loss: 4.711] [tokens/s: 283879.059] [batches/s: 0.135] [MFU: 0.000] [TFLOPS: 0.000]
|
| 84 |
+
[2025-10-13 04:54:53][utils:57][INFO] [P: 30.00%] [S: 629145600/2097152000] [T: 0:35:23] [ETA: 1:22:33] [loss: 4.675] [tokens/s: 316371.005] [batches/s: 0.151] [MFU: 0.000] [TFLOPS: 0.000]
|
| 85 |
+
[2025-10-13 04:54:53][logger:171][INFO] [step: 629145600] [train_eval/train_token_count: 629145600] [train_eval/train_batch_count: 300] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2123.110] [train_eval/train_update_time: 1063.612] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.779] [train_eval/perplexity_len_2048: 118.961] [train_eval/loss_avg_len_1024: 4.789] [train_eval/perplexity_len_1024: 120.142] [train_eval/loss_avg_len_512: 4.811] [train_eval/perplexity_len_512: 122.795]
|
| 86 |
+
[2025-10-13 04:54:53][train:194][INFO] Running validation...
|
| 87 |
+
[2025-10-13 04:56:08][logger:171][INFO] [step: 629145600] [val/train_token_count: 629145600] [val/train_batch_count: 300] [val/train_flop_count: 0] [val/train_total_time: 2123.110] [val/train_update_time: 1063.612] [val/loss: 4.689] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.720] [val/val_tokens_per_second: 548182.963] [val/loss_avg_len_2048: 4.689] [val/perplexity_len_2048: 108.789] [val/loss_avg_len_1024: 4.705] [val/perplexity_len_1024: 110.479] [val/loss_avg_len_512: 4.733] [val/perplexity_len_512: 113.677]
|
| 88 |
+
[2025-10-13 04:56:08][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000000629145600.pt...
|
| 89 |
+
[2025-10-13 04:56:08][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000000629145600.pt.
|
| 90 |
+
[2025-10-13 04:56:08][logger:171][INFO] [step: 629145600] [checkpoint/checkpoint_time: 0.611]
|
| 91 |
+
[2025-10-13 04:56:44][utils:57][INFO] [P: 31.00%] [S: 650117120/2097152000] [T: 0:37:13] [ETA: 1:22:52] [loss: 4.688] [tokens/s: 283793.798] [batches/s: 0.135] [MFU: 0.000] [TFLOPS: 0.000]
|
| 92 |
+
[2025-10-13 04:57:19][utils:57][INFO] [P: 32.00%] [S: 671088640/2097152000] [T: 0:37:49] [ETA: 1:20:22] [loss: 4.614] [tokens/s: 316521.289] [batches/s: 0.151] [MFU: 0.000] [TFLOPS: 0.000]
|
| 93 |
+
[2025-10-13 04:57:19][train:194][INFO] Running validation...
|
| 94 |
+
[2025-10-13 04:58:34][logger:171][INFO] [step: 671088640] [val/train_token_count: 671088640] [val/train_batch_count: 320] [val/train_flop_count: 0] [val/train_total_time: 2269.221] [val/train_update_time: 1134.096] [val/loss: 4.627] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.625] [val/val_tokens_per_second: 548879.706] [val/loss_avg_len_2048: 4.627] [val/perplexity_len_2048: 102.171] [val/loss_avg_len_1024: 4.645] [val/perplexity_len_1024: 104.022] [val/loss_avg_len_512: 4.677] [val/perplexity_len_512: 107.467]
|
| 95 |
+
[2025-10-13 04:59:09][utils:57][INFO] [P: 33.00%] [S: 692060160/2097152000] [T: 0:39:39] [ETA: 1:20:30] [loss: 4.624] [tokens/s: 284187.048] [batches/s: 0.136] [MFU: 0.000] [TFLOPS: 0.000]
|
| 96 |
+
[2025-10-13 04:59:44][utils:57][INFO] [P: 34.00%] [S: 713031680/2097152000] [T: 0:40:14] [ETA: 1:18:07] [loss: 4.571] [tokens/s: 317673.799] [batches/s: 0.151] [MFU: 0.000] [TFLOPS: 0.000]
|
| 97 |
+
[2025-10-13 04:59:44][train:194][INFO] Running validation...
|
| 98 |
+
[2025-10-13 05:01:00][logger:171][INFO] [step: 713031680] [val/train_token_count: 713031680] [val/train_batch_count: 340] [val/train_flop_count: 0] [val/train_total_time: 2414.640] [val/train_update_time: 1204.582] [val/loss: 4.561] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 75.265] [val/val_tokens_per_second: 544207.657] [val/loss_avg_len_2048: 4.561] [val/perplexity_len_2048: 95.712] [val/loss_avg_len_1024: 4.583] [val/perplexity_len_1024: 97.802] [val/loss_avg_len_512: 4.621] [val/perplexity_len_512: 101.627]
|
| 99 |
+
[2025-10-13 05:01:35][utils:57][INFO] [P: 35.00%] [S: 734003200/2097152000] [T: 0:42:05] [ETA: 1:18:09] [loss: 4.534] [tokens/s: 284873.417] [batches/s: 0.136] [MFU: 0.000] [TFLOPS: 0.000]
|
| 100 |
+
[2025-10-13 05:01:35][logger:171][INFO] [step: 734003200] [train_eval/train_token_count: 734003200] [train_eval/train_batch_count: 350] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2525.280] [train_eval/train_update_time: 1239.831] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.612] [train_eval/perplexity_len_2048: 100.698] [train_eval/loss_avg_len_1024: 4.630] [train_eval/perplexity_len_1024: 102.494] [train_eval/loss_avg_len_512: 4.663] [train_eval/perplexity_len_512: 105.901]
|
| 101 |
+
[2025-10-13 05:02:10][utils:57][INFO] [P: 36.00%] [S: 754974720/2097152000] [T: 0:42:40] [ETA: 1:15:52] [loss: 4.454] [tokens/s: 318435.270] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000]
|
| 102 |
+
[2025-10-13 05:02:10][train:194][INFO] Running validation...
|
| 103 |
+
[2025-10-13 05:03:25][logger:171][INFO] [step: 754974720] [val/train_token_count: 754974720] [val/train_batch_count: 360] [val/train_flop_count: 0] [val/train_total_time: 2560.697] [val/train_update_time: 1275.092] [val/loss: 4.491] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.286] [val/val_tokens_per_second: 551381.465] [val/loss_avg_len_2048: 4.491] [val/perplexity_len_2048: 89.249] [val/loss_avg_len_1024: 4.518] [val/perplexity_len_1024: 91.636] [val/loss_avg_len_512: 4.563] [val/perplexity_len_512: 95.851]
|
| 104 |
+
[2025-10-13 05:04:00][utils:57][INFO] [P: 37.00%] [S: 775946240/2097152000] [T: 0:44:30] [ETA: 1:15:46] [loss: 4.472] [tokens/s: 285864.892] [batches/s: 0.136] [MFU: 0.000] [TFLOPS: 0.000]
|
| 105 |
+
[2025-10-13 05:04:35][utils:57][INFO] [P: 38.00%] [S: 796917760/2097152000] [T: 0:45:05] [ETA: 1:13:34] [loss: 4.423] [tokens/s: 319466.094] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000]
|
| 106 |
+
[2025-10-13 05:04:35][train:194][INFO] Running validation...
|
| 107 |
+
[2025-10-13 05:05:50][logger:171][INFO] [step: 796917760] [val/train_token_count: 796917760] [val/train_batch_count: 380] [val/train_flop_count: 0] [val/train_total_time: 2705.771] [val/train_update_time: 1345.541] [val/loss: 4.431] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.472] [val/val_tokens_per_second: 550003.693] [val/loss_avg_len_2048: 4.431] [val/perplexity_len_2048: 84.013] [val/loss_avg_len_1024: 4.462] [val/perplexity_len_1024: 86.673] [val/loss_avg_len_512: 4.513] [val/perplexity_len_512: 91.239]
|
| 108 |
+
[2025-10-13 05:06:25][utils:57][INFO] [P: 39.00%] [S: 817889280/2097152000] [T: 0:46:55] [ETA: 1:13:23] [loss: 4.414] [tokens/s: 286621.674] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 109 |
+
[2025-10-13 05:07:01][utils:57][INFO] [P: 40.00%] [S: 838860800/2097152000] [T: 0:47:31] [ETA: 1:11:16] [loss: 4.290] [tokens/s: 319887.675] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000]
|
| 110 |
+
[2025-10-13 05:07:01][logger:171][INFO] [step: 838860800] [train_eval/train_token_count: 838860800] [train_eval/train_batch_count: 400] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2851.025] [train_eval/train_update_time: 1415.981] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.448] [train_eval/perplexity_len_2048: 85.459] [train_eval/loss_avg_len_1024: 4.474] [train_eval/perplexity_len_1024: 87.732] [train_eval/loss_avg_len_512: 4.523] [train_eval/perplexity_len_512: 92.139]
|
| 111 |
+
[2025-10-13 05:07:01][train:194][INFO] Running validation...
|
| 112 |
+
[2025-10-13 05:08:15][logger:171][INFO] [step: 838860800] [val/train_token_count: 838860800] [val/train_batch_count: 400] [val/train_flop_count: 0] [val/train_total_time: 2851.025] [val/train_update_time: 1415.981] [val/loss: 4.366] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.159] [val/val_tokens_per_second: 552326.836] [val/loss_avg_len_2048: 4.366] [val/perplexity_len_2048: 78.753] [val/loss_avg_len_1024: 4.403] [val/perplexity_len_1024: 81.672] [val/loss_avg_len_512: 4.461] [val/perplexity_len_512: 86.590]
|
| 113 |
+
[2025-10-13 05:08:15][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000000838860800.pt...
|
| 114 |
+
[2025-10-13 05:08:16][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000000838860800.pt.
|
| 115 |
+
[2025-10-13 05:08:16][logger:171][INFO] [step: 838860800] [checkpoint/checkpoint_time: 0.666]
|
| 116 |
+
[2025-10-13 05:08:51][utils:57][INFO] [P: 41.00%] [S: 859832320/2097152000] [T: 0:49:21] [ETA: 1:11:01] [loss: 4.317] [tokens/s: 286820.189] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 117 |
+
[2025-10-13 05:09:26][utils:57][INFO] [P: 42.00%] [S: 880803840/2097152000] [T: 0:49:56] [ETA: 1:08:58] [loss: 4.294] [tokens/s: 319788.327] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000]
|
| 118 |
+
[2025-10-13 05:09:26][train:194][INFO] Running validation...
|
| 119 |
+
[2025-10-13 05:10:42][logger:171][INFO] [step: 880803840] [val/train_token_count: 880803840] [val/train_batch_count: 420] [val/train_flop_count: 0] [val/train_total_time: 2996.631] [val/train_update_time: 1486.429] [val/loss: 4.308] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 75.161] [val/val_tokens_per_second: 544965.835] [val/loss_avg_len_2048: 4.308] [val/perplexity_len_2048: 74.259] [val/loss_avg_len_1024: 4.348] [val/perplexity_len_1024: 77.353] [val/loss_avg_len_512: 4.412] [val/perplexity_len_512: 82.458]
|
| 120 |
+
[2025-10-13 05:11:17][utils:57][INFO] [P: 43.00%] [S: 901775360/2097152000] [T: 0:51:47] [ETA: 1:08:38] [loss: 4.289] [tokens/s: 286605.159] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 121 |
+
[2025-10-13 05:11:52][utils:57][INFO] [P: 44.00%] [S: 922746880/2097152000] [T: 0:52:22] [ETA: 1:06:39] [loss: 4.305] [tokens/s: 319833.539] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000]
|
| 122 |
+
[2025-10-13 05:11:52][train:194][INFO] Running validation...
|
| 123 |
+
[2025-10-13 05:13:07][logger:171][INFO] [step: 922746880] [val/train_token_count: 922746880] [val/train_batch_count: 440] [val/train_flop_count: 0] [val/train_total_time: 3142.596] [val/train_update_time: 1556.887] [val/loss: 4.261] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.317] [val/val_tokens_per_second: 551150.681] [val/loss_avg_len_2048: 4.261] [val/perplexity_len_2048: 70.871] [val/loss_avg_len_1024: 4.305] [val/perplexity_len_1024: 74.083] [val/loss_avg_len_512: 4.374] [val/perplexity_len_512: 79.348]
|
| 124 |
+
[2025-10-13 05:13:42][utils:57][INFO] [P: 45.00%] [S: 943718400/2097152000] [T: 0:54:12] [ETA: 1:06:15] [loss: 4.239] [tokens/s: 286965.718] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 125 |
+
[2025-10-13 05:13:42][logger:171][INFO] [step: 943718400] [train_eval/train_token_count: 943718400] [train_eval/train_batch_count: 450] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3252.316] [train_eval/train_update_time: 1592.112] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.299] [train_eval/perplexity_len_2048: 73.599] [train_eval/loss_avg_len_1024: 4.339] [train_eval/perplexity_len_1024: 76.608] [train_eval/loss_avg_len_512: 4.404] [train_eval/perplexity_len_512: 81.781]
|
| 126 |
+
[2025-10-13 05:14:17][utils:57][INFO] [P: 46.00%] [S: 964689920/2097152000] [T: 0:54:47] [ETA: 1:04:19] [loss: 4.203] [tokens/s: 319821.510] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000]
|
| 127 |
+
[2025-10-13 05:14:17][train:194][INFO] Running validation...
|
| 128 |
+
[2025-10-13 05:15:32][logger:171][INFO] [step: 964689920] [val/train_token_count: 964689920] [val/train_batch_count: 460] [val/train_flop_count: 0] [val/train_total_time: 3287.701] [val/train_update_time: 1627.339] [val/loss: 4.223] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.425] [val/val_tokens_per_second: 550355.646] [val/loss_avg_len_2048: 4.223] [val/perplexity_len_2048: 68.256] [val/loss_avg_len_1024: 4.271] [val/perplexity_len_1024: 71.620] [val/loss_avg_len_512: 4.344] [val/perplexity_len_512: 76.978]
|
| 129 |
+
[2025-10-13 05:16:07][utils:57][INFO] [P: 47.00%] [S: 985661440/2097152000] [T: 0:56:37] [ETA: 1:03:51] [loss: 4.192] [tokens/s: 286927.909] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 130 |
+
[2025-10-13 05:16:43][utils:57][INFO] [P: 48.00%] [S: 1006632960/2097152000] [T: 0:57:12] [ETA: 1:01:58] [loss: 4.181] [tokens/s: 319855.031] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000]
|
| 131 |
+
[2025-10-13 05:16:43][train:194][INFO] Running validation...
|
| 132 |
+
[2025-10-13 05:17:57][logger:171][INFO] [step: 1006632960] [val/train_token_count: 1006632960] [val/train_batch_count: 480] [val/train_flop_count: 0] [val/train_total_time: 3432.890] [val/train_update_time: 1697.783] [val/loss: 4.185] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.375] [val/val_tokens_per_second: 550725.509] [val/loss_avg_len_2048: 4.185] [val/perplexity_len_2048: 65.697] [val/loss_avg_len_1024: 4.235] [val/perplexity_len_1024: 69.081] [val/loss_avg_len_512: 4.310] [val/perplexity_len_512: 74.471]
|
| 133 |
+
[2025-10-13 05:18:32][utils:57][INFO] [P: 49.00%] [S: 1027604480/2097152000] [T: 0:59:02] [ETA: 1:01:27] [loss: 4.173] [tokens/s: 286974.547] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 134 |
+
[2025-10-13 05:19:08][utils:57][INFO] [P: 50.00%] [S: 1048576000/2097152000] [T: 0:59:38] [ETA: 0:59:38] [loss: 4.142] [tokens/s: 320079.252] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000]
|
| 135 |
+
[2025-10-13 05:19:08][logger:171][INFO] [step: 1048576000] [train_eval/train_token_count: 1048576000] [train_eval/train_batch_count: 500] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3578.041] [train_eval/train_update_time: 1768.242] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.191] [train_eval/perplexity_len_2048: 66.116] [train_eval/loss_avg_len_1024: 4.237] [train_eval/perplexity_len_1024: 69.179] [train_eval/loss_avg_len_512: 4.311] [train_eval/perplexity_len_512: 74.482]
|
| 136 |
+
[2025-10-13 05:19:08][train:194][INFO] Running validation...
|
| 137 |
+
[2025-10-13 05:20:22][logger:171][INFO] [step: 1048576000] [val/train_token_count: 1048576000] [val/train_batch_count: 500] [val/train_flop_count: 0] [val/train_total_time: 3578.041] [val/train_update_time: 1768.242] [val/loss: 4.149] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.343] [val/val_tokens_per_second: 550959.342] [val/loss_avg_len_2048: 4.149] [val/perplexity_len_2048: 63.383] [val/loss_avg_len_1024: 4.201] [val/perplexity_len_1024: 66.786] [val/loss_avg_len_512: 4.279] [val/perplexity_len_512: 72.190]
|
| 138 |
+
[2025-10-13 05:20:22][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000001048576000.pt...
|
| 139 |
+
[2025-10-13 05:20:23][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000001048576000.pt.
|
| 140 |
+
[2025-10-13 05:20:23][logger:171][INFO] [step: 1048576000] [checkpoint/checkpoint_time: 0.766]
|
| 141 |
+
[2025-10-13 05:20:58][utils:57][INFO] [P: 51.00%] [S: 1069547520/2097152000] [T: 1:01:28] [ETA: 0:59:03] [loss: 4.121] [tokens/s: 286864.354] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 142 |
+
[2025-10-13 05:21:34][utils:57][INFO] [P: 52.00%] [S: 1090519040/2097152000] [T: 1:02:03] [ETA: 0:57:17] [loss: 4.120] [tokens/s: 320109.733] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000]
|
| 143 |
+
[2025-10-13 05:21:34][train:194][INFO] Running validation...
|
| 144 |
+
[2025-10-13 05:22:48][logger:171][INFO] [step: 1090519040] [val/train_token_count: 1090519040] [val/train_batch_count: 520] [val/train_flop_count: 0] [val/train_total_time: 3723.923] [val/train_update_time: 1838.693] [val/loss: 4.129] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.465] [val/val_tokens_per_second: 550059.590] [val/loss_avg_len_2048: 4.129] [val/perplexity_len_2048: 62.096] [val/loss_avg_len_1024: 4.183] [val/perplexity_len_1024: 65.589] [val/loss_avg_len_512: 4.264] [val/perplexity_len_512: 71.120]
|
| 145 |
+
[2025-10-13 05:23:24][utils:57][INFO] [P: 53.00%] [S: 1111490560/2097152000] [T: 1:03:53] [ETA: 0:56:39] [loss: 4.105] [tokens/s: 287144.886] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 146 |
+
[2025-10-13 05:23:59][utils:57][INFO] [P: 54.00%] [S: 1132462080/2097152000] [T: 1:04:29] [ETA: 0:54:55] [loss: 4.105] [tokens/s: 320047.173] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000]
|
| 147 |
+
[2025-10-13 05:23:59][train:194][INFO] Running validation...
|
| 148 |
+
[2025-10-13 05:25:13][logger:171][INFO] [step: 1132462080] [val/train_token_count: 1132462080] [val/train_batch_count: 540] [val/train_flop_count: 0] [val/train_total_time: 3869.169] [val/train_update_time: 1909.147] [val/loss: 4.098] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.465] [val/val_tokens_per_second: 550059.656] [val/loss_avg_len_2048: 4.098] [val/perplexity_len_2048: 60.230] [val/loss_avg_len_1024: 4.154] [val/perplexity_len_1024: 63.666] [val/loss_avg_len_512: 4.235] [val/perplexity_len_512: 69.090]
|
| 149 |
+
[2025-10-13 05:25:49][utils:57][INFO] [P: 55.00%] [S: 1153433600/2097152000] [T: 1:06:19] [ETA: 0:54:15] [loss: 4.049] [tokens/s: 287099.609] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 150 |
+
[2025-10-13 05:25:49][logger:171][INFO] [step: 1153433600] [train_eval/train_token_count: 1153433600] [train_eval/train_batch_count: 550] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3979.018] [train_eval/train_update_time: 1944.378] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.112] [train_eval/perplexity_len_2048: 61.077] [train_eval/loss_avg_len_1024: 4.162] [train_eval/perplexity_len_1024: 64.171] [train_eval/loss_avg_len_512: 4.239] [train_eval/perplexity_len_512: 69.355]
|
| 151 |
+
[2025-10-13 05:26:24][utils:57][INFO] [P: 56.00%] [S: 1174405120/2097152000] [T: 1:06:54] [ETA: 0:52:34] [loss: 4.094] [tokens/s: 320036.846] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000]
|
| 152 |
+
[2025-10-13 05:26:24][train:194][INFO] Running validation...
|
| 153 |
+
[2025-10-13 05:27:39][logger:171][INFO] [step: 1174405120] [val/train_token_count: 1174405120] [val/train_batch_count: 560] [val/train_flop_count: 0] [val/train_total_time: 4014.402] [val/train_update_time: 1979.612] [val/loss: 4.073] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.795] [val/val_tokens_per_second: 547632.517] [val/loss_avg_len_2048: 4.073] [val/perplexity_len_2048: 58.717] [val/loss_avg_len_1024: 4.129] [val/perplexity_len_1024: 62.139] [val/loss_avg_len_512: 4.212] [val/perplexity_len_512: 67.491]
|
| 154 |
+
[2025-10-13 05:28:14][utils:57][INFO] [P: 57.00%] [S: 1195376640/2097152000] [T: 1:08:44] [ETA: 0:51:51] [loss: 4.016] [tokens/s: 286946.236] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 155 |
+
[2025-10-13 05:28:50][utils:57][INFO] [P: 58.00%] [S: 1216348160/2097152000] [T: 1:09:19] [ETA: 0:50:12] [loss: 4.078] [tokens/s: 319822.627] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000]
|
| 156 |
+
[2025-10-13 05:28:50][train:194][INFO] Running validation...
|
| 157 |
+
[2025-10-13 05:30:04][logger:171][INFO] [step: 1216348160] [val/train_token_count: 1216348160] [val/train_batch_count: 580] [val/train_flop_count: 0] [val/train_total_time: 4159.974] [val/train_update_time: 2050.064] [val/loss: 4.054] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.715] [val/val_tokens_per_second: 548218.992] [val/loss_avg_len_2048: 4.054] [val/perplexity_len_2048: 57.623] [val/loss_avg_len_1024: 4.111] [val/perplexity_len_1024: 61.029] [val/loss_avg_len_512: 4.195] [val/perplexity_len_512: 66.360]
|
| 158 |
+
[2025-10-13 05:30:40][utils:57][INFO] [P: 59.00%] [S: 1237319680/2097152000] [T: 1:11:10] [ETA: 0:49:27] [loss: 4.082] [tokens/s: 286811.315] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 159 |
+
[2025-10-13 05:31:15][utils:57][INFO] [P: 60.00%] [S: 1258291200/2097152000] [T: 1:11:45] [ETA: 0:47:50] [loss: 4.078] [tokens/s: 320020.010] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000]
|
| 160 |
+
[2025-10-13 05:31:15][logger:171][INFO] [step: 1258291200] [train_eval/train_token_count: 1258291200] [train_eval/train_batch_count: 600] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4305.465] [train_eval/train_update_time: 2120.524] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.053] [train_eval/perplexity_len_2048: 57.598] [train_eval/loss_avg_len_1024: 4.104] [train_eval/perplexity_len_1024: 60.585] [train_eval/loss_avg_len_512: 4.187] [train_eval/perplexity_len_512: 65.821]
|
| 161 |
+
[2025-10-13 05:31:15][train:194][INFO] Running validation...
|
| 162 |
+
[2025-10-13 05:32:30][logger:171][INFO] [step: 1258291200] [val/train_token_count: 1258291200] [val/train_batch_count: 600] [val/train_flop_count: 0] [val/train_total_time: 4305.465] [val/train_update_time: 2120.524] [val/loss: 4.036] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.652] [val/val_tokens_per_second: 548676.153] [val/loss_avg_len_2048: 4.036] [val/perplexity_len_2048: 56.618] [val/loss_avg_len_1024: 4.095] [val/perplexity_len_1024: 60.022] [val/loss_avg_len_512: 4.180] [val/perplexity_len_512: 65.344]
|
| 163 |
+
[2025-10-13 05:32:30][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000001258291200.pt...
|
| 164 |
+
[2025-10-13 05:32:31][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000001258291200.pt.
|
| 165 |
+
[2025-10-13 05:32:31][logger:171][INFO] [step: 1258291200] [checkpoint/checkpoint_time: 0.753]
|
| 166 |
+
[2025-10-13 05:33:06][utils:57][INFO] [P: 61.00%] [S: 1279262720/2097152000] [T: 1:13:36] [ETA: 0:47:03] [loss: 4.035] [tokens/s: 286692.593] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 167 |
+
[2025-10-13 05:33:41][utils:57][INFO] [P: 62.00%] [S: 1300234240/2097152000] [T: 1:14:11] [ETA: 0:45:28] [loss: 4.013] [tokens/s: 319554.068] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000]
|
| 168 |
+
[2025-10-13 05:33:41][train:194][INFO] Running validation...
|
| 169 |
+
[2025-10-13 05:34:56][logger:171][INFO] [step: 1300234240] [val/train_token_count: 1300234240] [val/train_batch_count: 620] [val/train_flop_count: 0] [val/train_total_time: 4451.645] [val/train_update_time: 2190.979] [val/loss: 4.019] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.978] [val/val_tokens_per_second: 546294.032] [val/loss_avg_len_2048: 4.019] [val/perplexity_len_2048: 55.629] [val/loss_avg_len_1024: 4.078] [val/perplexity_len_1024: 59.009] [val/loss_avg_len_512: 4.163] [val/perplexity_len_512: 64.282]
|
| 170 |
+
[2025-10-13 05:35:32][utils:57][INFO] [P: 63.00%] [S: 1321205760/2097152000] [T: 1:16:02] [ETA: 0:44:39] [loss: 4.033] [tokens/s: 286491.488] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 171 |
+
[2025-10-13 05:36:07][utils:57][INFO] [P: 64.00%] [S: 1342177280/2097152000] [T: 1:16:37] [ETA: 0:43:06] [loss: 4.017] [tokens/s: 319301.824] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000]
|
| 172 |
+
[2025-10-13 05:36:07][train:194][INFO] Running validation...
|
| 173 |
+
[2025-10-13 05:37:22][logger:171][INFO] [step: 1342177280] [val/train_token_count: 1342177280] [val/train_batch_count: 640] [val/train_flop_count: 0] [val/train_total_time: 4597.406] [val/train_update_time: 2261.444] [val/loss: 4.007] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 75.170] [val/val_tokens_per_second: 544901.226] [val/loss_avg_len_2048: 4.007] [val/perplexity_len_2048: 54.968] [val/loss_avg_len_1024: 4.068] [val/perplexity_len_1024: 58.421] [val/loss_avg_len_512: 4.155] [val/perplexity_len_512: 63.732]
|
| 174 |
+
[2025-10-13 05:37:58][utils:57][INFO] [P: 65.00%] [S: 1363148800/2097152000] [T: 1:18:27] [ETA: 0:42:15] [loss: 3.986] [tokens/s: 286210.855] [batches/s: 0.136] [MFU: 0.000] [TFLOPS: 0.000]
|
| 175 |
+
[2025-10-13 05:37:58][logger:171][INFO] [step: 1363148800] [train_eval/train_token_count: 1363148800] [train_eval/train_batch_count: 650] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4707.963] [train_eval/train_update_time: 2296.670] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.014] [train_eval/perplexity_len_2048: 55.371] [train_eval/loss_avg_len_1024: 4.071] [train_eval/perplexity_len_1024: 58.641] [train_eval/loss_avg_len_512: 4.157] [train_eval/perplexity_len_512: 63.850]
|
| 176 |
+
[2025-10-13 05:38:33][utils:57][INFO] [P: 66.00%] [S: 1384120320/2097152000] [T: 1:19:03] [ETA: 0:40:43] [loss: 4.021] [tokens/s: 319110.520] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000]
|
| 177 |
+
[2025-10-13 05:38:33][train:194][INFO] Running validation...
|
| 178 |
+
[2025-10-13 05:39:48][logger:171][INFO] [step: 1384120320] [val/train_token_count: 1384120320] [val/train_batch_count: 660] [val/train_flop_count: 0] [val/train_total_time: 4743.360] [val/train_update_time: 2331.894] [val/loss: 3.992] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.694] [val/val_tokens_per_second: 548367.491] [val/loss_avg_len_2048: 3.992] [val/perplexity_len_2048: 54.190] [val/loss_avg_len_1024: 4.053] [val/perplexity_len_1024: 57.572] [val/loss_avg_len_512: 4.140] [val/perplexity_len_512: 62.823]
|
| 179 |
+
[2025-10-13 05:40:23][utils:57][INFO] [P: 67.00%] [S: 1405091840/2097152000] [T: 1:20:53] [ETA: 0:39:50] [loss: 3.965] [tokens/s: 286250.134] [batches/s: 0.136] [MFU: 0.000] [TFLOPS: 0.000]
|
| 180 |
+
[2025-10-13 05:40:59][utils:57][INFO] [P: 68.00%] [S: 1426063360/2097152000] [T: 1:21:28] [ETA: 0:38:20] [loss: 3.980] [tokens/s: 319120.496] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000]
|
| 181 |
+
[2025-10-13 05:40:59][train:194][INFO] Running validation...
|
| 182 |
+
[2025-10-13 05:42:13][logger:171][INFO] [step: 1426063360] [val/train_token_count: 1426063360] [val/train_batch_count: 680] [val/train_flop_count: 0] [val/train_total_time: 4888.829] [val/train_update_time: 2402.345] [val/loss: 3.982] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.634] [val/val_tokens_per_second: 548810.253] [val/loss_avg_len_2048: 3.982] [val/perplexity_len_2048: 53.631] [val/loss_avg_len_1024: 4.043] [val/perplexity_len_1024: 56.991] [val/loss_avg_len_512: 4.130] [val/perplexity_len_512: 62.207]
|
| 183 |
+
[2025-10-13 05:42:49][utils:57][INFO] [P: 69.00%] [S: 1447034880/2097152000] [T: 1:23:18] [ETA: 0:37:25] [loss: 3.989] [tokens/s: 286277.791] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 184 |
+
[2025-10-13 05:43:24][utils:57][INFO] [P: 70.00%] [S: 1468006400/2097152000] [T: 1:23:54] [ETA: 0:35:57] [loss: 3.984] [tokens/s: 319504.392] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000]
|
| 185 |
+
[2025-10-13 05:43:24][logger:171][INFO] [step: 1468006400] [train_eval/train_token_count: 1468006400] [train_eval/train_batch_count: 700] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5034.234] [train_eval/train_update_time: 2472.807] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.982] [train_eval/perplexity_len_2048: 53.604] [train_eval/loss_avg_len_1024: 4.040] [train_eval/perplexity_len_1024: 56.830] [train_eval/loss_avg_len_512: 4.128] [train_eval/perplexity_len_512: 62.023]
|
| 186 |
+
[2025-10-13 05:43:24][train:194][INFO] Running validation...
|
| 187 |
+
[2025-10-13 05:44:38][logger:171][INFO] [step: 1468006400] [val/train_token_count: 1468006400] [val/train_batch_count: 700] [val/train_flop_count: 0] [val/train_total_time: 5034.234] [val/train_update_time: 2472.807] [val/loss: 3.972] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.000] [val/val_tokens_per_second: 553510.662] [val/loss_avg_len_2048: 3.972] [val/perplexity_len_2048: 53.094] [val/loss_avg_len_1024: 4.034] [val/perplexity_len_1024: 56.502] [val/loss_avg_len_512: 4.123] [val/perplexity_len_512: 61.738]
|
| 188 |
+
[2025-10-13 05:44:38][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000001468006400.pt...
|
| 189 |
+
[2025-10-13 05:44:39][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000001468006400.pt.
|
| 190 |
+
[2025-10-13 05:44:39][logger:171][INFO] [step: 1468006400] [checkpoint/checkpoint_time: 0.740]
|
| 191 |
+
[2025-10-13 05:45:14][utils:57][INFO] [P: 71.00%] [S: 1488977920/2097152000] [T: 1:25:44] [ETA: 0:35:01] [loss: 3.991] [tokens/s: 286543.113] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 192 |
+
[2025-10-13 05:45:49][utils:57][INFO] [P: 72.00%] [S: 1509949440/2097152000] [T: 1:26:19] [ETA: 0:33:34] [loss: 3.960] [tokens/s: 319622.722] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000]
|
| 193 |
+
[2025-10-13 05:45:49][train:194][INFO] Running validation...
|
| 194 |
+
[2025-10-13 05:47:04][logger:171][INFO] [step: 1509949440] [val/train_token_count: 1509949440] [val/train_batch_count: 720] [val/train_flop_count: 0] [val/train_total_time: 5179.742] [val/train_update_time: 2543.273] [val/loss: 3.963] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.037] [val/val_tokens_per_second: 553237.635] [val/loss_avg_len_2048: 3.963] [val/perplexity_len_2048: 52.619] [val/loss_avg_len_1024: 4.025] [val/perplexity_len_1024: 55.989] [val/loss_avg_len_512: 4.114] [val/perplexity_len_512: 61.183]
|
| 195 |
+
[2025-10-13 05:47:39][utils:57][INFO] [P: 73.00%] [S: 1530920960/2097152000] [T: 1:28:09] [ETA: 0:32:36] [loss: 3.984] [tokens/s: 286928.205] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 196 |
+
[2025-10-13 05:48:14][utils:57][INFO] [P: 74.00%] [S: 1551892480/2097152000] [T: 1:28:44] [ETA: 0:31:10] [loss: 3.956] [tokens/s: 320199.551] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000]
|
| 197 |
+
[2025-10-13 05:48:14][train:194][INFO] Running validation...
|
| 198 |
+
[2025-10-13 05:49:29][logger:171][INFO] [step: 1551892480] [val/train_token_count: 1551892480] [val/train_batch_count: 740] [val/train_flop_count: 0] [val/train_total_time: 5324.525] [val/train_update_time: 2613.729] [val/loss: 3.956] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.740] [val/val_tokens_per_second: 548032.886] [val/loss_avg_len_2048: 3.956] [val/perplexity_len_2048: 52.244] [val/loss_avg_len_1024: 4.018] [val/perplexity_len_1024: 55.585] [val/loss_avg_len_512: 4.107] [val/perplexity_len_512: 60.747]
|
| 199 |
+
[2025-10-13 05:50:04][utils:57][INFO] [P: 75.00%] [S: 1572864000/2097152000] [T: 1:30:34] [ETA: 0:30:11] [loss: 3.952] [tokens/s: 287106.576] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 200 |
+
[2025-10-13 05:50:04][logger:171][INFO] [step: 1572864000] [train_eval/train_token_count: 1572864000] [train_eval/train_batch_count: 750] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5434.649] [train_eval/train_update_time: 2648.955] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.958] [train_eval/perplexity_len_2048: 52.350] [train_eval/loss_avg_len_1024: 4.017] [train_eval/perplexity_len_1024: 55.539] [train_eval/loss_avg_len_512: 4.106] [train_eval/perplexity_len_512: 60.687]
|
| 201 |
+
[2025-10-13 05:50:40][utils:57][INFO] [P: 76.00%] [S: 1593835520/2097152000] [T: 1:31:10] [ETA: 0:28:47] [loss: 3.908] [tokens/s: 320185.929] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000]
|
| 202 |
+
[2025-10-13 05:50:40][train:194][INFO] Running validation...
|
| 203 |
+
[2025-10-13 05:51:54][logger:171][INFO] [step: 1593835520] [val/train_token_count: 1593835520] [val/train_batch_count: 760] [val/train_flop_count: 0] [val/train_total_time: 5470.030] [val/train_update_time: 2684.182] [val/loss: 3.948] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.592] [val/val_tokens_per_second: 549120.775] [val/loss_avg_len_2048: 3.948] [val/perplexity_len_2048: 51.845] [val/loss_avg_len_1024: 4.011] [val/perplexity_len_1024: 55.196] [val/loss_avg_len_512: 4.100] [val/perplexity_len_512: 60.359]
|
| 204 |
+
[2025-10-13 05:52:30][utils:57][INFO] [P: 77.00%] [S: 1614807040/2097152000] [T: 1:32:59] [ETA: 0:27:46] [loss: 3.980] [tokens/s: 287156.125] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 205 |
+
[2025-10-13 05:53:05][utils:57][INFO] [P: 78.00%] [S: 1635778560/2097152000] [T: 1:33:35] [ETA: 0:26:23] [loss: 3.917] [tokens/s: 320216.572] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000]
|
| 206 |
+
[2025-10-13 05:53:05][train:194][INFO] Running validation...
|
| 207 |
+
[2025-10-13 05:54:19][logger:171][INFO] [step: 1635778560] [val/train_token_count: 1635778560] [val/train_batch_count: 780] [val/train_flop_count: 0] [val/train_total_time: 5615.375] [val/train_update_time: 2754.645] [val/loss: 3.942] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.136] [val/val_tokens_per_second: 552495.297] [val/loss_avg_len_2048: 3.942] [val/perplexity_len_2048: 51.530] [val/loss_avg_len_1024: 4.005] [val/perplexity_len_1024: 54.892] [val/loss_avg_len_512: 4.095] [val/perplexity_len_512: 60.056]
|
| 208 |
+
[2025-10-13 05:54:55][utils:57][INFO] [P: 79.00%] [S: 1656750080/2097152000] [T: 1:35:24] [ETA: 0:25:21] [loss: 3.934] [tokens/s: 287362.107] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 209 |
+
[2025-10-13 05:55:30][utils:57][INFO] [P: 80.00%] [S: 1677721600/2097152000] [T: 1:36:00] [ETA: 0:24:00] [loss: 3.919] [tokens/s: 320523.542] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000]
|
| 210 |
+
[2025-10-13 05:55:30][logger:171][INFO] [step: 1677721600] [train_eval/train_token_count: 1677721600] [train_eval/train_batch_count: 800] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5760.270] [train_eval/train_update_time: 2825.100] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.941] [train_eval/perplexity_len_2048: 51.489] [train_eval/loss_avg_len_1024: 4.003] [train_eval/perplexity_len_1024: 54.748] [train_eval/loss_avg_len_512: 4.092] [train_eval/perplexity_len_512: 59.830]
|
| 211 |
+
[2025-10-13 05:55:30][train:194][INFO] Running validation...
|
| 212 |
+
[2025-10-13 05:56:44][logger:171][INFO] [step: 1677721600] [val/train_token_count: 1677721600] [val/train_batch_count: 800] [val/train_flop_count: 0] [val/train_total_time: 5760.270] [val/train_update_time: 2825.100] [val/loss: 3.937] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.038] [val/val_tokens_per_second: 553225.990] [val/loss_avg_len_2048: 3.937] [val/perplexity_len_2048: 51.272] [val/loss_avg_len_1024: 4.001] [val/perplexity_len_1024: 54.634] [val/loss_avg_len_512: 4.091] [val/perplexity_len_512: 59.784]
|
| 213 |
+
[2025-10-13 05:56:44][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000001677721600.pt...
|
| 214 |
+
[2025-10-13 05:56:45][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000001677721600.pt.
|
| 215 |
+
[2025-10-13 05:56:45][logger:171][INFO] [step: 1677721600] [checkpoint/checkpoint_time: 0.735]
|
| 216 |
+
[2025-10-13 05:57:20][utils:57][INFO] [P: 81.00%] [S: 1698693120/2097152000] [T: 1:37:50] [ETA: 0:22:57] [loss: 3.897] [tokens/s: 287360.370] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 217 |
+
[2025-10-13 05:57:56][utils:57][INFO] [P: 82.00%] [S: 1719664640/2097152000] [T: 1:38:25] [ETA: 0:21:36] [loss: 3.908] [tokens/s: 320169.473] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000]
|
| 218 |
+
[2025-10-13 05:57:56][train:194][INFO] Running validation...
|
| 219 |
+
[2025-10-13 05:59:09][logger:171][INFO] [step: 1719664640] [val/train_token_count: 1719664640] [val/train_batch_count: 820] [val/train_flop_count: 0] [val/train_total_time: 5905.784] [val/train_update_time: 2895.551] [val/loss: 3.933] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 73.876] [val/val_tokens_per_second: 554444.243] [val/loss_avg_len_2048: 3.933] [val/perplexity_len_2048: 51.049] [val/loss_avg_len_1024: 3.996] [val/perplexity_len_1024: 54.401] [val/loss_avg_len_512: 4.087] [val/perplexity_len_512: 59.543]
|
| 220 |
+
[2025-10-13 05:59:45][utils:57][INFO] [P: 83.00%] [S: 1740636160/2097152000] [T: 1:40:15] [ETA: 0:20:31] [loss: 3.945] [tokens/s: 287418.718] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 221 |
+
[2025-10-13 06:00:20][utils:57][INFO] [P: 84.00%] [S: 1761607680/2097152000] [T: 1:40:50] [ETA: 0:19:12] [loss: 3.879] [tokens/s: 320586.351] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000]
|
| 222 |
+
[2025-10-13 06:00:20][train:194][INFO] Running validation...
|
| 223 |
+
[2025-10-13 06:01:34][logger:171][INFO] [step: 1761607680] [val/train_token_count: 1761607680] [val/train_batch_count: 840] [val/train_flop_count: 0] [val/train_total_time: 6050.427] [val/train_update_time: 2966.012] [val/loss: 3.929] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 73.964] [val/val_tokens_per_second: 553782.382] [val/loss_avg_len_2048: 3.929] [val/perplexity_len_2048: 50.862] [val/loss_avg_len_1024: 3.993] [val/perplexity_len_1024: 54.210] [val/loss_avg_len_512: 4.083] [val/perplexity_len_512: 59.342]
|
| 224 |
+
[2025-10-13 06:02:09][utils:57][INFO] [P: 85.00%] [S: 1782579200/2097152000] [T: 1:42:39] [ETA: 0:18:07] [loss: 3.958] [tokens/s: 287733.029] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 225 |
+
[2025-10-13 06:02:09][logger:171][INFO] [step: 1782579200] [train_eval/train_token_count: 1782579200] [train_eval/train_batch_count: 850] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6159.759] [train_eval/train_update_time: 3001.234] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.922] [train_eval/perplexity_len_2048: 50.512] [train_eval/loss_avg_len_1024: 3.977] [train_eval/perplexity_len_1024: 53.381] [train_eval/loss_avg_len_512: 4.067] [train_eval/perplexity_len_512: 58.378]
|
| 226 |
+
[2025-10-13 06:02:45][utils:57][INFO] [P: 86.00%] [S: 1803550720/2097152000] [T: 1:43:15] [ETA: 0:16:48] [loss: 3.943] [tokens/s: 320910.573] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000]
|
| 227 |
+
[2025-10-13 06:02:45][train:194][INFO] Running validation...
|
| 228 |
+
[2025-10-13 06:03:59][logger:171][INFO] [step: 1803550720] [val/train_token_count: 1803550720] [val/train_batch_count: 860] [val/train_flop_count: 0] [val/train_total_time: 6195.132] [val/train_update_time: 3036.460] [val/loss: 3.926] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.036] [val/val_tokens_per_second: 553245.430] [val/loss_avg_len_2048: 3.926] [val/perplexity_len_2048: 50.709] [val/loss_avg_len_1024: 3.990] [val/perplexity_len_1024: 54.070] [val/loss_avg_len_512: 4.081] [val/perplexity_len_512: 59.202]
|
| 229 |
+
[2025-10-13 06:04:34][utils:57][INFO] [P: 87.00%] [S: 1824522240/2097152000] [T: 1:45:04] [ETA: 0:15:42] [loss: 3.885] [tokens/s: 287953.382] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 230 |
+
[2025-10-13 06:05:10][utils:57][INFO] [P: 88.00%] [S: 1845493760/2097152000] [T: 1:45:39] [ETA: 0:14:24] [loss: 3.896] [tokens/s: 320956.418] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000]
|
| 231 |
+
[2025-10-13 06:05:10][train:194][INFO] Running validation...
|
| 232 |
+
[2025-10-13 06:06:24][logger:171][INFO] [step: 1845493760] [val/train_token_count: 1845493760] [val/train_batch_count: 880] [val/train_flop_count: 0] [val/train_total_time: 6339.925] [val/train_update_time: 3106.918] [val/loss: 3.924] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.078] [val/val_tokens_per_second: 552927.769] [val/loss_avg_len_2048: 3.924] [val/perplexity_len_2048: 50.581] [val/loss_avg_len_1024: 3.988] [val/perplexity_len_1024: 53.922] [val/loss_avg_len_512: 4.078] [val/perplexity_len_512: 59.038]
|
| 233 |
+
[2025-10-13 06:06:59][utils:57][INFO] [P: 89.00%] [S: 1866465280/2097152000] [T: 1:47:29] [ETA: 0:13:17] [loss: 3.967] [tokens/s: 287977.586] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 234 |
+
[2025-10-13 06:07:34][utils:57][INFO] [P: 90.00%] [S: 1887436800/2097152000] [T: 1:48:04] [ETA: 0:12:00] [loss: 3.874] [tokens/s: 321311.382] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000]
|
| 235 |
+
[2025-10-13 06:07:34][logger:171][INFO] [step: 1887436800] [train_eval/train_token_count: 1887436800] [train_eval/train_batch_count: 900] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6484.748] [train_eval/train_update_time: 3177.374] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.920] [train_eval/perplexity_len_2048: 50.392] [train_eval/loss_avg_len_1024: 3.978] [train_eval/perplexity_len_1024: 53.434] [train_eval/loss_avg_len_512: 4.068] [train_eval/perplexity_len_512: 58.462]
|
| 236 |
+
[2025-10-13 06:07:34][train:194][INFO] Running validation...
|
| 237 |
+
[2025-10-13 06:08:49][logger:171][INFO] [step: 1887436800] [val/train_token_count: 1887436800] [val/train_batch_count: 900] [val/train_flop_count: 0] [val/train_total_time: 6484.748] [val/train_update_time: 3177.374] [val/loss: 3.922] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.161] [val/val_tokens_per_second: 552311.224] [val/loss_avg_len_2048: 3.922] [val/perplexity_len_2048: 50.491] [val/loss_avg_len_1024: 3.986] [val/perplexity_len_1024: 53.833] [val/loss_avg_len_512: 4.077] [val/perplexity_len_512: 58.949]
|
| 238 |
+
[2025-10-13 06:08:49][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000001887436800.pt...
|
| 239 |
+
[2025-10-13 06:08:49][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000001887436800.pt.
|
| 240 |
+
[2025-10-13 06:08:49][logger:171][INFO] [step: 1887436800] [checkpoint/checkpoint_time: 0.732]
|
| 241 |
+
[2025-10-13 06:09:25][utils:57][INFO] [P: 91.00%] [S: 1908408320/2097152000] [T: 1:49:55] [ETA: 0:10:52] [loss: 3.919] [tokens/s: 287928.669] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 242 |
+
[2025-10-13 06:10:00][utils:57][INFO] [P: 92.00%] [S: 1929379840/2097152000] [T: 1:50:30] [ETA: 0:09:36] [loss: 3.935] [tokens/s: 320793.117] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000]
|
| 243 |
+
[2025-10-13 06:10:00][train:194][INFO] Running validation...
|
| 244 |
+
[2025-10-13 06:11:14][logger:171][INFO] [step: 1929379840] [val/train_token_count: 1929379840] [val/train_batch_count: 920] [val/train_flop_count: 0] [val/train_total_time: 6630.406] [val/train_update_time: 3247.843] [val/loss: 3.920] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.370] [val/val_tokens_per_second: 550762.741] [val/loss_avg_len_2048: 3.920] [val/perplexity_len_2048: 50.421] [val/loss_avg_len_1024: 3.985] [val/perplexity_len_1024: 53.758] [val/loss_avg_len_512: 4.075] [val/perplexity_len_512: 58.867]
|
| 245 |
+
[2025-10-13 06:11:50][utils:57][INFO] [P: 93.00%] [S: 1950351360/2097152000] [T: 1:52:20] [ETA: 0:08:27] [loss: 3.925] [tokens/s: 287726.244] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 246 |
+
[2025-10-13 06:12:25][utils:57][INFO] [P: 94.00%] [S: 1971322880/2097152000] [T: 1:52:55] [ETA: 0:07:12] [loss: 3.888] [tokens/s: 320591.534] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000]
|
| 247 |
+
[2025-10-13 06:12:25][train:194][INFO] Running validation...
|
| 248 |
+
[2025-10-13 06:13:41][logger:171][INFO] [step: 1971322880] [val/train_token_count: 1971322880] [val/train_batch_count: 940] [val/train_flop_count: 0] [val/train_total_time: 6775.544] [val/train_update_time: 3318.317] [val/loss: 3.920] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 75.727] [val/val_tokens_per_second: 540893.042] [val/loss_avg_len_2048: 3.920] [val/perplexity_len_2048: 50.377] [val/loss_avg_len_1024: 3.984] [val/perplexity_len_1024: 53.718] [val/loss_avg_len_512: 4.075] [val/perplexity_len_512: 58.828]
|
| 249 |
+
[2025-10-13 06:14:16][utils:57][INFO] [P: 95.00%] [S: 1992294400/2097152000] [T: 1:54:46] [ETA: 0:06:02] [loss: 3.896] [tokens/s: 287022.299] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 250 |
+
[2025-10-13 06:14:16][logger:171][INFO] [step: 1992294400] [train_eval/train_token_count: 1992294400] [train_eval/train_batch_count: 950] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6886.654] [train_eval/train_update_time: 3353.546] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.909] [train_eval/perplexity_len_2048: 49.853] [train_eval/loss_avg_len_1024: 3.971] [train_eval/perplexity_len_1024: 53.030] [train_eval/loss_avg_len_512: 4.060] [train_eval/perplexity_len_512: 57.986]
|
| 251 |
+
[2025-10-13 06:14:52][utils:57][INFO] [P: 96.00%] [S: 2013265920/2097152000] [T: 1:55:22] [ETA: 0:04:48] [loss: 3.897] [tokens/s: 319745.942] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000]
|
| 252 |
+
[2025-10-13 06:14:52][train:194][INFO] Running validation...
|
| 253 |
+
[2025-10-13 06:16:07][logger:171][INFO] [step: 2013265920] [val/train_token_count: 2013265920] [val/train_batch_count: 960] [val/train_flop_count: 0] [val/train_total_time: 6922.034] [val/train_update_time: 3388.776] [val/loss: 3.919] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.748] [val/val_tokens_per_second: 547974.082] [val/loss_avg_len_2048: 3.919] [val/perplexity_len_2048: 50.352] [val/loss_avg_len_1024: 3.983] [val/perplexity_len_1024: 53.689] [val/loss_avg_len_512: 4.074] [val/perplexity_len_512: 58.794]
|
| 254 |
+
[2025-10-13 06:16:42][utils:57][INFO] [P: 97.00%] [S: 2034237440/2097152000] [T: 1:57:12] [ETA: 0:03:37] [loss: 3.929] [tokens/s: 286734.887] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000]
|
| 255 |
+
[2025-10-13 06:17:17][utils:57][INFO] [P: 98.00%] [S: 2055208960/2097152000] [T: 1:57:47] [ETA: 0:02:24] [loss: 3.899] [tokens/s: 319413.416] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000]
|
| 256 |
+
[2025-10-13 06:17:17][train:194][INFO] Running validation...
|
| 257 |
+
[2025-10-13 06:18:31][logger:171][INFO] [step: 2055208960] [val/train_token_count: 2055208960] [val/train_batch_count: 980] [val/train_flop_count: 0] [val/train_total_time: 7067.550] [val/train_update_time: 3459.240] [val/loss: 3.919] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 73.896] [val/val_tokens_per_second: 554295.052] [val/loss_avg_len_2048: 3.919] [val/perplexity_len_2048: 50.339] [val/loss_avg_len_1024: 3.983] [val/perplexity_len_1024: 53.676] [val/loss_avg_len_512: 4.074] [val/perplexity_len_512: 58.781]
|
| 258 |
+
[2025-10-13 06:18:31][train:854][INFO] Training finished with 2055208960 tokens!
|
metrics/jsonlines/checkpoint.jsonl
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"step": 209715200, "checkpoint/checkpoint_time": 0.6384434180217795}
|
| 2 |
+
{"step": 419430400, "checkpoint/checkpoint_time": 0.626950032019522}
|
| 3 |
+
{"step": 629145600, "checkpoint/checkpoint_time": 0.6112130110268481}
|
| 4 |
+
{"step": 838860800, "checkpoint/checkpoint_time": 0.6657388239982538}
|
| 5 |
+
{"step": 1048576000, "checkpoint/checkpoint_time": 0.7660488990368322}
|
| 6 |
+
{"step": 1258291200, "checkpoint/checkpoint_time": 0.753050519968383}
|
| 7 |
+
{"step": 1468006400, "checkpoint/checkpoint_time": 0.7402905119815841}
|
| 8 |
+
{"step": 1677721600, "checkpoint/checkpoint_time": 0.7347163640079089}
|
| 9 |
+
{"step": 1887436800, "checkpoint/checkpoint_time": 0.7318841179949231}
|
metrics/jsonlines/model_info.jsonl
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 0, "model_info/total_params": 27449096, "model_info/trainable_params": 27449096, "model_info/embedding_params": 12870912, "model_info/flops_per_token": 0, "model_info/non_embedding_params": 14578184}
|
metrics/jsonlines/norm.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
metrics/jsonlines/resume.jsonl
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 0, "resume/resume_step": 0}
|
metrics/jsonlines/throughput.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
metrics/jsonlines/train.jsonl
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 41.97735304199159, "train/update_time": 41.75535514205694, "train/lr": 0.0009000000000000001, "train/loss": 9.76972484588623, "train/global_grad_norm": 1.1996515989303589}
|
| 2 |
+
{"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 77.36710998200579, "train/update_time": 76.98762350907782, "train/lr": 0.0009997960964140947, "train/loss": 8.079014778137207, "train/global_grad_norm": 0.9576020836830139}
|
| 3 |
+
{"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 188.23102539300453, "train/update_time": 112.21908622107003, "train/lr": 0.0009990914580222257, "train/loss": 7.444537162780762, "train/global_grad_norm": 0.42209190130233765}
|
| 4 |
+
{"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 223.6157359869685, "train/update_time": 147.44638093106914, "train/lr": 0.0009978842768382998, "train/loss": 7.105719089508057, "train/global_grad_norm": 0.3969765603542328}
|
| 5 |
+
{"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 333.5220514299581, "train/update_time": 182.68076597800246, "train/lr": 0.0009961757683914405, "train/loss": 6.857090473175049, "train/global_grad_norm": 0.5435984134674072}
|
| 6 |
+
{"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 368.9031989739742, "train/update_time": 217.90960857702885, "train/lr": 0.00099396765300483, "train/loss": 6.596991539001465, "train/global_grad_norm": 0.43834808468818665}
|
| 7 |
+
{"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 478.86176590196555, "train/update_time": 253.13642803102266, "train/lr": 0.0009912621540634887, "train/loss": 6.422722339630127, "train/global_grad_norm": 0.6763378381729126}
|
| 8 |
+
{"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 514.2439901359612, "train/update_time": 288.3617576470133, "train/lr": 0.000988061995775515, "train/loss": 6.240057945251465, "train/global_grad_norm": 0.45933476090431213}
|
| 9 |
+
{"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 624.0890151349595, "train/update_time": 323.589894320874, "train/lr": 0.0009843704004290394, "train/loss": 6.051237106323242, "train/global_grad_norm": 0.360037237405777}
|
| 10 |
+
{"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 659.4609651200008, "train/update_time": 358.814332414011, "train/lr": 0.0009801910851476522, "train/loss": 5.918942928314209, "train/global_grad_norm": 0.62600177526474}
|
| 11 |
+
{"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 770.1425092020072, "train/update_time": 394.0443641850143, "train/lr": 0.0009755282581475768, "train/loss": 5.784972667694092, "train/global_grad_norm": 0.44626691937446594}
|
| 12 |
+
{"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 805.5059107720153, "train/update_time": 429.2700012290734, "train/lr": 0.0009703866145003512, "train/loss": 5.645214557647705, "train/global_grad_norm": 0.4792501628398895}
|
| 13 |
+
{"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 916.1022540619597, "train/update_time": 464.49939665506827, "train/lr": 0.0009647713314052896, "train/loss": 5.578414440155029, "train/global_grad_norm": 0.679843544960022}
|
| 14 |
+
{"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 951.4780202080146, "train/update_time": 499.73381894716294, "train/lr": 0.0009586880629764817, "train/loss": 5.486634254455566, "train/global_grad_norm": 0.6061164736747742}
|
| 15 |
+
{"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 1061.4905991089763, "train/update_time": 534.9619731742423, "train/lr": 0.0009521429345495787, "train/loss": 5.374250411987305, "train/global_grad_norm": 0.7468693256378174}
|
| 16 |
+
{"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 1096.8764924500138, "train/update_time": 570.1910730601521, "train/lr": 0.0009451425365140996, "train/loss": 5.32179069519043, "train/global_grad_norm": 0.4829612672328949}
|
| 17 |
+
{"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 1206.983663285966, "train/update_time": 605.4215191720868, "train/lr": 0.000937693917677468, "train/loss": 5.216552734375, "train/global_grad_norm": 0.40984582901000977}
|
| 18 |
+
{"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 1242.364284639014, "train/update_time": 640.6497024221462, "train/lr": 0.0009298045781674596, "train/loss": 5.194259166717529, "train/global_grad_norm": 0.6644951105117798}
|
| 19 |
+
{"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 1352.8268752049771, "train/update_time": 675.885561926174, "train/lr": 0.0009214824618802108, "train/loss": 5.163832187652588, "train/global_grad_norm": 0.6656083464622498}
|
| 20 |
+
{"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 1388.2071847780026, "train/update_time": 711.1109357241658, "train/lr": 0.000912735948481387, "train/loss": 5.068937301635742, "train/global_grad_norm": 0.4672752618789673}
|
| 21 |
+
{"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 1498.7065829180065, "train/update_time": 746.3382132861298, "train/lr": 0.0009035738449685707, "train/loss": 5.02679443359375, "train/global_grad_norm": 0.6419569253921509}
|
| 22 |
+
{"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 1534.1124866969767, "train/update_time": 781.5830888972268, "train/lr": 0.0008940053768033609, "train/loss": 4.984641075134277, "train/global_grad_norm": 0.486380398273468}
|
| 23 |
+
{"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 1645.136862017971, "train/update_time": 816.84872851416, "train/lr": 0.0008840401786221159, "train/loss": 4.932381629943848, "train/global_grad_norm": 0.7352603077888489}
|
| 24 |
+
{"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 1680.500316324993, "train/update_time": 852.0965187721886, "train/lr": 0.0008736882845346905, "train/loss": 4.874920845031738, "train/global_grad_norm": 0.5009574890136719}
|
| 25 |
+
{"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 1792.9284177150112, "train/update_time": 887.3467938142712, "train/lr": 0.0008629601180209381, "train/loss": 4.870193958282471, "train/global_grad_norm": 0.6007606387138367}
|
| 26 |
+
{"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 1828.322631730989, "train/update_time": 922.6015984143014, "train/lr": 0.0008518664814351503, "train/loss": 4.817152976989746, "train/global_grad_norm": 0.4920603632926941}
|
| 27 |
+
{"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 1940.5543955019675, "train/update_time": 957.8616747342749, "train/lr": 0.0008404185451290017, "train/loss": 4.805738925933838, "train/global_grad_norm": 0.7058248519897461}
|
| 28 |
+
{"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 1975.955018882989, "train/update_time": 993.1225579883321, "train/lr": 0.0008286278362039527, "train/loss": 4.739347457885742, "train/global_grad_norm": 0.5552049279212952}
|
| 29 |
+
{"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 2087.7305572130135, "train/update_time": 1028.3721907203435, "train/lr": 0.0008165062269044352, "train/loss": 4.710965633392334, "train/global_grad_norm": 0.6555073261260986}
|
| 30 |
+
{"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 2123.1104451339925, "train/update_time": 1063.6117942532874, "train/lr": 0.0008040659226635089, "train/loss": 4.675158500671387, "train/global_grad_norm": 0.54646235704422}
|
| 31 |
+
{"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 2233.8338464839617, "train/update_time": 1098.8516714693396, "train/lr": 0.0007913194498130252, "train/loss": 4.6881422996521, "train/global_grad_norm": 0.4385336637496948}
|
| 32 |
+
{"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 2269.22137821099, "train/update_time": 1134.096047840314, "train/lr": 0.000778279642970672, "train/loss": 4.613880157470703, "train/global_grad_norm": 0.6974025368690491}
|
| 33 |
+
{"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 2379.2475059829885, "train/update_time": 1169.336060541391, "train/lr": 0.0007649596321166025, "train/loss": 4.6237874031066895, "train/global_grad_norm": 0.623008131980896}
|
| 34 |
+
{"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 2414.640490865975, "train/update_time": 1204.5815564935328, "train/lr": 0.0007513728293726579, "train/loss": 4.571319580078125, "train/global_grad_norm": 0.8349814414978027}
|
| 35 |
+
{"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 2525.280265790003, "train/update_time": 1239.831252818578, "train/lr": 0.0007375329154974975, "train/loss": 4.533912181854248, "train/global_grad_norm": 0.5782769322395325}
|
| 36 |
+
{"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 2560.696846612962, "train/update_time": 1275.0923811426037, "train/lr": 0.0007234538261112341, "train/loss": 4.453745365142822, "train/global_grad_norm": 0.5227785706520081}
|
| 37 |
+
{"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 2670.3778264659923, "train/update_time": 1310.3158643786446, "train/lr": 0.0007091497376634464, "train/loss": 4.4719929695129395, "train/global_grad_norm": 1.180794596672058}
|
| 38 |
+
{"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 2705.7711879389826, "train/update_time": 1345.540644572582, "train/lr": 0.0006946350531586958, "train/loss": 4.423107624053955, "train/global_grad_norm": 0.6167490482330322}
|
| 39 |
+
{"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 2815.635458876961, "train/update_time": 1380.7573516704724, "train/lr": 0.0006799243876539214, "train/loss": 4.413957595825195, "train/global_grad_norm": 0.8560699820518494}
|
| 40 |
+
{"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 2851.024827848014, "train/update_time": 1415.9805577184306, "train/lr": 0.0006650325535423166, "train/loss": 4.290282249450684, "train/global_grad_norm": 0.5771048665046692}
|
| 41 |
+
{"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 2961.242350918008, "train/update_time": 1451.2064463463612, "train/lr": 0.0006499745456385053, "train/loss": 4.317328929901123, "train/global_grad_norm": 0.8895323276519775}
|
| 42 |
+
{"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 2996.6310340749915, "train/update_time": 1486.4294549234328, "train/lr": 0.0006347655260800339, "train/loss": 4.294151782989502, "train/global_grad_norm": 0.5844022035598755}
|
| 43 |
+
{"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 3107.195180976996, "train/update_time": 1521.6597315414692, "train/lr": 0.0006194208090603844, "train/loss": 4.289255142211914, "train/global_grad_norm": 0.7929869890213013}
|
| 44 |
+
{"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 3142.596222635999, "train/update_time": 1556.8869810854085, "train/lr": 0.0006039558454088796, "train/loss": 4.304691791534424, "train/global_grad_norm": 0.7291324138641357}
|
| 45 |
+
{"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 3252.3157649660134, "train/update_time": 1592.1117133093649, "train/lr": 0.0005883862070330078, "train/loss": 4.23949670791626, "train/global_grad_norm": 0.66145920753479}
|
| 46 |
+
{"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 3287.7011176730157, "train/update_time": 1627.3387761343038, "train/lr": 0.0005727275712388317, "train/loss": 4.2027587890625, "train/global_grad_norm": 0.7958371043205261}
|
| 47 |
+
{"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 3397.5104132720153, "train/update_time": 1662.5627456933726, "train/lr": 0.0005569957049452703, "train/loss": 4.191956043243408, "train/global_grad_norm": 0.7229559421539307}
|
| 48 |
+
{"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 3432.890017031983, "train/update_time": 1697.7829640183481, "train/lr": 0.0005412064488081482, "train/loss": 4.181127548217773, "train/global_grad_norm": 0.9045247435569763}
|
| 49 |
+
{"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 3542.651402093994, "train/update_time": 1733.0092777723912, "train/lr": 0.0005253757012699972, "train/loss": 4.1732048988342285, "train/global_grad_norm": 0.5767119526863098}
|
| 50 |
+
{"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 3578.0405803910107, "train/update_time": 1768.2423270724248, "train/lr": 0.0005095194025516734, "train/loss": 4.142397403717041, "train/global_grad_norm": 0.7807267904281616}
|
| 51 |
+
{"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 3688.537352617015, "train/update_time": 1803.4673421693733, "train/lr": 0.0004936535186019053, "train/loss": 4.12105131149292, "train/global_grad_norm": 0.6827822327613831}
|
| 52 |
+
{"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 3723.92296086601, "train/update_time": 1838.693349173409, "train/lr": 0.00047779402502093696, "train/loss": 4.119822978973389, "train/global_grad_norm": 1.0019612312316895}
|
| 53 |
+
{"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 3833.784007055976, "train/update_time": 1873.921633931459, "train/lr": 0.0004619568909744525, "train/loss": 4.105185508728027, "train/global_grad_norm": 1.0723958015441895}
|
| 54 |
+
{"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 3869.1690179569996, "train/update_time": 1909.1470028955955, "train/lr": 0.00044615806311398067, "train/loss": 4.104675769805908, "train/global_grad_norm": 0.6603816151618958}
|
| 55 |
+
{"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 3979.0184022249887, "train/update_time": 1944.3775492714485, "train/lr": 0.0004304134495199673, "train/loss": 4.048872947692871, "train/global_grad_norm": 0.6948413252830505}
|
| 56 |
+
{"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 4014.40222099697, "train/update_time": 1979.6119488844415, "train/lr": 0.0004147389036836882, "train/loss": 4.09351110458374, "train/global_grad_norm": 0.571071207523346}
|
| 57 |
+
{"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 4124.598098332004, "train/update_time": 2014.8426124633406, "train/lr": 0.0003991502085441259, "train/loss": 4.016101360321045, "train/global_grad_norm": 0.6385037302970886}
|
| 58 |
+
{"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 4159.974407147965, "train/update_time": 2050.0642815562896, "train/lr": 0.0003836630605958888, "train/loss": 4.078490734100342, "train/global_grad_norm": 0.8873701691627502}
|
| 59 |
+
{"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 4270.076159979973, "train/update_time": 2085.291462272231, "train/lr": 0.00036829305408417155, "train/loss": 4.081883430480957, "train/global_grad_norm": 0.699393630027771}
|
| 60 |
+
{"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 4305.46517852298, "train/update_time": 2120.5240447262186, "train/lr": 0.000353055665302672, "train/loss": 4.077620029449463, "train/global_grad_norm": 0.5510751605033875}
|
| 61 |
+
{"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 4416.261696751986, "train/update_time": 2155.7523099503014, "train/lr": 0.0003379662370102746, "train/loss": 4.03539514541626, "train/global_grad_norm": 0.4872359335422516}
|
| 62 |
+
{"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 4451.645092452003, "train/update_time": 2190.979344193125, "train/lr": 0.00032303996298219405, "train/loss": 4.0130109786987305, "train/global_grad_norm": 0.7172174453735352}
|
| 63 |
+
{"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 4562.018341306015, "train/update_time": 2226.215099543275, "train/lr": 0.00030829187271113034, "train/loss": 4.032833099365234, "train/global_grad_norm": 1.3824615478515625}
|
| 64 |
+
{"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 4597.405511869991, "train/update_time": 2261.444451206189, "train/lr": 0.0002937368162738445, "train/loss": 4.016963958740234, "train/global_grad_norm": 0.6392601132392883}
|
| 65 |
+
{"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 4707.963436296966, "train/update_time": 2296.670486221323, "train/lr": 0.0002793894493783894, "train/loss": 3.9857394695281982, "train/global_grad_norm": 0.5544307231903076}
|
| 66 |
+
{"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 4743.360288269003, "train/update_time": 2331.894208611455, "train/lr": 0.00026526421860705474, "train/loss": 4.021228313446045, "train/global_grad_norm": 0.5490173101425171}
|
| 67 |
+
{"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 4853.4431308630155, "train/update_time": 2367.121892035415, "train/lr": 0.0002513753468698824, "train/loss": 3.9647693634033203, "train/global_grad_norm": 1.3666198253631592}
|
| 68 |
+
{"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 4888.828857862973, "train/update_time": 2402.3454629034386, "train/lr": 0.00023773681908340283, "train/loss": 3.980109691619873, "train/global_grad_norm": 0.8777939081192017}
|
| 69 |
+
{"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 4998.851999056002, "train/update_time": 2437.5750640144106, "train/lr": 0.00022436236808900823, "train/loss": 3.9890389442443848, "train/global_grad_norm": 0.5416154265403748}
|
| 70 |
+
{"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 5034.233835026971, "train/update_time": 2472.8065605463926, "train/lr": 0.00021126546082514682, "train/loss": 3.9841649532318115, "train/global_grad_norm": 0.45246919989585876}
|
| 71 |
+
{"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 5144.364876898995, "train/update_time": 2508.037551375397, "train/lr": 0.00019845928476725522, "train/loss": 3.9905643463134766, "train/global_grad_norm": 0.6189930438995361}
|
| 72 |
+
{"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 5179.741817264992, "train/update_time": 2543.2725221014116, "train/lr": 0.0001859567346490913, "train/loss": 3.959702491760254, "train/global_grad_norm": 0.4403487741947174}
|
| 73 |
+
{"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 5289.151538159, "train/update_time": 2578.4985305793816, "train/lr": 0.00017377039947882782, "train/loss": 3.9835543632507324, "train/global_grad_norm": 0.47642815113067627}
|
| 74 |
+
{"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 5324.524823835993, "train/update_time": 2613.7285769192385, "train/lr": 0.00016191254986299043, "train/loss": 3.9557220935821533, "train/global_grad_norm": 1.0352903604507446}
|
| 75 |
+
{"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 5434.649355244997, "train/update_time": 2648.9550557791954, "train/lr": 0.00015039512565099468, "train/loss": 3.951629877090454, "train/global_grad_norm": 0.3987804055213928}
|
| 76 |
+
{"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 5470.030363440979, "train/update_time": 2684.182179984171, "train/lr": 0.00013922972391273224, "train/loss": 3.9082868099212646, "train/global_grad_norm": 0.39810803532600403}
|
| 77 |
+
{"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 5579.999253949965, "train/update_time": 2719.416021748213, "train/lr": 0.00012842758726130281, "train/loss": 3.979860305786133, "train/global_grad_norm": 0.43757563829421997}
|
| 78 |
+
{"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 5615.37508792599, "train/update_time": 2754.6453251581406, "train/lr": 0.00011799959253265679, "train/loss": 3.917025566101074, "train/global_grad_norm": 0.4092726707458496}
|
| 79 |
+
{"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 5724.889530220011, "train/update_time": 2789.872535758186, "train/lr": 0.00010795623983354214, "train/loss": 3.9336912631988525, "train/global_grad_norm": 0.445840060710907}
|
| 80 |
+
{"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 5760.269799852977, "train/update_time": 2825.100109060295, "train/lr": 9.830764196878872e-05, "train/loss": 3.918724298477173, "train/global_grad_norm": 0.3501419723033905}
|
| 81 |
+
{"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 5870.409012118005, "train/update_time": 2860.3205174013856, "train/lr": 8.906351425856951e-05, "train/loss": 3.8968920707702637, "train/global_grad_norm": 0.39583295583724976}
|
| 82 |
+
{"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 5905.784063692961, "train/update_time": 2895.5508844144642, "train/lr": 8.02331647558977e-05, "train/loss": 3.9078526496887207, "train/global_grad_norm": 0.32103216648101807}
|
| 83 |
+
{"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 6015.045884961961, "train/update_time": 2930.7809853444924, "train/lr": 7.182548487420554e-05, "train/loss": 3.9445602893829346, "train/global_grad_norm": 0.2946698069572449}
|
| 84 |
+
{"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 6050.427405752998, "train/update_time": 2966.012128848466, "train/lr": 6.384894043444556e-05, "train/loss": 3.87947940826416, "train/global_grad_norm": 0.35237979888916016}
|
| 85 |
+
{"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 6159.759159684996, "train/update_time": 3001.234070145467, "train/lr": 5.6311563140726166e-05, "train/loss": 3.9575512409210205, "train/global_grad_norm": 0.3177047371864319}
|
| 86 |
+
{"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 6195.132035595016, "train/update_time": 3036.459684428468, "train/lr": 4.922094249306547e-05, "train/loss": 3.9432151317596436, "train/global_grad_norm": 0.3355129659175873}
|
| 87 |
+
{"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 6304.553084079002, "train/update_time": 3071.6873910723953, "train/lr": 4.2584218145409916e-05, "train/loss": 3.884796380996704, "train/global_grad_norm": 0.2935344874858856}
|
| 88 |
+
{"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 6339.925039035967, "train/update_time": 3106.918331800378, "train/lr": 3.6408072716606236e-05, "train/loss": 3.8955740928649902, "train/global_grad_norm": 0.27834922075271606}
|
| 89 |
+
{"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 6449.384100833966, "train/update_time": 3142.1480970325065, "train/lr": 3.069872506157217e-05, "train/loss": 3.967247486114502, "train/global_grad_norm": 0.3791719973087311}
|
| 90 |
+
{"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 6484.7479819079745, "train/update_time": 3177.374425999529, "train/lr": 2.5461924009435368e-05, "train/loss": 3.8741910457611084, "train/global_grad_norm": 0.28777578473091125}
|
| 91 |
+
{"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 6595.026013074967, "train/update_time": 3212.6112683996907, "train/lr": 2.0702942574950812e-05, "train/loss": 3.9192233085632324, "train/global_grad_norm": 0.22005437314510345}
|
| 92 |
+
{"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 6630.405630423978, "train/update_time": 3247.8434043628513, "train/lr": 1.642657264902142e-05, "train/loss": 3.934617280960083, "train/global_grad_norm": 0.25288376212120056}
|
| 93 |
+
{"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 6740.169189045962, "train/update_time": 3283.0864938918385, "train/lr": 1.2637120173670358e-05, "train/loss": 3.9254136085510254, "train/global_grad_norm": 0.22438837587833405}
|
| 94 |
+
{"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 6775.5438796009985, "train/update_time": 3318.3166800447507, "train/lr": 9.338400806321978e-06, "train/loss": 3.887643575668335, "train/global_grad_norm": 0.22849982976913452}
|
| 95 |
+
{"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 6886.654366842995, "train/update_time": 3353.5464223485906, "train/lr": 6.533736077758867e-06, "train/loss": 3.896052598953247, "train/global_grad_norm": 0.22606733441352844}
|
| 96 |
+
{"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 6922.033780722995, "train/update_time": 3388.7756955446093, "train/lr": 4.2259500476214406e-06, "train/loss": 3.8972818851470947, "train/global_grad_norm": 0.21224528551101685}
|
| 97 |
+
{"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 7032.169445707987, "train/update_time": 3424.0070785805583, "train/lr": 2.417366460819359e-06, "train/loss": 3.9289143085479736, "train/global_grad_norm": 0.2368578314781189}
|
| 98 |
+
{"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 7067.550158863014, "train/update_time": 3459.239731548645, "train/lr": 1.1098064077174619e-06, "train/loss": 3.8993091583251953, "train/global_grad_norm": 0.20862537622451782}
|
metrics/jsonlines/train_data_info.jsonl
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 0, "train_data_info/vocab_size": 50277, "train_data_info/global_tokens_per_batch": 2097152, "train_data_info/local_tokens_per_batch": 2097152, "train_data_info/batch_len": 2048, "train_data_info/seq_len": 2048, "train_data_info/total_tokens": 2055208960, "train_data_info/global_batch_size": 1024, "train_data_info/local_batch_size": 1024}
|
metrics/jsonlines/train_eval.jsonl
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 333.5220514299581, "train_eval/train_update_time": 182.68076597800246, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.218650683912129, "train_eval/perplexity_len_2048": 3709.4937245794213, "train_eval/loss_avg_len_1024": 8.219004572855848, "train_eval/perplexity_len_1024": 3710.8067057063945, "train_eval/loss_avg_len_512": 8.21855547720159, "train_eval/perplexity_len_512": 3709.1405726956223}
|
| 2 |
+
{"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 659.4609651200008, "train_eval/train_update_time": 358.814332414011, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.337175440577739, "train_eval/perplexity_len_2048": 565.197620381525, "train_eval/loss_avg_len_1024": 6.339329850406502, "train_eval/perplexity_len_1024": 566.4166003097482, "train_eval/loss_avg_len_512": 6.341749459894636, "train_eval/perplexity_len_512": 567.7887666740248}
|
| 3 |
+
{"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1061.4905991089763, "train_eval/train_update_time": 534.9619731742423, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.623096892892063, "train_eval/perplexity_len_2048": 276.7451074639398, "train_eval/loss_avg_len_1024": 5.627105757865829, "train_eval/perplexity_len_1024": 277.85676799159836, "train_eval/loss_avg_len_512": 5.634965539031764, "train_eval/perplexity_len_512": 280.0492663752382}
|
| 4 |
+
{"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1388.2071847780026, "train_eval/train_update_time": 711.1109357241658, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.218137195683321, "train_eval/perplexity_len_2048": 184.59000853965267, "train_eval/loss_avg_len_1024": 5.225316365856779, "train_eval/perplexity_len_1024": 185.91997995656686, "train_eval/loss_avg_len_512": 5.237964040004735, "train_eval/perplexity_len_512": 188.28636839096927}
|
| 5 |
+
{"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1792.9284177150112, "train_eval/train_update_time": 887.3467938142712, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.9624374085418275, "train_eval/perplexity_len_2048": 142.94177914951652, "train_eval/loss_avg_len_1024": 4.970113305729392, "train_eval/perplexity_len_1024": 144.04320736700222, "train_eval/loss_avg_len_512": 4.987048855513057, "train_eval/perplexity_len_512": 146.50343210389644}
|
| 6 |
+
{"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2123.1104451339925, "train_eval/train_update_time": 1063.6117942532874, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.778798732913674, "train_eval/perplexity_len_2048": 118.96135981119248, "train_eval/loss_avg_len_1024": 4.788676094550828, "train_eval/perplexity_len_1024": 120.14220640683598, "train_eval/loss_avg_len_512": 4.8105151809382365, "train_eval/perplexity_len_512": 122.79486279711149}
|
| 7 |
+
{"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2525.280265790003, "train_eval/train_update_time": 1239.831252818578, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.612123140184613, "train_eval/perplexity_len_2048": 100.69771821018394, "train_eval/loss_avg_len_1024": 4.629803386812637, "train_eval/perplexity_len_1024": 102.49391047489935, "train_eval/loss_avg_len_512": 4.662501077103371, "train_eval/perplexity_len_512": 105.90061681592375}
|
| 8 |
+
{"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2851.024827848014, "train_eval/train_update_time": 1415.9805577184306, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.448032398842888, "train_eval/perplexity_len_2048": 85.45862996984596, "train_eval/loss_avg_len_1024": 4.474290047360373, "train_eval/perplexity_len_1024": 87.73229252189076, "train_eval/loss_avg_len_512": 4.523296486837353, "train_eval/perplexity_len_512": 92.13883234754633}
|
| 9 |
+
{"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3252.3157649660134, "train_eval/train_update_time": 1592.1117133093649, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.298628560911212, "train_eval/perplexity_len_2048": 73.59878819895772, "train_eval/loss_avg_len_1024": 4.338705009182432, "train_eval/perplexity_len_1024": 76.60826807077736, "train_eval/loss_avg_len_512": 4.404042768209729, "train_eval/perplexity_len_512": 81.78082216081769}
|
| 10 |
+
{"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3578.0405803910107, "train_eval/train_update_time": 1768.2423270724248, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.191409433552872, "train_eval/perplexity_len_2048": 66.11591130529773, "train_eval/loss_avg_len_1024": 4.236702718509605, "train_eval/perplexity_len_1024": 69.17937150259523, "train_eval/loss_avg_len_512": 4.31055497860878, "train_eval/perplexity_len_512": 74.48181328533752}
|
| 11 |
+
{"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3979.0184022249887, "train_eval/train_update_time": 1944.3775492714485, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.112142655201096, "train_eval/perplexity_len_2048": 61.077445373302744, "train_eval/loss_avg_len_1024": 4.161546632732825, "train_eval/perplexity_len_1024": 64.17069438542758, "train_eval/loss_avg_len_512": 4.239235591167089, "train_eval/perplexity_len_512": 69.35481613679829}
|
| 12 |
+
{"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4305.46517852298, "train_eval/train_update_time": 2120.5240447262186, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.053491315687242, "train_eval/perplexity_len_2048": 57.59819991208716, "train_eval/loss_avg_len_1024": 4.104046444938321, "train_eval/perplexity_len_1024": 60.58494591305834, "train_eval/loss_avg_len_512": 4.186936804189099, "train_eval/perplexity_len_512": 65.82085965917027}
|
| 13 |
+
{"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4707.963436296966, "train_eval/train_update_time": 2296.670486221323, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.014058288484093, "train_eval/perplexity_len_2048": 55.371127214510835, "train_eval/loss_avg_len_1024": 4.071433403507817, "train_eval/perplexity_len_1024": 58.6409585331353, "train_eval/loss_avg_len_512": 4.15653280348517, "train_eval/perplexity_len_512": 63.849758711728725}
|
| 14 |
+
{"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5034.233835026971, "train_eval/train_update_time": 2472.8065605463926, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.9816232878798563, "train_eval/perplexity_len_2048": 53.60397832907974, "train_eval/loss_avg_len_1024": 4.040070988443676, "train_eval/perplexity_len_1024": 56.83037696229216, "train_eval/loss_avg_len_512": 4.127506753519919, "train_eval/perplexity_len_512": 62.023091144379585}
|
| 15 |
+
{"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5434.649355244997, "train_eval/train_update_time": 2648.9550557791954, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.957951218417593, "train_eval/perplexity_len_2048": 52.34996236565334, "train_eval/loss_avg_len_1024": 4.017085574939046, "train_eval/perplexity_len_1024": 55.53900545747886, "train_eval/loss_avg_len_512": 4.105732457925805, "train_eval/perplexity_len_512": 60.6871790777178}
|
| 16 |
+
{"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5760.269799852977, "train_eval/train_update_time": 2825.100109060295, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.9413603520746525, "train_eval/perplexity_len_2048": 51.48859629960878, "train_eval/loss_avg_len_1024": 4.002746569575137, "train_eval/perplexity_len_1024": 54.74831377406664, "train_eval/loss_avg_len_512": 4.091508979080463, "train_eval/perplexity_len_512": 59.83010599961991}
|
| 17 |
+
{"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6159.759159684996, "train_eval/train_update_time": 3001.234070145467, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.922204242174694, "train_eval/perplexity_len_2048": 50.51166209398698, "train_eval/loss_avg_len_1024": 3.977446037822138, "train_eval/perplexity_len_1024": 53.3805281360993, "train_eval/loss_avg_len_512": 4.066942860077361, "train_eval/perplexity_len_512": 58.37821912491155}
|
| 18 |
+
{"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6484.7479819079745, "train_eval/train_update_time": 3177.374425999529, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.919837444722207, "train_eval/perplexity_len_2048": 50.39225258562379, "train_eval/loss_avg_len_1024": 3.9784435200289954, "train_eval/perplexity_len_1024": 53.43380082797178, "train_eval/loss_avg_len_512": 4.068381745212573, "train_eval/perplexity_len_512": 58.462279138479424}
|
| 19 |
+
{"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6886.654366842995, "train_eval/train_update_time": 3353.5464223485906, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.9090868059578225, "train_eval/perplexity_len_2048": 49.853405347373815, "train_eval/loss_avg_len_1024": 3.970852813320653, "train_eval/perplexity_len_1024": 53.02973602641771, "train_eval/loss_avg_len_512": 4.06019487478894, "train_eval/perplexity_len_512": 57.98560991148779}
|
metrics/jsonlines/val.jsonl
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 77.36710998200579, "val/train_update_time": 76.98762350907782, "val/loss": 7.967406083543483, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 75.46969969500788, "val/val_tokens_per_second": 542734.3710857432, "val/loss_avg_len_2048": 7.967406083543483, "val/perplexity_len_2048": 2885.3632576421255, "val/loss_avg_len_1024": 7.965887772902428, "val/perplexity_len_1024": 2880.9857039888375, "val/loss_avg_len_512": 7.965943124503736, "val/perplexity_len_512": 2881.1451755743637}
|
| 2 |
+
{"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 223.6157359869685, "val/train_update_time": 147.44638093106914, "val/loss": 7.078002173094731, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.52392348198919, "val/val_tokens_per_second": 549622.1627394476, "val/loss_avg_len_2048": 7.078002173094731, "val/perplexity_len_2048": 1185.5975322388967, "val/loss_avg_len_1024": 7.076746983557055, "val/perplexity_len_1024": 1184.1103161848112, "val/loss_avg_len_512": 7.077828987323959, "val/perplexity_len_512": 1185.3922213954218}
|
| 3 |
+
{"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 368.9031989739742, "val/train_update_time": 217.90960857702885, "val/loss": 6.593350867911941, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.5729669869761, "val/val_tokens_per_second": 549260.6993517304, "val/loss_avg_len_2048": 6.593350867911941, "val/perplexity_len_2048": 730.2236579069736, "val/loss_avg_len_1024": 6.592371494894009, "val/perplexity_len_1024": 729.5088466499291, "val/loss_avg_len_512": 6.59457370796809, "val/perplexity_len_512": 731.1171508339412}
|
| 4 |
+
{"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 514.2439901359612, "val/train_update_time": 288.3617576470133, "val/loss": 6.2121616636668335, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.46951410500333, "val/val_tokens_per_second": 550023.7310833756, "val/loss_avg_len_2048": 6.2121616636668335, "val/perplexity_len_2048": 498.77827766365175, "val/loss_avg_len_1024": 6.212113584863767, "val/perplexity_len_1024": 498.7542975775375, "val/loss_avg_len_512": 6.21575433148481, "val/perplexity_len_512": 500.5734451194017}
|
| 5 |
+
{"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 659.4609651200008, "val/train_update_time": 358.814332414011, "val/loss": 5.9063440953444, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.66056790604489, "val/val_tokens_per_second": 548616.2394524683, "val/loss_avg_len_2048": 5.9063440953444, "val/perplexity_len_2048": 367.3606618559352, "val/loss_avg_len_1024": 5.907849762643875, "val/perplexity_len_1024": 367.9142014102337, "val/loss_avg_len_512": 5.914086231814698, "val/perplexity_len_512": 370.215856625777}
|
| 6 |
+
{"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 805.5059107720153, "val/train_update_time": 429.2700012290734, "val/loss": 5.654363232167089, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 75.21082512801513, "val/val_tokens_per_second": 544602.4549030361, "val/loss_avg_len_2048": 5.654363232167089, "val/perplexity_len_2048": 285.534605573886, "val/loss_avg_len_1024": 5.657451409234992, "val/perplexity_len_1024": 286.41774994614394, "val/loss_avg_len_512": 5.666161814958416, "val/perplexity_len_512": 288.92346179539464}
|
| 7 |
+
{"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 951.4780202080146, "val/train_update_time": 499.73381894716294, "val/loss": 5.471082407338126, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.61947528098244, "val/val_tokens_per_second": 548918.3600630208, "val/loss_avg_len_2048": 5.471082407338126, "val/perplexity_len_2048": 237.71736057137844, "val/loss_avg_len_1024": 5.47550174622531, "val/perplexity_len_1024": 238.7702389466763, "val/loss_avg_len_512": 5.486109551545978, "val/perplexity_len_512": 241.31654865962557}
|
| 8 |
+
{"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 1096.8764924500138, "val/train_update_time": 570.1910730601521, "val/loss": 5.306349960088963, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.72595568199176, "val/val_tokens_per_second": 548136.1814134812, "val/loss_avg_len_2048": 5.306349960088963, "val/perplexity_len_2048": 201.6129882749498, "val/loss_avg_len_1024": 5.312019983144896, "val/perplexity_len_1024": 202.75938554501306, "val/loss_avg_len_512": 5.324348847681005, "val/perplexity_len_512": 205.27465187158018}
|
| 9 |
+
{"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 1242.364284639014, "val/train_update_time": 640.6497024221462, "val/loss": 5.184559677005536, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 75.06923581298906, "val/val_tokens_per_second": 545629.6385118228, "val/loss_avg_len_2048": 5.184559677005536, "val/perplexity_len_2048": 178.49483710243962, "val/loss_avg_len_1024": 5.191356708838465, "val/perplexity_len_1024": 179.712204748735, "val/loss_avg_len_512": 5.2056767437635925, "val/perplexity_len_512": 182.30420426547653}
|
| 10 |
+
{"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 1388.2071847780026, "val/train_update_time": 711.1109357241658, "val/loss": 5.06955635641932, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.47714675101452, "val/val_tokens_per_second": 549967.362967514, "val/loss_avg_len_2048": 5.06955635641932, "val/perplexity_len_2048": 159.10372633676187, "val/loss_avg_len_1024": 5.077621888072462, "val/perplexity_len_1024": 160.39217149050972, "val/loss_avg_len_512": 5.09403516663406, "val/perplexity_len_512": 163.04645604872834}
|
| 11 |
+
{"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 1534.1124866969767, "val/train_update_time": 781.5830888972268, "val/loss": 4.975544349937421, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 75.62435562204337, "val/val_tokens_per_second": 541624.4497303297, "val/loss_avg_len_2048": 4.975544349937421, "val/perplexity_len_2048": 144.82764061174217, "val/loss_avg_len_1024": 4.984868058896298, "val/perplexity_len_1024": 146.18428603763627, "val/loss_avg_len_512": 5.0030605386967775, "val/perplexity_len_512": 148.8680791141717}
|
| 12 |
+
{"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 1680.500316324993, "val/train_update_time": 852.0965187721886, "val/loss": 4.895440164133744, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 77.0330984640168, "val/val_tokens_per_second": 531719.4922275257, "val/loss_avg_len_2048": 4.895440164133744, "val/perplexity_len_2048": 133.67883429497678, "val/loss_avg_len_1024": 4.9060379521952475, "val/perplexity_len_1024": 135.10306778148092, "val/loss_avg_len_512": 4.926302171780263, "val/perplexity_len_512": 137.86875360523445}
|
| 13 |
+
{"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 1828.322631730989, "val/train_update_time": 922.6015984143014, "val/loss": 4.819560142311337, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 76.82710023201071, "val/val_tokens_per_second": 533145.2036625696, "val/loss_avg_len_2048": 4.819560142311337, "val/perplexity_len_2048": 123.91057577182025, "val/loss_avg_len_1024": 4.8313868719966155, "val/perplexity_len_1024": 125.38473270170651, "val/loss_avg_len_512": 4.853890417815373, "val/perplexity_len_512": 128.23832128707267}
|
| 14 |
+
{"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 1975.955018882989, "val/train_update_time": 993.1225579883321, "val/loss": 4.752174282312161, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 76.37709301302675, "val/val_tokens_per_second": 536286.4490406035, "val/loss_avg_len_2048": 4.752174282312161, "val/perplexity_len_2048": 115.8358708032794, "val/loss_avg_len_1024": 4.765576768405783, "val/perplexity_len_1024": 117.39880968881805, "val/loss_avg_len_512": 4.790581661829166, "val/perplexity_len_512": 120.37136373211975}
|
| 15 |
+
{"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 2123.1104451339925, "val/train_update_time": 1063.6117942532874, "val/loss": 4.689406985081849, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.71957862901036, "val/val_tokens_per_second": 548182.9629068199, "val/loss_avg_len_2048": 4.689406985081849, "val/perplexity_len_2048": 108.78864738413802, "val/loss_avg_len_1024": 4.704826454977226, "val/perplexity_len_1024": 110.47911018398477, "val/loss_avg_len_512": 4.733361966823415, "val/perplexity_len_512": 113.67709926837894}
|
| 16 |
+
{"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 2269.22137821099, "val/train_update_time": 1134.096047840314, "val/loss": 4.626646096304082, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.62473024002975, "val/val_tokens_per_second": 548879.7060740124, "val/loss_avg_len_2048": 4.626646096304082, "val/perplexity_len_2048": 102.17081774137807, "val/loss_avg_len_1024": 4.644601592122112, "val/perplexity_len_1024": 104.02191437851869, "val/loss_avg_len_512": 4.677183249155526, "val/perplexity_len_512": 107.4669382558201}
|
| 17 |
+
{"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 2414.640490865975, "val/train_update_time": 1204.5815564935328, "val/loss": 4.56134519904966, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 75.26538718299707, "val/val_tokens_per_second": 544207.6568398112, "val/loss_avg_len_2048": 4.56134519904966, "val/perplexity_len_2048": 95.71214515713238, "val/loss_avg_len_1024": 4.582940916776005, "val/perplexity_len_1024": 97.80159803916156, "val/loss_avg_len_512": 4.621313081837446, "val/perplexity_len_512": 101.62738963524231}
|
| 18 |
+
{"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 2560.696846612962, "val/train_update_time": 1275.0923811426037, "val/loss": 4.491432455242611, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.28613866801606, "val/val_tokens_per_second": 551381.4654312534, "val/loss_avg_len_2048": 4.491432455242611, "val/perplexity_len_2048": 89.2491998401074, "val/loss_avg_len_1024": 4.517822659327136, "val/perplexity_len_1024": 91.63585812522578, "val/loss_avg_len_512": 4.562794736436661, "val/perplexity_len_512": 95.8509840917308}
|
| 19 |
+
{"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 2705.7711879389826, "val/train_update_time": 1345.540644572582, "val/loss": 4.430973563183867, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.47222723404411, "val/val_tokens_per_second": 550003.692937434, "val/loss_avg_len_2048": 4.430973563183867, "val/perplexity_len_2048": 84.01316923681291, "val/loss_avg_len_1024": 4.462146958403895, "val/perplexity_len_1024": 86.67339365261898, "val/loss_avg_len_512": 4.513480652339757, "val/perplexity_len_512": 91.23883714663683}
|
| 20 |
+
{"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 2851.024827848014, "val/train_update_time": 1415.9805577184306, "val/loss": 4.366317664692108, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.15898946800735, "val/val_tokens_per_second": 552326.8358136191, "val/loss_avg_len_2048": 4.366317664692108, "val/perplexity_len_2048": 78.7531017862609, "val/loss_avg_len_1024": 4.40270962580326, "val/perplexity_len_1024": 81.67186931972208, "val/loss_avg_len_512": 4.461183923999499, "val/perplexity_len_512": 86.58996437166769}
|
| 21 |
+
{"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 2996.6310340749915, "val/train_update_time": 1486.4294549234328, "val/loss": 4.30756352697448, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 75.16067498200573, "val/val_tokens_per_second": 544965.834990256, "val/loss_avg_len_2048": 4.30756352697448, "val/perplexity_len_2048": 74.25933747198452, "val/loss_avg_len_1024": 4.348377340535215, "val/perplexity_len_1024": 77.35284371024551, "val/loss_avg_len_512": 4.4122849026547755, "val/perplexity_len_512": 82.45765613881858}
|
| 22 |
+
{"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 3142.596222635999, "val/train_update_time": 1556.8869810854085, "val/loss": 4.260864559826906, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.31724469002802, "val/val_tokens_per_second": 551150.6807180657, "val/loss_avg_len_2048": 4.260864559826906, "val/perplexity_len_2048": 70.87122939288916, "val/loss_avg_len_1024": 4.305188920119405, "val/perplexity_len_1024": 74.0832099397546, "val/loss_avg_len_512": 4.373845319927856, "val/perplexity_len_512": 79.34816489023441}
|
| 23 |
+
{"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 3287.7011176730157, "val/train_update_time": 1627.3387761343038, "val/loss": 4.223266492512053, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.42460210697027, "val/val_tokens_per_second": 550355.6463913412, "val/loss_avg_len_2048": 4.223266492512053, "val/perplexity_len_2048": 68.25607850967396, "val/loss_avg_len_1024": 4.2713793615476225, "val/perplexity_len_1024": 71.62035788441096, "val/loss_avg_len_512": 4.34352318198774, "val/perplexity_len_512": 76.97827059691846}
|
| 24 |
+
{"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 3432.890017031983, "val/train_update_time": 1697.7829640183481, "val/loss": 4.18504595848294, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.3746191120008, "val/val_tokens_per_second": 550725.5094418473, "val/loss_avg_len_2048": 4.18504595848294, "val/perplexity_len_2048": 65.69652015975821, "val/loss_avg_len_1024": 4.235277974668006, "val/perplexity_len_1024": 69.08087879917835, "val/loss_avg_len_512": 4.3104049167047265, "val/perplexity_len_512": 74.4706372411888}
|
| 25 |
+
{"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 3578.0405803910107, "val/train_update_time": 1768.2423270724248, "val/loss": 4.149188061488955, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.34305373596726, "val/val_tokens_per_second": 550959.3424218395, "val/loss_avg_len_2048": 4.149188061488955, "val/perplexity_len_2048": 63.38251669396044, "val/loss_avg_len_1024": 4.201489808351174, "val/perplexity_len_1024": 66.78575493670466, "val/loss_avg_len_512": 4.279306911751535, "val/perplexity_len_512": 72.19038835440217}
|
| 26 |
+
{"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 3723.92296086601, "val/train_update_time": 1838.693349173409, "val/loss": 4.128686446135375, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.46465942100622, "val/val_tokens_per_second": 550059.5895889552, "val/loss_avg_len_2048": 4.128686446135375, "val/perplexity_len_2048": 62.096302501948784, "val/loss_avg_len_1024": 4.183405997985183, "val/perplexity_len_1024": 65.58886875800643, "val/loss_avg_len_512": 4.264364499987476, "val/perplexity_len_512": 71.11970903315873}
|
| 27 |
+
{"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 3869.1690179569996, "val/train_update_time": 1909.1470028955955, "val/loss": 4.098170458009047, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.46465041401098, "val/val_tokens_per_second": 550059.6561223246, "val/loss_avg_len_2048": 4.098170458009047, "val/perplexity_len_2048": 60.229993432124694, "val/loss_avg_len_1024": 4.1536520937833465, "val/perplexity_len_1024": 63.66609076637863, "val/loss_avg_len_512": 4.235404257816635, "val/perplexity_len_512": 69.08960310091771}
|
| 28 |
+
{"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 4014.40222099697, "val/train_update_time": 1979.6119488844415, "val/loss": 4.072737031718065, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.7946821290534, "val/val_tokens_per_second": 547632.5165648296, "val/loss_avg_len_2048": 4.072737031718065, "val/perplexity_len_2048": 58.717454391200924, "val/loss_avg_len_1024": 4.1293711922524965, "val/perplexity_len_1024": 62.138837265069576, "val/loss_avg_len_512": 4.2120003426606765, "val/perplexity_len_512": 67.4914108193186}
|
| 29 |
+
{"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 4159.974407147965, "val/train_update_time": 2050.0642815562896, "val/loss": 4.053922182414727, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.71466798300389, "val/val_tokens_per_second": 548218.9924114712, "val/loss_avg_len_2048": 4.053922182414727, "val/perplexity_len_2048": 57.623022407201915, "val/loss_avg_len_1024": 4.111342828695057, "val/perplexity_len_1024": 61.02861354467073, "val/loss_avg_len_512": 4.195094090398866, "val/perplexity_len_512": 66.35997512008933}
|
| 30 |
+
{"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 4305.46517852298, "val/train_update_time": 2120.5240447262186, "val/loss": 4.036319210487535, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.65241524501471, "val/val_tokens_per_second": 548676.1528822111, "val/loss_avg_len_2048": 4.036319210487535, "val/perplexity_len_2048": 56.61756147462313, "val/loss_avg_len_1024": 4.0947161903618845, "val/perplexity_len_1024": 60.022301832124555, "val/loss_avg_len_512": 4.179663519956824, "val/perplexity_len_512": 65.34386260880561}
|
| 31 |
+
{"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 4451.645092452003, "val/train_update_time": 2190.979344193125, "val/loss": 4.018702121205884, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.97793793701567, "val/val_tokens_per_second": 546294.0316444547, "val/loss_avg_len_2048": 4.018702121205884, "val/perplexity_len_2048": 55.62885943640474, "val/loss_avg_len_1024": 4.077695212568948, "val/perplexity_len_1024": 59.00930908545214, "val/loss_avg_len_512": 4.163278880037181, "val/perplexity_len_512": 64.28195023129467}
|
| 32 |
+
{"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 4597.405511869991, "val/train_update_time": 2261.444451206189, "val/loss": 4.006759247975774, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 75.16958679695381, "val/val_tokens_per_second": 544901.2259524602, "val/loss_avg_len_2048": 4.006759247975774, "val/perplexity_len_2048": 54.96844250775819, "val/loss_avg_len_1024": 4.067678513650177, "val/perplexity_len_1024": 58.421181071002465, "val/loss_avg_len_512": 4.1546929989802655, "val/perplexity_len_512": 63.73239563367866}
|
| 33 |
+
{"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 4743.360288269003, "val/train_update_time": 2331.894208611455, "val/loss": 3.992493032265781, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.69443514704471, "val/val_tokens_per_second": 548367.4910904067, "val/loss_avg_len_2048": 3.992493032265781, "val/perplexity_len_2048": 54.189818067555, "val/loss_avg_len_1024": 4.053030727527477, "val/perplexity_len_1024": 57.57167697172892, "val/loss_avg_len_512": 4.140326388037205, "val/perplexity_len_512": 62.823322884355}
|
| 34 |
+
{"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 4888.828857862973, "val/train_update_time": 2402.3454629034386, "val/loss": 3.9821310220196846, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.63417414203286, "val/val_tokens_per_second": 548810.2530893007, "val/loss_avg_len_2048": 3.9821310220196846, "val/perplexity_len_2048": 53.631201809468386, "val/loss_avg_len_1024": 4.042898476167862, "val/perplexity_len_1024": 56.99129154030574, "val/loss_avg_len_512": 4.13046144102756, "val/perplexity_len_512": 62.20662100013452}
|
| 35 |
+
{"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 5034.233835026971, "val/train_update_time": 2472.8065605463926, "val/loss": 3.9720667837841903, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.0003811760107, "val/val_tokens_per_second": 553510.6623650518, "val/loss_avg_len_2048": 3.9720667837841903, "val/perplexity_len_2048": 53.09415165106492, "val/loss_avg_len_1024": 4.034272228109325, "val/perplexity_len_1024": 56.501784860758214, "val/loss_avg_len_512": 4.122892943188642, "val/perplexity_len_512": 61.73758750197274}
|
| 36 |
+
{"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 5179.741817264992, "val/train_update_time": 2543.2725221014116, "val/loss": 3.9630798968119314, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.03690093499608, "val/val_tokens_per_second": 553237.6353240745, "val/loss_avg_len_2048": 3.9630798968119314, "val/perplexity_len_2048": 52.619138154590686, "val/loss_avg_len_1024": 4.025157931512734, "val/perplexity_len_1024": 55.98915053467066, "val/loss_avg_len_512": 4.1138680682414215, "val/perplexity_len_512": 61.18292016164027}
|
| 37 |
+
{"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 5324.524823835993, "val/train_update_time": 2613.7285769192385, "val/loss": 3.9559251724027797, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.74004035303369, "val/val_tokens_per_second": 548032.8858069374, "val/loss_avg_len_2048": 3.9559251724027797, "val/perplexity_len_2048": 52.24400630519839, "val/loss_avg_len_1024": 4.017918658130476, "val/perplexity_len_1024": 55.58529334754871, "val/loss_avg_len_512": 4.106723456612043, "val/perplexity_len_512": 60.74734980208532}
|
| 38 |
+
{"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 5470.030363440979, "val/train_update_time": 2684.182179984171, "val/loss": 3.94825802866444, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.59196932800114, "val/val_tokens_per_second": 549120.7749173072, "val/loss_avg_len_2048": 3.94825802866444, "val/perplexity_len_2048": 51.84497566677264, "val/loss_avg_len_1024": 4.0108845364942685, "val/perplexity_len_1024": 55.195671562972564, "val/loss_avg_len_512": 4.100306030388736, "val/perplexity_len_512": 60.35875638489279}
|
| 39 |
+
{"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 5615.37508792599, "val/train_update_time": 2754.6453251581406, "val/loss": 3.9421704971529783, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.13637772900984, "val/val_tokens_per_second": 552495.2965697999, "val/loss_avg_len_2048": 3.9421704971529783, "val/perplexity_len_2048": 51.53032643393987, "val/loss_avg_len_1024": 4.005375609245478, "val/perplexity_len_1024": 54.89243863483703, "val/loss_avg_len_512": 4.095269862245862, "val/perplexity_len_512": 60.05554369475379}
|
| 40 |
+
{"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 5760.269799852977, "val/train_update_time": 2825.100109060295, "val/loss": 3.9371402649512284, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.03845945402281, "val/val_tokens_per_second": 553225.9896011987, "val/loss_avg_len_2048": 3.9371402649512284, "val/perplexity_len_2048": 51.271767776784145, "val/loss_avg_len_1024": 4.000664330457105, "val/perplexity_len_1024": 54.63443329781732, "val/loss_avg_len_512": 4.090735746535101, "val/perplexity_len_512": 59.78386129572768}
|
| 41 |
+
{"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 5905.784063692961, "val/train_update_time": 2895.5508844144642, "val/loss": 3.932787389914203, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 73.87577835295815, "val/val_tokens_per_second": 554444.2429331084, "val/loss_avg_len_2048": 3.932787389914203, "val/perplexity_len_2048": 51.04907321115793, "val/loss_avg_len_1024": 3.9963908482754142, "val/perplexity_len_1024": 54.40145219547801, "val/loss_avg_len_512": 4.086696723492723, "val/perplexity_len_512": 59.54287989471592}
|
| 42 |
+
{"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 6050.427405752998, "val/train_update_time": 2966.012128848466, "val/loss": 3.9291096620217902, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 73.96407199202804, "val/val_tokens_per_second": 553782.3824033746, "val/loss_avg_len_2048": 3.9291096620217902, "val/perplexity_len_2048": 50.86167342466414, "val/loss_avg_len_1024": 3.992870327184024, "val/perplexity_len_1024": 54.210267468017264, "val/loss_avg_len_512": 4.083310494290107, "val/perplexity_len_512": 59.34159504666062}
|
| 43 |
+
{"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 6195.132035595016, "val/train_update_time": 3036.459684428468, "val/loss": 3.9261094075139615, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.03585776797263, "val/val_tokens_per_second": 553245.4304557135, "val/loss_avg_len_2048": 3.9261094075139615, "val/perplexity_len_2048": 50.70930414729804, "val/loss_avg_len_1024": 3.990283828539308, "val/perplexity_len_1024": 54.070233861117224, "val/loss_avg_len_512": 4.080950855814386, "val/perplexity_len_512": 59.20173540976174}
|
| 44 |
+
{"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 6339.925039035967, "val/train_update_time": 3106.918331800378, "val/loss": 3.923576743226429, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.07839193200925, "val/val_tokens_per_second": 552927.769241994, "val/loss_avg_len_2048": 3.923576743226429, "val/perplexity_len_2048": 50.58103700101182, "val/loss_avg_len_1024": 3.9875462732635443, "val/perplexity_len_1024": 53.92241602920654, "val/loss_avg_len_512": 4.07817962241387, "val/perplexity_len_512": 59.03790070018595}
|
| 45 |
+
{"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 6484.7479819079745, "val/train_update_time": 3177.374425999529, "val/loss": 3.9217875044056916, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.16108567302581, "val/val_tokens_per_second": 552311.22398331, "val/loss_avg_len_2048": 3.9217875044056916, "val/perplexity_len_2048": 50.49061636219757, "val/loss_avg_len_1024": 3.9858806951731913, "val/perplexity_len_1024": 53.83267878742668, "val/loss_avg_len_512": 4.076671612372063, "val/perplexity_len_512": 58.94893804822824}
|
| 46 |
+
{"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 6630.405630423978, "val/train_update_time": 3247.8434043628513, "val/loss": 3.9204105864164656, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.36959133000346, "val/val_tokens_per_second": 550762.7414307871, "val/loss_avg_len_2048": 3.9204105864164656, "val/perplexity_len_2048": 50.42114276494054, "val/loss_avg_len_1024": 3.9845006187881813, "val/perplexity_len_1024": 53.75843682026665, "val/loss_avg_len_512": 4.075279883826617, "val/perplexity_len_512": 58.866954191292706}
|
| 47 |
+
{"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 6775.5438796009985, "val/train_update_time": 3318.3166800447507, "val/loss": 3.9195347058722283, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 75.7266165559995, "val/val_tokens_per_second": 540893.0421407414, "val/loss_avg_len_2048": 3.9195347058722283, "val/perplexity_len_2048": 50.37699920204059, "val/loss_avg_len_1024": 3.983747386692418, "val/perplexity_len_1024": 53.71795948656276, "val/loss_avg_len_512": 4.074615421833098, "val/perplexity_len_512": 58.8278523298474}
|
| 48 |
+
{"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 6922.033780722995, "val/train_update_time": 3388.7756955446093, "val/loss": 3.919046544265724, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.74806076998357, "val/val_tokens_per_second": 547974.0822446625, "val/loss_avg_len_2048": 3.919046544265724, "val/perplexity_len_2048": 50.35241308666628, "val/loss_avg_len_1024": 3.9832005327720195, "val/perplexity_len_1024": 53.68859164051448, "val/loss_avg_len_512": 4.074043546199799, "val/perplexity_len_512": 58.79421973228877}
|
| 49 |
+
{"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 7067.550158863014, "val/train_update_time": 3459.239731548645, "val/loss": 3.9187872609187617, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 73.89566232298966, "val/val_tokens_per_second": 554295.0521367333, "val/loss_avg_len_2048": 3.9187872609187617, "val/perplexity_len_2048": 50.33935923686961, "val/loss_avg_len_1024": 3.9829653370987623, "val/perplexity_len_1024": 53.67596580088699, "val/loss_avg_len_512": 4.073823164884653, "val/perplexity_len_512": 58.78126401247289}
|
metrics/jsonlines/val_data_info.jsonl
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"step": 0, "val_data_info/vocab_size": 50277, "val_data_info/global_tokens_per_batch": 2048, "val_data_info/local_tokens_per_batch": 2048, "val_data_info/batch_len": 2048, "val_data_info/seq_len": 2048, "val_data_info/total_tokens": 2147483648, "val_data_info/global_batch_size": 1, "val_data_info/local_batch_size": 1}
|
metrics/npz/train_eval/step-000000104857600.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:911e3d478c0409f505fdef81e34a1f42902fcecf876b4035196c3060b9cb850e
|
| 3 |
+
size 20540
|
metrics/npz/train_eval/step-000000209715200.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e3bb71425e9ac7519e9e304cf86d7cc6309f3793475584fa52f3fac2acecb8bc
|
| 3 |
+
size 20540
|