diff --git a/.hydra/config.yaml b/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3ac10d290bb629e8f3681bdd77ab8b7095b8c785 --- /dev/null +++ b/.hydra/config.yaml @@ -0,0 +1,102 @@ +model: + _target_: forgetting_transformer.model.forgetting_transformer.modeling_forgetting_transformer.ForgettingTransformerForCausalLM + config: + _target_: forgetting_transformer.model.forgetting_transformer.configuration_forgetting_transformer.ForgettingTransformerConfig + vocab_size: ??? + hidden_size: 256 + hidden_ratio: 4.0 + intermediate_size: null + num_hidden_layers: 2 + num_heads: 4 + num_kv_heads: null + hidden_act: swish + window_size: null + max_position_embeddings: null + initializer_range: 0.02 + elementwise_affine: true + norm_eps: 1.0e-06 + use_cache: true + pad_token_id: null + bos_token_id: null + eos_token_id: null + tie_word_embeddings: false + attention_bias: false + fuse_norm: true + fuse_cross_entropy: true + rope_base: 500000.0 + use_rope: false + use_output_gate: false + ogate_act: sigmoid + fgate_type: full + fgate_bias_init: false + decay_time_min: null + decay_time_max: null + use_output_norm: false + qk_norm: false + qk_norm_share_param_across_head: false + use_k_shift: false + use_v_shift: false +optimizer: + _target_: torch.optim.AdamW + lr: 0.001 + betas: + - 0.9 + - 0.95 + weight_decay: 0.1 +schedule: + _target_: forgetting_transformer.schedule.warmup_cosine_decay_schedule + init_value: 0.0 + peak_value: ${optimizer.lr} + warmup_steps: 20971520 + decay_steps: ${train.max_tokens} + end_value: 0.0 +datamodule: + _target_: forgetting_transformer.datamodule.npy.NpyDataModule + data_path: ${data_dir} + rank: ??? + world_size: ??? 
+ train_batch_len: 2048 + train_batch_size: 1024 + train_num_workers: 0 + eval_tokens: 2147483648 + eval_batch_len: 2048 + eval_local_batch_size: 1 + eval_num_workers: 0 +strategy: + _target_: lightning.fabric.strategies.SingleDeviceStrategy + device: cuda:0 +exp: forgetting_gate_2_4_256 +tag: forgetting_gate_2_4_256 +seed: 42 +hf_load_dir: null +hf_save_dir: null +hf_load_step: null +output_dir: ./forgetting_gate_2_4_256/ +data_dir: /workspace/forgetting-transformer/data +resume: false +fork_dir: null +fork_step: null +log_interval: 20971520 +eval_interval: 41943040 +final_eval: true +skip_eval: false +checkpoint_interval: 209715200 +train_eval_interval: 104857600 +checkpoint_keep_interval: 209715200 +fabric: + devices: 1 + precision: 16-mixed +train: + max_tokens: 2097152000 + grad_acc_tokens: 32768 + max_grad_norm: 1.0 + gradient_checkpointing: true + bias_weight_decay: false + normalization_weight_decay: false + conv_weight_decay: true +eval: + min_val_length: 512 +wandb: + project: forgetting-transformer + mode: online + log_dir: ./output/wandb diff --git a/.hydra/hydra.yaml b/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d2f3458394e2d9502d53a246791c4e012bfb973c --- /dev/null +++ b/.hydra/hydra.yaml @@ -0,0 +1,146 @@ +hydra: + run: + dir: ${output_dir} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + root: null + disable_existing_loggers: false + job_logging: + version: 1 + root: null + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - +experiment/pile/forgetting_transformer=forgetting_gate_2_4_256 + - strategy=single_device + - datamodule=npy + - schedule=warmup_cosine + - optimizer=adamw + - model=forgetting_transformer + - data_dir=/workspace/forgetting-transformer/data + - fabric.devices=1 + - fabric.precision=16-mixed + - seed=42 + - exp=forgetting_gate_2_4_256 + - tag=forgetting_gate_2_4_256 + - output_dir=./forgetting_gate_2_4_256/ + - wandb.log_dir=./output/wandb + - wandb.mode=online + - resume=false + job: + name: train + chdir: null + override_dirname: 
+experiment/pile/forgetting_transformer=forgetting_gate_2_4_256,data_dir=/workspace/forgetting-transformer/data,datamodule=npy,exp=forgetting_gate_2_4_256,fabric.devices=1,fabric.precision=16-mixed,model=forgetting_transformer,optimizer=adamw,output_dir=./forgetting_gate_2_4_256/,resume=false,schedule=warmup_cosine,seed=42,strategy=single_device,tag=forgetting_gate_2_4_256,wandb.log_dir=./output/wandb,wandb.mode=online + id: ??? + num: ??? + config_name: config + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /workspace/forgetting-transformer + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /workspace/forgetting-transformer/configs + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /workspace/forgetting-transformer/forgetting_gate_2_4_256 + choices: + experiment/pile/forgetting_transformer: forgetting_gate_2_4_256 + strategy: single_device + datamodule: npy + schedule: warmup_cosine + optimizer: adamw + model: forgetting_transformer + hydra/env: default + hydra/callbacks: null + hydra/job_logging: none + hydra/hydra_logging: none + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/.hydra/overrides.yaml b/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a17a82a41a0e887fa8eb790b034127378b393eac --- /dev/null +++ b/.hydra/overrides.yaml @@ -0,0 +1,16 @@ +- +experiment/pile/forgetting_transformer=forgetting_gate_2_4_256 +- strategy=single_device +- datamodule=npy +- schedule=warmup_cosine +- optimizer=adamw +- model=forgetting_transformer +- data_dir=/workspace/forgetting-transformer/data +- fabric.devices=1 +- fabric.precision=16-mixed +- seed=42 +- exp=forgetting_gate_2_4_256 +- tag=forgetting_gate_2_4_256 +- 
output_dir=./forgetting_gate_2_4_256/ +- wandb.log_dir=./output/wandb +- wandb.mode=online +- resume=false diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7f434981dc7bf9b220ed13f2cf53f70c18da7df0 --- /dev/null +++ b/__init__.py @@ -0,0 +1 @@ +# for HF remote code diff --git a/__pycache__/__init__.cpython-310.pyc b/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fa6fc1efccec774fae6b6dc03279f15a36467852 Binary files /dev/null and b/__pycache__/__init__.cpython-310.pyc differ diff --git a/__pycache__/configuration_transformer.cpython-310.pyc b/__pycache__/configuration_transformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb235c4360610154373c138bbdaa933131675b9e Binary files /dev/null and b/__pycache__/configuration_transformer.cpython-310.pyc differ diff --git a/__pycache__/modeling_transformer.cpython-310.pyc b/__pycache__/modeling_transformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f86d01b5bf6302ce831f5b6f7d092301d79b7fe Binary files /dev/null and b/__pycache__/modeling_transformer.cpython-310.pyc differ diff --git a/checkpoints/step-000000209715200.pt b/checkpoints/step-000000209715200.pt new file mode 100644 index 0000000000000000000000000000000000000000..443c68c883d9a4503908fc123dfb7858c5db6c13 --- /dev/null +++ b/checkpoints/step-000000209715200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:236c34edce0f811562e4cd1b530249c5cf4d9fdb512e24b0b97bf259dbc1998e +size 329435138 diff --git a/checkpoints/step-000000209715200.pt.done b/checkpoints/step-000000209715200.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000000209715200.pt.keep b/checkpoints/step-000000209715200.pt.keep new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000000419430400.pt b/checkpoints/step-000000419430400.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ded9e50b9c77c5f0aeab382883d2092b6f8074c --- /dev/null +++ b/checkpoints/step-000000419430400.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5df232ebcfcd25a99dcd36e6570530238148ed72180e864a4ead2122836f2ed +size 329435138 diff --git a/checkpoints/step-000000419430400.pt.done b/checkpoints/step-000000419430400.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000000419430400.pt.keep b/checkpoints/step-000000419430400.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000000629145600.pt b/checkpoints/step-000000629145600.pt new file mode 100644 index 0000000000000000000000000000000000000000..161363f310cc80ca4f7337e823c72fdef634dc53 --- /dev/null +++ b/checkpoints/step-000000629145600.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:207706e754abd9271e6622c358b1a436015fd842dce3c12e836cb277ae7c08c1 +size 329435138 diff --git a/checkpoints/step-000000629145600.pt.done b/checkpoints/step-000000629145600.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000000629145600.pt.keep b/checkpoints/step-000000629145600.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000000838860800.pt b/checkpoints/step-000000838860800.pt new file mode 100644 index 0000000000000000000000000000000000000000..1fe2a7714dbfd71ad6747a3f58751065b5db2780 --- /dev/null +++ b/checkpoints/step-000000838860800.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:e48c509f4a5f1b180e709297ef3e0588d0ba83b6e6d386b55467f408f491dec6 +size 329435138 diff --git a/checkpoints/step-000000838860800.pt.done b/checkpoints/step-000000838860800.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000000838860800.pt.keep b/checkpoints/step-000000838860800.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001048576000.pt b/checkpoints/step-000001048576000.pt new file mode 100644 index 0000000000000000000000000000000000000000..20730c598fcb402d0ff3adbcdbcc76a978151c1d --- /dev/null +++ b/checkpoints/step-000001048576000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ce74d91ba505dca661cfacf0585880e17a1d9490fee076fb64c72348ddd1a65 +size 329435138 diff --git a/checkpoints/step-000001048576000.pt.done b/checkpoints/step-000001048576000.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001048576000.pt.keep b/checkpoints/step-000001048576000.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001258291200.pt b/checkpoints/step-000001258291200.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c2b2dca826fce5f63db391943eff485b509aa79 --- /dev/null +++ b/checkpoints/step-000001258291200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75567783b278f9fa3d3cc02888962f3f373895ab53304e35338a83bda3aa6460 +size 329435138 diff --git a/checkpoints/step-000001258291200.pt.done b/checkpoints/step-000001258291200.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/checkpoints/step-000001258291200.pt.keep b/checkpoints/step-000001258291200.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001468006400.pt b/checkpoints/step-000001468006400.pt new file mode 100644 index 0000000000000000000000000000000000000000..85dc879a2d6004cab923e23a70401ad9f6b083fd --- /dev/null +++ b/checkpoints/step-000001468006400.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9038cf9d54a8c8003344e57e6cf85164e59317b0a0a1411e2cac8bd6dab4169 +size 329435138 diff --git a/checkpoints/step-000001468006400.pt.done b/checkpoints/step-000001468006400.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001468006400.pt.keep b/checkpoints/step-000001468006400.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001677721600.pt b/checkpoints/step-000001677721600.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e11081030362a061abf0c2c894cb13d9c7babb4 --- /dev/null +++ b/checkpoints/step-000001677721600.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f044b1ff3f9e99f65fcfcda0cc2239765b4e1d71f92ba0aa50b7b0473030001 +size 329435138 diff --git a/checkpoints/step-000001677721600.pt.done b/checkpoints/step-000001677721600.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001677721600.pt.keep b/checkpoints/step-000001677721600.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001887436800.pt b/checkpoints/step-000001887436800.pt new file mode 100644 index 
0000000000000000000000000000000000000000..005abbf38b478e8107de6d1241ca9878d59df1b0 --- /dev/null +++ b/checkpoints/step-000001887436800.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8dde4cb869035248b40f163b7e3ad4ba9b683e1f6432ff1a706b42bf1e173c3 +size 329435138 diff --git a/checkpoints/step-000001887436800.pt.done b/checkpoints/step-000001887436800.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001887436800.pt.keep b/checkpoints/step-000001887436800.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1618f3bcbb219cf74dfa14e6b9eb16122fee1b0c --- /dev/null +++ b/config.yaml @@ -0,0 +1,102 @@ +model: + _target_: forgetting_transformer.model.forgetting_transformer.modeling_forgetting_transformer.ForgettingTransformerForCausalLM + config: + _target_: forgetting_transformer.model.forgetting_transformer.configuration_forgetting_transformer.ForgettingTransformerConfig + vocab_size: ??? 
+ hidden_size: 256 + hidden_ratio: 4.0 + intermediate_size: null + num_hidden_layers: 2 + num_heads: 4 + num_kv_heads: null + hidden_act: swish + window_size: null + max_position_embeddings: null + initializer_range: 0.02 + elementwise_affine: true + norm_eps: 1.0e-06 + use_cache: true + pad_token_id: null + bos_token_id: null + eos_token_id: null + tie_word_embeddings: false + attention_bias: false + fuse_norm: true + fuse_cross_entropy: true + rope_base: 500000.0 + use_rope: false + use_output_gate: false + ogate_act: sigmoid + fgate_type: full + fgate_bias_init: false + decay_time_min: null + decay_time_max: null + use_output_norm: false + qk_norm: false + qk_norm_share_param_across_head: false + use_k_shift: false + use_v_shift: false +optimizer: + _target_: torch.optim.AdamW + lr: 0.001 + betas: + - 0.9 + - 0.95 + weight_decay: 0.1 +schedule: + _target_: forgetting_transformer.schedule.warmup_cosine_decay_schedule + init_value: 0.0 + peak_value: 0.001 + warmup_steps: 20971520 + decay_steps: 2097152000 + end_value: 0.0 +datamodule: + _target_: forgetting_transformer.datamodule.npy.NpyDataModule + data_path: /workspace/forgetting-transformer/data + rank: ??? + world_size: ??? 
+ train_batch_len: 2048 + train_batch_size: 1024 + train_num_workers: 0 + eval_tokens: 2147483648 + eval_batch_len: 2048 + eval_local_batch_size: 1 + eval_num_workers: 0 +strategy: + _target_: lightning.fabric.strategies.SingleDeviceStrategy + device: cuda:0 +exp: forgetting_gate_2_4_256 +tag: forgetting_gate_2_4_256 +seed: 42 +hf_load_dir: null +hf_save_dir: null +hf_load_step: null +output_dir: /workspace/forgetting-transformer/forgetting_gate_2_4_256 +data_dir: /workspace/forgetting-transformer/data +resume: false +fork_dir: null +fork_step: null +log_interval: 20971520 +eval_interval: 41943040 +final_eval: true +skip_eval: false +checkpoint_interval: 209715200 +train_eval_interval: 104857600 +checkpoint_keep_interval: 209715200 +fabric: + devices: 1 + precision: 16-mixed +train: + max_tokens: 2097152000 + grad_acc_tokens: 32768 + max_grad_norm: 1.0 + gradient_checkpointing: true + bias_weight_decay: false + normalization_weight_decay: false + conv_weight_decay: true +eval: + min_val_length: 512 +wandb: + project: forgetting-transformer + mode: online + log_dir: ./output/wandb diff --git a/configuration_transformer.py b/configuration_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..4b6767a5dcc859f307966491b13c2e44b35d8176 --- /dev/null +++ b/configuration_transformer.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- + +from typing import Optional + +from transformers.configuration_utils import PretrainedConfig + + +class TransformerConfig(PretrainedConfig): + + model_type = 'transformer-project_fox' + keys_to_ignore_at_inference = ['past_key_values'] + + def __init__( + self, + vocab_size: int = 32000, + hidden_size: int = 2048, + hidden_ratio: Optional[int] = 4, + intermediate_size: Optional[int] = None, + num_hidden_layers: int = 24, + num_heads: int = 32, + num_kv_heads: int = None, + hidden_act: str = "swish", + window_size: Optional[int] = None, + max_position_embeddings: int = 2048, + initializer_range: float = 0.02, + 
elementwise_affine: Optional[bool] = True, + norm_eps: float = 1e-6, + use_cache: bool = True, + pad_token_id: int = None, + bos_token_id: int = 1, + eos_token_id: int = 2, + tie_word_embeddings: bool = False, + attention_bias: bool = False, + fuse_norm: bool = True, + fuse_cross_entropy: bool = True, + rope_base: float = 500000.0, + use_rope: bool = True, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.hidden_ratio = hidden_ratio + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.window_size = window_size + self.max_position_embeddings = max_position_embeddings + + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.elementwise_affine = elementwise_affine + self.norm_eps = norm_eps + self.use_cache = use_cache + self.attention_bias = attention_bias + self.fuse_cross_entropy = fuse_cross_entropy + self.fuse_norm = fuse_norm + self.rope_base = rope_base + self.use_rope = use_rope + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/decay_params.txt b/decay_params.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b20f69dafc95cbdeef8abdec02b867a332be198 --- /dev/null +++ b/decay_params.txt @@ -0,0 +1,16 @@ +_forward_module.model.embeddings.weight +_forward_module.model.layers.0.attn.q_proj.weight +_forward_module.model.layers.0.attn.k_proj.weight +_forward_module.model.layers.0.attn.v_proj.weight +_forward_module.model.layers.0.attn.o_proj.weight +_forward_module.model.layers.0.attn.fgate_proj.weight +_forward_module.model.layers.0.mlp.gate_proj.weight +_forward_module.model.layers.0.mlp.down_proj.weight +_forward_module.model.layers.1.attn.q_proj.weight +_forward_module.model.layers.1.attn.k_proj.weight 
+_forward_module.model.layers.1.attn.v_proj.weight +_forward_module.model.layers.1.attn.o_proj.weight +_forward_module.model.layers.1.attn.fgate_proj.weight +_forward_module.model.layers.1.mlp.gate_proj.weight +_forward_module.model.layers.1.mlp.down_proj.weight +_forward_module.lm_head.weight diff --git a/logs/2025-10-13_04-19-28.log b/logs/2025-10-13_04-19-28.log new file mode 100644 index 0000000000000000000000000000000000000000..16ffa68c05c1a311ce027a354db1cabb55155855 --- /dev/null +++ b/logs/2025-10-13_04-19-28.log @@ -0,0 +1,258 @@ +[2025-10-13 04:19:29][train:372][INFO] All outputs will be saved to `/workspace/forgetting-transformer/forgetting_gate_2_4_256` +[2025-10-13 04:19:29][train:375][INFO] Configuration: +[2025-10-13 04:19:29][train:380][INFO] Configuration saved to /workspace/forgetting-transformer/forgetting_gate_2_4_256/config.yaml. +[2025-10-13 04:19:29][train:387][INFO] creating datamodule +[2025-10-13 04:19:29][train:419][INFO] creating model +[2025-10-13 04:19:29][train:440][INFO] creating optimizer +[2025-10-13 04:19:29][checkpoint:39][INFO] Not resuming. Deleting existing checkpoints... +[2025-10-13 04:19:29][logger:256][INFO] Setting up wandb logger... +[2025-10-13 04:19:29][logger:272][INFO] Not resuming. Creating a new wandb run. +[2025-10-13 04:19:30][logger:288][INFO] wandb initialized. Run id: 0m6wmz8p +[2025-10-13 04:19:30][logger:186][INFO] Setting up jsonlines logger... +[2025-10-13 04:19:30][logger:113][INFO] Setting up npz logger... 
+[2025-10-13 04:19:30][logger:171][INFO] [step: 0] [train_data_info/vocab_size: 50277] [train_data_info/global_tokens_per_batch: 2097152] [train_data_info/local_tokens_per_batch: 2097152] [train_data_info/batch_len: 2048] [train_data_info/seq_len: 2048] [train_data_info/total_tokens: 2055208960] [train_data_info/global_batch_size: 1024] [train_data_info/local_batch_size: 1024] +[2025-10-13 04:19:30][logger:171][INFO] [step: 0] [val_data_info/vocab_size: 50277] [val_data_info/global_tokens_per_batch: 2048] [val_data_info/local_tokens_per_batch: 2048] [val_data_info/batch_len: 2048] [val_data_info/seq_len: 2048] [val_data_info/total_tokens: 2147483648] [val_data_info/global_batch_size: 1] [val_data_info/local_batch_size: 1] +[2025-10-13 04:19:30][logger:171][INFO] [step: 0] [model_info/total_params: 27449096] [model_info/trainable_params: 27449096] [model_info/embedding_params: 12870912] [model_info/flops_per_token: 0] [model_info/non_embedding_params: 14578184] +[2025-10-13 04:20:12][utils:57][INFO] [P: 1.00%] [S: 20971520/2097152000] [T: 0:00:41] [ETA: 1:09:15] [loss: 9.770] [tokens/s: 591486.463] [batches/s: 0.282] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:20:47][utils:57][INFO] [P: 2.00%] [S: 41943040/2097152000] [T: 0:01:17] [ETA: 1:03:10] [loss: 8.079] [tokens/s: 592065.333] [batches/s: 0.282] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:20:47][train:194][INFO] Running validation... 
+[2025-10-13 04:22:03][logger:171][INFO] [step: 41943040] [val/train_token_count: 41943040] [val/train_batch_count: 20] [val/train_flop_count: 0] [val/train_total_time: 77.367] [val/train_update_time: 76.988] [val/loss: 7.967] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 75.470] [val/val_tokens_per_second: 542734.371] [val/loss_avg_len_2048: 7.967] [val/perplexity_len_2048: 2885.363] [val/loss_avg_len_1024: 7.966] [val/perplexity_len_1024: 2880.986] [val/loss_avg_len_512: 7.966] [val/perplexity_len_512: 2881.145] +[2025-10-13 04:22:38][utils:57][INFO] [P: 3.00%] [S: 62914560/2097152000] [T: 0:03:08] [ETA: 1:41:26] [loss: 7.445] [tokens/s: 341356.838] [batches/s: 0.163] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:23:13][utils:57][INFO] [P: 4.00%] [S: 83886080/2097152000] [T: 0:03:43] [ETA: 1:29:26] [loss: 7.106] [tokens/s: 382999.413] [batches/s: 0.183] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:23:13][train:194][INFO] Running validation... +[2025-10-13 04:24:28][logger:171][INFO] [step: 83886080] [val/train_token_count: 83886080] [val/train_batch_count: 40] [val/train_flop_count: 0] [val/train_total_time: 223.616] [val/train_update_time: 147.446] [val/loss: 7.078] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.524] [val/val_tokens_per_second: 549622.163] [val/loss_avg_len_2048: 7.078] [val/perplexity_len_2048: 1185.598] [val/loss_avg_len_1024: 7.077] [val/perplexity_len_1024: 1184.110] [val/loss_avg_len_512: 7.078] [val/perplexity_len_512: 1185.392] +[2025-10-13 04:25:03][utils:57][INFO] [P: 5.00%] [S: 104857600/2097152000] [T: 0:05:33] [ETA: 1:45:36] [loss: 6.857] [tokens/s: 317696.511] [batches/s: 0.151] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:25:03][logger:171][INFO] [step: 104857600] [train_eval/train_token_count: 104857600] [train_eval/train_batch_count: 50] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 333.522] [train_eval/train_update_time: 182.681] 
[train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 8.219] [train_eval/perplexity_len_2048: 3709.494] [train_eval/loss_avg_len_1024: 8.219] [train_eval/perplexity_len_1024: 3710.807] [train_eval/loss_avg_len_512: 8.219] [train_eval/perplexity_len_512: 3709.141] +[2025-10-13 04:25:39][utils:57][INFO] [P: 6.00%] [S: 125829120/2097152000] [T: 0:06:08] [ETA: 1:36:19] [loss: 6.597] [tokens/s: 344814.904] [batches/s: 0.164] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:25:39][train:194][INFO] Running validation... +[2025-10-13 04:26:53][logger:171][INFO] [step: 125829120] [val/train_token_count: 125829120] [val/train_batch_count: 60] [val/train_flop_count: 0] [val/train_total_time: 368.903] [val/train_update_time: 217.910] [val/loss: 6.593] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.573] [val/val_tokens_per_second: 549260.699] [val/loss_avg_len_2048: 6.593] [val/perplexity_len_2048: 730.224] [val/loss_avg_len_1024: 6.592] [val/perplexity_len_1024: 729.509] [val/loss_avg_len_512: 6.595] [val/perplexity_len_512: 731.117] +[2025-10-13 04:27:29][utils:57][INFO] [P: 7.00%] [S: 146800640/2097152000] [T: 0:07:58] [ETA: 1:46:02] [loss: 6.423] [tokens/s: 308671.490] [batches/s: 0.147] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:28:04][utils:57][INFO] [P: 8.00%] [S: 167772160/2097152000] [T: 0:08:34] [ETA: 1:38:33] [loss: 6.240] [tokens/s: 328605.049] [batches/s: 0.157] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:28:04][train:194][INFO] Running validation... 
+[2025-10-13 04:29:18][logger:171][INFO] [step: 167772160] [val/train_token_count: 167772160] [val/train_batch_count: 80] [val/train_flop_count: 0] [val/train_total_time: 514.244] [val/train_update_time: 288.362] [val/loss: 6.212] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.470] [val/val_tokens_per_second: 550023.731] [val/loss_avg_len_2048: 6.212] [val/perplexity_len_2048: 498.778] [val/loss_avg_len_1024: 6.212] [val/perplexity_len_1024: 498.754] [val/loss_avg_len_512: 6.216] [val/perplexity_len_512: 500.573] +[2025-10-13 04:29:54][utils:57][INFO] [P: 9.00%] [S: 188743680/2097152000] [T: 0:10:24] [ETA: 1:45:10] [loss: 6.051] [tokens/s: 303973.820] [batches/s: 0.145] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:30:29][utils:57][INFO] [P: 10.00%] [S: 209715200/2097152000] [T: 0:10:59] [ETA: 1:38:55] [loss: 5.919] [tokens/s: 319710.612] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:30:29][logger:171][INFO] [step: 209715200] [train_eval/train_token_count: 209715200] [train_eval/train_batch_count: 100] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 659.461] [train_eval/train_update_time: 358.814] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.337] [train_eval/perplexity_len_2048: 565.198] [train_eval/loss_avg_len_1024: 6.339] [train_eval/perplexity_len_1024: 566.417] [train_eval/loss_avg_len_512: 6.342] [train_eval/perplexity_len_512: 567.789] +[2025-10-13 04:30:29][train:194][INFO] Running validation... 
+[2025-10-13 04:31:44][logger:171][INFO] [step: 209715200] [val/train_token_count: 209715200] [val/train_batch_count: 100] [val/train_flop_count: 0] [val/train_total_time: 659.461] [val/train_update_time: 358.814] [val/loss: 5.906] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.661] [val/val_tokens_per_second: 548616.239] [val/loss_avg_len_2048: 5.906] [val/perplexity_len_2048: 367.361] [val/loss_avg_len_1024: 5.908] [val/perplexity_len_1024: 367.914] [val/loss_avg_len_512: 5.914] [val/perplexity_len_512: 370.216] +[2025-10-13 04:31:44][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000000209715200.pt... +[2025-10-13 04:31:44][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000000209715200.pt. +[2025-10-13 04:31:44][logger:171][INFO] [step: 209715200] [checkpoint/checkpoint_time: 0.638] +[2025-10-13 04:32:20][utils:57][INFO] [P: 11.00%] [S: 230686720/2097152000] [T: 0:12:50] [ETA: 1:43:51] [loss: 5.785] [tokens/s: 286519.111] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:32:55][utils:57][INFO] [P: 12.00%] [S: 251658240/2097152000] [T: 0:13:25] [ETA: 1:38:27] [loss: 5.645] [tokens/s: 319846.018] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:32:55][train:194][INFO] Running validation... 
+[2025-10-13 04:34:10][logger:171][INFO] [step: 251658240] [val/train_token_count: 251658240] [val/train_batch_count: 120] [val/train_flop_count: 0] [val/train_total_time: 805.506] [val/train_update_time: 429.270] [val/loss: 5.654] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 75.211] [val/val_tokens_per_second: 544602.455] [val/loss_avg_len_2048: 5.654] [val/perplexity_len_2048: 285.535] [val/loss_avg_len_1024: 5.657] [val/perplexity_len_1024: 286.418] [val/loss_avg_len_512: 5.666] [val/perplexity_len_512: 288.923] +[2025-10-13 04:34:46][utils:57][INFO] [P: 13.00%] [S: 272629760/2097152000] [T: 0:15:16] [ETA: 1:42:10] [loss: 5.578] [tokens/s: 286634.761] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:35:21][utils:57][INFO] [P: 14.00%] [S: 293601280/2097152000] [T: 0:15:51] [ETA: 1:37:24] [loss: 5.487] [tokens/s: 319513.522] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:35:21][train:194][INFO] Running validation... +[2025-10-13 04:36:36][logger:171][INFO] [step: 293601280] [val/train_token_count: 293601280] [val/train_batch_count: 140] [val/train_flop_count: 0] [val/train_total_time: 951.478] [val/train_update_time: 499.734] [val/loss: 5.471] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.619] [val/val_tokens_per_second: 548918.360] [val/loss_avg_len_2048: 5.471] [val/perplexity_len_2048: 237.717] [val/loss_avg_len_1024: 5.476] [val/perplexity_len_1024: 238.770] [val/loss_avg_len_512: 5.486] [val/perplexity_len_512: 241.317] +[2025-10-13 04:37:11][utils:57][INFO] [P: 15.00%] [S: 314572800/2097152000] [T: 0:17:41] [ETA: 1:40:15] [loss: 5.374] [tokens/s: 286597.035] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:37:11][logger:171][INFO] [step: 314572800] [train_eval/train_token_count: 314572800] [train_eval/train_batch_count: 150] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1061.491] [train_eval/train_update_time: 534.962] 
[train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.623] [train_eval/perplexity_len_2048: 276.745] [train_eval/loss_avg_len_1024: 5.627] [train_eval/perplexity_len_1024: 277.857] [train_eval/loss_avg_len_512: 5.635] [train_eval/perplexity_len_512: 280.049] +[2025-10-13 04:37:47][utils:57][INFO] [P: 16.00%] [S: 335544320/2097152000] [T: 0:18:16] [ETA: 1:35:58] [loss: 5.322] [tokens/s: 319485.024] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:37:47][train:194][INFO] Running validation... +[2025-10-13 04:39:01][logger:171][INFO] [step: 335544320] [val/train_token_count: 335544320] [val/train_batch_count: 160] [val/train_flop_count: 0] [val/train_total_time: 1096.876] [val/train_update_time: 570.191] [val/loss: 5.306] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.726] [val/val_tokens_per_second: 548136.181] [val/loss_avg_len_2048: 5.306] [val/perplexity_len_2048: 201.613] [val/loss_avg_len_1024: 5.312] [val/perplexity_len_1024: 202.759] [val/loss_avg_len_512: 5.324] [val/perplexity_len_512: 205.275] +[2025-10-13 04:39:37][utils:57][INFO] [P: 17.00%] [S: 356515840/2097152000] [T: 0:20:06] [ETA: 1:38:12] [loss: 5.217] [tokens/s: 286535.928] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:40:12][utils:57][INFO] [P: 18.00%] [S: 377487360/2097152000] [T: 0:20:42] [ETA: 1:34:19] [loss: 5.194] [tokens/s: 319360.960] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:40:12][train:194][INFO] Running validation... 
+[2025-10-13 04:41:27][logger:171][INFO] [step: 377487360] [val/train_token_count: 377487360] [val/train_batch_count: 180] [val/train_flop_count: 0] [val/train_total_time: 1242.364] [val/train_update_time: 640.650] [val/loss: 5.185] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 75.069] [val/val_tokens_per_second: 545629.639] [val/loss_avg_len_2048: 5.185] [val/perplexity_len_2048: 178.495] [val/loss_avg_len_1024: 5.191] [val/perplexity_len_1024: 179.712] [val/loss_avg_len_512: 5.206] [val/perplexity_len_512: 182.304] +[2025-10-13 04:42:03][utils:57][INFO] [P: 19.00%] [S: 398458880/2097152000] [T: 0:22:32] [ETA: 1:36:07] [loss: 5.164] [tokens/s: 286291.562] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:42:38][utils:57][INFO] [P: 20.00%] [S: 419430400/2097152000] [T: 0:23:08] [ETA: 1:32:32] [loss: 5.069] [tokens/s: 319462.979] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:42:38][logger:171][INFO] [step: 419430400] [train_eval/train_token_count: 419430400] [train_eval/train_batch_count: 200] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1388.207] [train_eval/train_update_time: 711.111] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.218] [train_eval/perplexity_len_2048: 184.590] [train_eval/loss_avg_len_1024: 5.225] [train_eval/perplexity_len_1024: 185.920] [train_eval/loss_avg_len_512: 5.238] [train_eval/perplexity_len_512: 188.286] +[2025-10-13 04:42:38][train:194][INFO] Running validation... 
+[2025-10-13 04:43:52][logger:171][INFO] [step: 419430400] [val/train_token_count: 419430400] [val/train_batch_count: 200] [val/train_flop_count: 0] [val/train_total_time: 1388.207] [val/train_update_time: 711.111] [val/loss: 5.070] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.477] [val/val_tokens_per_second: 549967.363] [val/loss_avg_len_2048: 5.070] [val/perplexity_len_2048: 159.104] [val/loss_avg_len_1024: 5.078] [val/perplexity_len_1024: 160.392] [val/loss_avg_len_512: 5.094] [val/perplexity_len_512: 163.046] +[2025-10-13 04:43:52][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000000419430400.pt... +[2025-10-13 04:43:53][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000000419430400.pt. +[2025-10-13 04:43:53][logger:171][INFO] [step: 419430400] [checkpoint/checkpoint_time: 0.627] +[2025-10-13 04:44:28][utils:57][INFO] [P: 21.00%] [S: 440401920/2097152000] [T: 0:24:58] [ETA: 1:33:57] [loss: 5.027] [tokens/s: 286360.798] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:45:04][utils:57][INFO] [P: 22.00%] [S: 461373440/2097152000] [T: 0:25:34] [ETA: 1:30:39] [loss: 4.985] [tokens/s: 319487.139] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:45:04][train:194][INFO] Running validation... 
+[2025-10-13 04:46:19][logger:171][INFO] [step: 461373440] [val/train_token_count: 461373440] [val/train_batch_count: 220] [val/train_flop_count: 0] [val/train_total_time: 1534.112] [val/train_update_time: 781.583] [val/loss: 4.976] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 75.624] [val/val_tokens_per_second: 541624.450] [val/loss_avg_len_2048: 4.976] [val/perplexity_len_2048: 144.828] [val/loss_avg_len_1024: 4.985] [val/perplexity_len_1024: 146.184] [val/loss_avg_len_512: 5.003] [val/perplexity_len_512: 148.868] +[2025-10-13 04:46:55][utils:57][INFO] [P: 23.00%] [S: 482344960/2097152000] [T: 0:27:25] [ETA: 1:31:47] [loss: 4.932] [tokens/s: 286176.366] [batches/s: 0.136] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:47:30][utils:57][INFO] [P: 24.00%] [S: 503316480/2097152000] [T: 0:28:00] [ETA: 1:28:41] [loss: 4.875] [tokens/s: 318991.880] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:47:30][train:194][INFO] Running validation... +[2025-10-13 04:48:47][logger:171][INFO] [step: 503316480] [val/train_token_count: 503316480] [val/train_batch_count: 240] [val/train_flop_count: 0] [val/train_total_time: 1680.500] [val/train_update_time: 852.097] [val/loss: 4.895] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 77.033] [val/val_tokens_per_second: 531719.492] [val/loss_avg_len_2048: 4.895] [val/perplexity_len_2048: 133.679] [val/loss_avg_len_1024: 4.906] [val/perplexity_len_1024: 135.103] [val/loss_avg_len_512: 4.926] [val/perplexity_len_512: 137.869] +[2025-10-13 04:49:23][utils:57][INFO] [P: 25.00%] [S: 524288000/2097152000] [T: 0:29:52] [ETA: 1:29:38] [loss: 4.870] [tokens/s: 285233.108] [batches/s: 0.136] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:49:23][logger:171][INFO] [step: 524288000] [train_eval/train_token_count: 524288000] [train_eval/train_batch_count: 250] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1792.928] [train_eval/train_update_time: 887.347] 
[train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.962] [train_eval/perplexity_len_2048: 142.942] [train_eval/loss_avg_len_1024: 4.970] [train_eval/perplexity_len_1024: 144.043] [train_eval/loss_avg_len_512: 4.987] [train_eval/perplexity_len_512: 146.503] +[2025-10-13 04:49:58][utils:57][INFO] [P: 26.00%] [S: 545259520/2097152000] [T: 0:30:28] [ETA: 1:26:43] [loss: 4.817] [tokens/s: 317860.781] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:49:58][train:194][INFO] Running validation... +[2025-10-13 04:51:15][logger:171][INFO] [step: 545259520] [val/train_token_count: 545259520] [val/train_batch_count: 260] [val/train_flop_count: 0] [val/train_total_time: 1828.323] [val/train_update_time: 922.602] [val/loss: 4.820] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 76.827] [val/val_tokens_per_second: 533145.204] [val/loss_avg_len_2048: 4.820] [val/perplexity_len_2048: 123.911] [val/loss_avg_len_1024: 4.831] [val/perplexity_len_1024: 125.385] [val/loss_avg_len_512: 4.854] [val/perplexity_len_512: 128.238] +[2025-10-13 04:51:50][utils:57][INFO] [P: 27.00%] [S: 566231040/2097152000] [T: 0:32:20] [ETA: 1:27:26] [loss: 4.806] [tokens/s: 284395.974] [batches/s: 0.136] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:52:26][utils:57][INFO] [P: 28.00%] [S: 587202560/2097152000] [T: 0:32:55] [ETA: 1:24:41] [loss: 4.739] [tokens/s: 316987.083] [batches/s: 0.151] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:52:26][train:194][INFO] Running validation... 
+[2025-10-13 04:53:42][logger:171][INFO] [step: 587202560] [val/train_token_count: 587202560] [val/train_batch_count: 280] [val/train_flop_count: 0] [val/train_total_time: 1975.955] [val/train_update_time: 993.123] [val/loss: 4.752] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 76.377] [val/val_tokens_per_second: 536286.449] [val/loss_avg_len_2048: 4.752] [val/perplexity_len_2048: 115.836] [val/loss_avg_len_1024: 4.766] [val/perplexity_len_1024: 117.399] [val/loss_avg_len_512: 4.791] [val/perplexity_len_512: 120.371] +[2025-10-13 04:54:17][utils:57][INFO] [P: 29.00%] [S: 608174080/2097152000] [T: 0:34:47] [ETA: 1:25:11] [loss: 4.711] [tokens/s: 283879.059] [batches/s: 0.135] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:54:53][utils:57][INFO] [P: 30.00%] [S: 629145600/2097152000] [T: 0:35:23] [ETA: 1:22:33] [loss: 4.675] [tokens/s: 316371.005] [batches/s: 0.151] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:54:53][logger:171][INFO] [step: 629145600] [train_eval/train_token_count: 629145600] [train_eval/train_batch_count: 300] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2123.110] [train_eval/train_update_time: 1063.612] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.779] [train_eval/perplexity_len_2048: 118.961] [train_eval/loss_avg_len_1024: 4.789] [train_eval/perplexity_len_1024: 120.142] [train_eval/loss_avg_len_512: 4.811] [train_eval/perplexity_len_512: 122.795] +[2025-10-13 04:54:53][train:194][INFO] Running validation... 
+[2025-10-13 04:56:08][logger:171][INFO] [step: 629145600] [val/train_token_count: 629145600] [val/train_batch_count: 300] [val/train_flop_count: 0] [val/train_total_time: 2123.110] [val/train_update_time: 1063.612] [val/loss: 4.689] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.720] [val/val_tokens_per_second: 548182.963] [val/loss_avg_len_2048: 4.689] [val/perplexity_len_2048: 108.789] [val/loss_avg_len_1024: 4.705] [val/perplexity_len_1024: 110.479] [val/loss_avg_len_512: 4.733] [val/perplexity_len_512: 113.677] +[2025-10-13 04:56:08][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000000629145600.pt... +[2025-10-13 04:56:08][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000000629145600.pt. +[2025-10-13 04:56:08][logger:171][INFO] [step: 629145600] [checkpoint/checkpoint_time: 0.611] +[2025-10-13 04:56:44][utils:57][INFO] [P: 31.00%] [S: 650117120/2097152000] [T: 0:37:13] [ETA: 1:22:52] [loss: 4.688] [tokens/s: 283793.798] [batches/s: 0.135] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:57:19][utils:57][INFO] [P: 32.00%] [S: 671088640/2097152000] [T: 0:37:49] [ETA: 1:20:22] [loss: 4.614] [tokens/s: 316521.289] [batches/s: 0.151] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:57:19][train:194][INFO] Running validation... 
+[2025-10-13 04:58:34][logger:171][INFO] [step: 671088640] [val/train_token_count: 671088640] [val/train_batch_count: 320] [val/train_flop_count: 0] [val/train_total_time: 2269.221] [val/train_update_time: 1134.096] [val/loss: 4.627] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.625] [val/val_tokens_per_second: 548879.706] [val/loss_avg_len_2048: 4.627] [val/perplexity_len_2048: 102.171] [val/loss_avg_len_1024: 4.645] [val/perplexity_len_1024: 104.022] [val/loss_avg_len_512: 4.677] [val/perplexity_len_512: 107.467] +[2025-10-13 04:59:09][utils:57][INFO] [P: 33.00%] [S: 692060160/2097152000] [T: 0:39:39] [ETA: 1:20:30] [loss: 4.624] [tokens/s: 284187.048] [batches/s: 0.136] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:59:44][utils:57][INFO] [P: 34.00%] [S: 713031680/2097152000] [T: 0:40:14] [ETA: 1:18:07] [loss: 4.571] [tokens/s: 317673.799] [batches/s: 0.151] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 04:59:44][train:194][INFO] Running validation... +[2025-10-13 05:01:00][logger:171][INFO] [step: 713031680] [val/train_token_count: 713031680] [val/train_batch_count: 340] [val/train_flop_count: 0] [val/train_total_time: 2414.640] [val/train_update_time: 1204.582] [val/loss: 4.561] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 75.265] [val/val_tokens_per_second: 544207.657] [val/loss_avg_len_2048: 4.561] [val/perplexity_len_2048: 95.712] [val/loss_avg_len_1024: 4.583] [val/perplexity_len_1024: 97.802] [val/loss_avg_len_512: 4.621] [val/perplexity_len_512: 101.627] +[2025-10-13 05:01:35][utils:57][INFO] [P: 35.00%] [S: 734003200/2097152000] [T: 0:42:05] [ETA: 1:18:09] [loss: 4.534] [tokens/s: 284873.417] [batches/s: 0.136] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:01:35][logger:171][INFO] [step: 734003200] [train_eval/train_token_count: 734003200] [train_eval/train_batch_count: 350] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2525.280] [train_eval/train_update_time: 1239.831] 
[train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.612] [train_eval/perplexity_len_2048: 100.698] [train_eval/loss_avg_len_1024: 4.630] [train_eval/perplexity_len_1024: 102.494] [train_eval/loss_avg_len_512: 4.663] [train_eval/perplexity_len_512: 105.901] +[2025-10-13 05:02:10][utils:57][INFO] [P: 36.00%] [S: 754974720/2097152000] [T: 0:42:40] [ETA: 1:15:52] [loss: 4.454] [tokens/s: 318435.270] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:02:10][train:194][INFO] Running validation... +[2025-10-13 05:03:25][logger:171][INFO] [step: 754974720] [val/train_token_count: 754974720] [val/train_batch_count: 360] [val/train_flop_count: 0] [val/train_total_time: 2560.697] [val/train_update_time: 1275.092] [val/loss: 4.491] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.286] [val/val_tokens_per_second: 551381.465] [val/loss_avg_len_2048: 4.491] [val/perplexity_len_2048: 89.249] [val/loss_avg_len_1024: 4.518] [val/perplexity_len_1024: 91.636] [val/loss_avg_len_512: 4.563] [val/perplexity_len_512: 95.851] +[2025-10-13 05:04:00][utils:57][INFO] [P: 37.00%] [S: 775946240/2097152000] [T: 0:44:30] [ETA: 1:15:46] [loss: 4.472] [tokens/s: 285864.892] [batches/s: 0.136] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:04:35][utils:57][INFO] [P: 38.00%] [S: 796917760/2097152000] [T: 0:45:05] [ETA: 1:13:34] [loss: 4.423] [tokens/s: 319466.094] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:04:35][train:194][INFO] Running validation... 
+[2025-10-13 05:05:50][logger:171][INFO] [step: 796917760] [val/train_token_count: 796917760] [val/train_batch_count: 380] [val/train_flop_count: 0] [val/train_total_time: 2705.771] [val/train_update_time: 1345.541] [val/loss: 4.431] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.472] [val/val_tokens_per_second: 550003.693] [val/loss_avg_len_2048: 4.431] [val/perplexity_len_2048: 84.013] [val/loss_avg_len_1024: 4.462] [val/perplexity_len_1024: 86.673] [val/loss_avg_len_512: 4.513] [val/perplexity_len_512: 91.239] +[2025-10-13 05:06:25][utils:57][INFO] [P: 39.00%] [S: 817889280/2097152000] [T: 0:46:55] [ETA: 1:13:23] [loss: 4.414] [tokens/s: 286621.674] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:07:01][utils:57][INFO] [P: 40.00%] [S: 838860800/2097152000] [T: 0:47:31] [ETA: 1:11:16] [loss: 4.290] [tokens/s: 319887.675] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:07:01][logger:171][INFO] [step: 838860800] [train_eval/train_token_count: 838860800] [train_eval/train_batch_count: 400] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2851.025] [train_eval/train_update_time: 1415.981] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.448] [train_eval/perplexity_len_2048: 85.459] [train_eval/loss_avg_len_1024: 4.474] [train_eval/perplexity_len_1024: 87.732] [train_eval/loss_avg_len_512: 4.523] [train_eval/perplexity_len_512: 92.139] +[2025-10-13 05:07:01][train:194][INFO] Running validation... 
+[2025-10-13 05:08:15][logger:171][INFO] [step: 838860800] [val/train_token_count: 838860800] [val/train_batch_count: 400] [val/train_flop_count: 0] [val/train_total_time: 2851.025] [val/train_update_time: 1415.981] [val/loss: 4.366] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.159] [val/val_tokens_per_second: 552326.836] [val/loss_avg_len_2048: 4.366] [val/perplexity_len_2048: 78.753] [val/loss_avg_len_1024: 4.403] [val/perplexity_len_1024: 81.672] [val/loss_avg_len_512: 4.461] [val/perplexity_len_512: 86.590] +[2025-10-13 05:08:15][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000000838860800.pt... +[2025-10-13 05:08:16][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000000838860800.pt. +[2025-10-13 05:08:16][logger:171][INFO] [step: 838860800] [checkpoint/checkpoint_time: 0.666] +[2025-10-13 05:08:51][utils:57][INFO] [P: 41.00%] [S: 859832320/2097152000] [T: 0:49:21] [ETA: 1:11:01] [loss: 4.317] [tokens/s: 286820.189] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:09:26][utils:57][INFO] [P: 42.00%] [S: 880803840/2097152000] [T: 0:49:56] [ETA: 1:08:58] [loss: 4.294] [tokens/s: 319788.327] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:09:26][train:194][INFO] Running validation... 
+[2025-10-13 05:10:42][logger:171][INFO] [step: 880803840] [val/train_token_count: 880803840] [val/train_batch_count: 420] [val/train_flop_count: 0] [val/train_total_time: 2996.631] [val/train_update_time: 1486.429] [val/loss: 4.308] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 75.161] [val/val_tokens_per_second: 544965.835] [val/loss_avg_len_2048: 4.308] [val/perplexity_len_2048: 74.259] [val/loss_avg_len_1024: 4.348] [val/perplexity_len_1024: 77.353] [val/loss_avg_len_512: 4.412] [val/perplexity_len_512: 82.458] +[2025-10-13 05:11:17][utils:57][INFO] [P: 43.00%] [S: 901775360/2097152000] [T: 0:51:47] [ETA: 1:08:38] [loss: 4.289] [tokens/s: 286605.159] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:11:52][utils:57][INFO] [P: 44.00%] [S: 922746880/2097152000] [T: 0:52:22] [ETA: 1:06:39] [loss: 4.305] [tokens/s: 319833.539] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:11:52][train:194][INFO] Running validation... +[2025-10-13 05:13:07][logger:171][INFO] [step: 922746880] [val/train_token_count: 922746880] [val/train_batch_count: 440] [val/train_flop_count: 0] [val/train_total_time: 3142.596] [val/train_update_time: 1556.887] [val/loss: 4.261] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.317] [val/val_tokens_per_second: 551150.681] [val/loss_avg_len_2048: 4.261] [val/perplexity_len_2048: 70.871] [val/loss_avg_len_1024: 4.305] [val/perplexity_len_1024: 74.083] [val/loss_avg_len_512: 4.374] [val/perplexity_len_512: 79.348] +[2025-10-13 05:13:42][utils:57][INFO] [P: 45.00%] [S: 943718400/2097152000] [T: 0:54:12] [ETA: 1:06:15] [loss: 4.239] [tokens/s: 286965.718] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:13:42][logger:171][INFO] [step: 943718400] [train_eval/train_token_count: 943718400] [train_eval/train_batch_count: 450] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3252.316] [train_eval/train_update_time: 1592.112] 
[train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.299] [train_eval/perplexity_len_2048: 73.599] [train_eval/loss_avg_len_1024: 4.339] [train_eval/perplexity_len_1024: 76.608] [train_eval/loss_avg_len_512: 4.404] [train_eval/perplexity_len_512: 81.781] +[2025-10-13 05:14:17][utils:57][INFO] [P: 46.00%] [S: 964689920/2097152000] [T: 0:54:47] [ETA: 1:04:19] [loss: 4.203] [tokens/s: 319821.510] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:14:17][train:194][INFO] Running validation... +[2025-10-13 05:15:32][logger:171][INFO] [step: 964689920] [val/train_token_count: 964689920] [val/train_batch_count: 460] [val/train_flop_count: 0] [val/train_total_time: 3287.701] [val/train_update_time: 1627.339] [val/loss: 4.223] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.425] [val/val_tokens_per_second: 550355.646] [val/loss_avg_len_2048: 4.223] [val/perplexity_len_2048: 68.256] [val/loss_avg_len_1024: 4.271] [val/perplexity_len_1024: 71.620] [val/loss_avg_len_512: 4.344] [val/perplexity_len_512: 76.978] +[2025-10-13 05:16:07][utils:57][INFO] [P: 47.00%] [S: 985661440/2097152000] [T: 0:56:37] [ETA: 1:03:51] [loss: 4.192] [tokens/s: 286927.909] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:16:43][utils:57][INFO] [P: 48.00%] [S: 1006632960/2097152000] [T: 0:57:12] [ETA: 1:01:58] [loss: 4.181] [tokens/s: 319855.031] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:16:43][train:194][INFO] Running validation... 
+[2025-10-13 05:17:57][logger:171][INFO] [step: 1006632960] [val/train_token_count: 1006632960] [val/train_batch_count: 480] [val/train_flop_count: 0] [val/train_total_time: 3432.890] [val/train_update_time: 1697.783] [val/loss: 4.185] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.375] [val/val_tokens_per_second: 550725.509] [val/loss_avg_len_2048: 4.185] [val/perplexity_len_2048: 65.697] [val/loss_avg_len_1024: 4.235] [val/perplexity_len_1024: 69.081] [val/loss_avg_len_512: 4.310] [val/perplexity_len_512: 74.471] +[2025-10-13 05:18:32][utils:57][INFO] [P: 49.00%] [S: 1027604480/2097152000] [T: 0:59:02] [ETA: 1:01:27] [loss: 4.173] [tokens/s: 286974.547] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:19:08][utils:57][INFO] [P: 50.00%] [S: 1048576000/2097152000] [T: 0:59:38] [ETA: 0:59:38] [loss: 4.142] [tokens/s: 320079.252] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:19:08][logger:171][INFO] [step: 1048576000] [train_eval/train_token_count: 1048576000] [train_eval/train_batch_count: 500] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3578.041] [train_eval/train_update_time: 1768.242] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.191] [train_eval/perplexity_len_2048: 66.116] [train_eval/loss_avg_len_1024: 4.237] [train_eval/perplexity_len_1024: 69.179] [train_eval/loss_avg_len_512: 4.311] [train_eval/perplexity_len_512: 74.482] +[2025-10-13 05:19:08][train:194][INFO] Running validation... 
+[2025-10-13 05:20:22][logger:171][INFO] [step: 1048576000] [val/train_token_count: 1048576000] [val/train_batch_count: 500] [val/train_flop_count: 0] [val/train_total_time: 3578.041] [val/train_update_time: 1768.242] [val/loss: 4.149] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.343] [val/val_tokens_per_second: 550959.342] [val/loss_avg_len_2048: 4.149] [val/perplexity_len_2048: 63.383] [val/loss_avg_len_1024: 4.201] [val/perplexity_len_1024: 66.786] [val/loss_avg_len_512: 4.279] [val/perplexity_len_512: 72.190] +[2025-10-13 05:20:22][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000001048576000.pt... +[2025-10-13 05:20:23][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000001048576000.pt. +[2025-10-13 05:20:23][logger:171][INFO] [step: 1048576000] [checkpoint/checkpoint_time: 0.766] +[2025-10-13 05:20:58][utils:57][INFO] [P: 51.00%] [S: 1069547520/2097152000] [T: 1:01:28] [ETA: 0:59:03] [loss: 4.121] [tokens/s: 286864.354] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:21:34][utils:57][INFO] [P: 52.00%] [S: 1090519040/2097152000] [T: 1:02:03] [ETA: 0:57:17] [loss: 4.120] [tokens/s: 320109.733] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:21:34][train:194][INFO] Running validation... 
+[2025-10-13 05:22:48][logger:171][INFO] [step: 1090519040] [val/train_token_count: 1090519040] [val/train_batch_count: 520] [val/train_flop_count: 0] [val/train_total_time: 3723.923] [val/train_update_time: 1838.693] [val/loss: 4.129] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.465] [val/val_tokens_per_second: 550059.590] [val/loss_avg_len_2048: 4.129] [val/perplexity_len_2048: 62.096] [val/loss_avg_len_1024: 4.183] [val/perplexity_len_1024: 65.589] [val/loss_avg_len_512: 4.264] [val/perplexity_len_512: 71.120] +[2025-10-13 05:23:24][utils:57][INFO] [P: 53.00%] [S: 1111490560/2097152000] [T: 1:03:53] [ETA: 0:56:39] [loss: 4.105] [tokens/s: 287144.886] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:23:59][utils:57][INFO] [P: 54.00%] [S: 1132462080/2097152000] [T: 1:04:29] [ETA: 0:54:55] [loss: 4.105] [tokens/s: 320047.173] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:23:59][train:194][INFO] Running validation... +[2025-10-13 05:25:13][logger:171][INFO] [step: 1132462080] [val/train_token_count: 1132462080] [val/train_batch_count: 540] [val/train_flop_count: 0] [val/train_total_time: 3869.169] [val/train_update_time: 1909.147] [val/loss: 4.098] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.465] [val/val_tokens_per_second: 550059.656] [val/loss_avg_len_2048: 4.098] [val/perplexity_len_2048: 60.230] [val/loss_avg_len_1024: 4.154] [val/perplexity_len_1024: 63.666] [val/loss_avg_len_512: 4.235] [val/perplexity_len_512: 69.090] +[2025-10-13 05:25:49][utils:57][INFO] [P: 55.00%] [S: 1153433600/2097152000] [T: 1:06:19] [ETA: 0:54:15] [loss: 4.049] [tokens/s: 287099.609] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:25:49][logger:171][INFO] [step: 1153433600] [train_eval/train_token_count: 1153433600] [train_eval/train_batch_count: 550] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3979.018] [train_eval/train_update_time: 1944.378] 
[train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.112] [train_eval/perplexity_len_2048: 61.077] [train_eval/loss_avg_len_1024: 4.162] [train_eval/perplexity_len_1024: 64.171] [train_eval/loss_avg_len_512: 4.239] [train_eval/perplexity_len_512: 69.355] +[2025-10-13 05:26:24][utils:57][INFO] [P: 56.00%] [S: 1174405120/2097152000] [T: 1:06:54] [ETA: 0:52:34] [loss: 4.094] [tokens/s: 320036.846] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:26:24][train:194][INFO] Running validation... +[2025-10-13 05:27:39][logger:171][INFO] [step: 1174405120] [val/train_token_count: 1174405120] [val/train_batch_count: 560] [val/train_flop_count: 0] [val/train_total_time: 4014.402] [val/train_update_time: 1979.612] [val/loss: 4.073] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.795] [val/val_tokens_per_second: 547632.517] [val/loss_avg_len_2048: 4.073] [val/perplexity_len_2048: 58.717] [val/loss_avg_len_1024: 4.129] [val/perplexity_len_1024: 62.139] [val/loss_avg_len_512: 4.212] [val/perplexity_len_512: 67.491] +[2025-10-13 05:28:14][utils:57][INFO] [P: 57.00%] [S: 1195376640/2097152000] [T: 1:08:44] [ETA: 0:51:51] [loss: 4.016] [tokens/s: 286946.236] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:28:50][utils:57][INFO] [P: 58.00%] [S: 1216348160/2097152000] [T: 1:09:19] [ETA: 0:50:12] [loss: 4.078] [tokens/s: 319822.627] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:28:50][train:194][INFO] Running validation... 
+[2025-10-13 05:30:04][logger:171][INFO] [step: 1216348160] [val/train_token_count: 1216348160] [val/train_batch_count: 580] [val/train_flop_count: 0] [val/train_total_time: 4159.974] [val/train_update_time: 2050.064] [val/loss: 4.054] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.715] [val/val_tokens_per_second: 548218.992] [val/loss_avg_len_2048: 4.054] [val/perplexity_len_2048: 57.623] [val/loss_avg_len_1024: 4.111] [val/perplexity_len_1024: 61.029] [val/loss_avg_len_512: 4.195] [val/perplexity_len_512: 66.360] +[2025-10-13 05:30:40][utils:57][INFO] [P: 59.00%] [S: 1237319680/2097152000] [T: 1:11:10] [ETA: 0:49:27] [loss: 4.082] [tokens/s: 286811.315] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:31:15][utils:57][INFO] [P: 60.00%] [S: 1258291200/2097152000] [T: 1:11:45] [ETA: 0:47:50] [loss: 4.078] [tokens/s: 320020.010] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:31:15][logger:171][INFO] [step: 1258291200] [train_eval/train_token_count: 1258291200] [train_eval/train_batch_count: 600] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4305.465] [train_eval/train_update_time: 2120.524] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.053] [train_eval/perplexity_len_2048: 57.598] [train_eval/loss_avg_len_1024: 4.104] [train_eval/perplexity_len_1024: 60.585] [train_eval/loss_avg_len_512: 4.187] [train_eval/perplexity_len_512: 65.821] +[2025-10-13 05:31:15][train:194][INFO] Running validation... 
+[2025-10-13 05:32:30][logger:171][INFO] [step: 1258291200] [val/train_token_count: 1258291200] [val/train_batch_count: 600] [val/train_flop_count: 0] [val/train_total_time: 4305.465] [val/train_update_time: 2120.524] [val/loss: 4.036] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.652] [val/val_tokens_per_second: 548676.153] [val/loss_avg_len_2048: 4.036] [val/perplexity_len_2048: 56.618] [val/loss_avg_len_1024: 4.095] [val/perplexity_len_1024: 60.022] [val/loss_avg_len_512: 4.180] [val/perplexity_len_512: 65.344] +[2025-10-13 05:32:30][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000001258291200.pt... +[2025-10-13 05:32:31][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000001258291200.pt. +[2025-10-13 05:32:31][logger:171][INFO] [step: 1258291200] [checkpoint/checkpoint_time: 0.753] +[2025-10-13 05:33:06][utils:57][INFO] [P: 61.00%] [S: 1279262720/2097152000] [T: 1:13:36] [ETA: 0:47:03] [loss: 4.035] [tokens/s: 286692.593] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:33:41][utils:57][INFO] [P: 62.00%] [S: 1300234240/2097152000] [T: 1:14:11] [ETA: 0:45:28] [loss: 4.013] [tokens/s: 319554.068] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:33:41][train:194][INFO] Running validation... 
+[2025-10-13 05:34:56][logger:171][INFO] [step: 1300234240] [val/train_token_count: 1300234240] [val/train_batch_count: 620] [val/train_flop_count: 0] [val/train_total_time: 4451.645] [val/train_update_time: 2190.979] [val/loss: 4.019] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.978] [val/val_tokens_per_second: 546294.032] [val/loss_avg_len_2048: 4.019] [val/perplexity_len_2048: 55.629] [val/loss_avg_len_1024: 4.078] [val/perplexity_len_1024: 59.009] [val/loss_avg_len_512: 4.163] [val/perplexity_len_512: 64.282] +[2025-10-13 05:35:32][utils:57][INFO] [P: 63.00%] [S: 1321205760/2097152000] [T: 1:16:02] [ETA: 0:44:39] [loss: 4.033] [tokens/s: 286491.488] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:36:07][utils:57][INFO] [P: 64.00%] [S: 1342177280/2097152000] [T: 1:16:37] [ETA: 0:43:06] [loss: 4.017] [tokens/s: 319301.824] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:36:07][train:194][INFO] Running validation... +[2025-10-13 05:37:22][logger:171][INFO] [step: 1342177280] [val/train_token_count: 1342177280] [val/train_batch_count: 640] [val/train_flop_count: 0] [val/train_total_time: 4597.406] [val/train_update_time: 2261.444] [val/loss: 4.007] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 75.170] [val/val_tokens_per_second: 544901.226] [val/loss_avg_len_2048: 4.007] [val/perplexity_len_2048: 54.968] [val/loss_avg_len_1024: 4.068] [val/perplexity_len_1024: 58.421] [val/loss_avg_len_512: 4.155] [val/perplexity_len_512: 63.732] +[2025-10-13 05:37:58][utils:57][INFO] [P: 65.00%] [S: 1363148800/2097152000] [T: 1:18:27] [ETA: 0:42:15] [loss: 3.986] [tokens/s: 286210.855] [batches/s: 0.136] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:37:58][logger:171][INFO] [step: 1363148800] [train_eval/train_token_count: 1363148800] [train_eval/train_batch_count: 650] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4707.963] [train_eval/train_update_time: 2296.670] 
[train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.014] [train_eval/perplexity_len_2048: 55.371] [train_eval/loss_avg_len_1024: 4.071] [train_eval/perplexity_len_1024: 58.641] [train_eval/loss_avg_len_512: 4.157] [train_eval/perplexity_len_512: 63.850] +[2025-10-13 05:38:33][utils:57][INFO] [P: 66.00%] [S: 1384120320/2097152000] [T: 1:19:03] [ETA: 0:40:43] [loss: 4.021] [tokens/s: 319110.520] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:38:33][train:194][INFO] Running validation... +[2025-10-13 05:39:48][logger:171][INFO] [step: 1384120320] [val/train_token_count: 1384120320] [val/train_batch_count: 660] [val/train_flop_count: 0] [val/train_total_time: 4743.360] [val/train_update_time: 2331.894] [val/loss: 3.992] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.694] [val/val_tokens_per_second: 548367.491] [val/loss_avg_len_2048: 3.992] [val/perplexity_len_2048: 54.190] [val/loss_avg_len_1024: 4.053] [val/perplexity_len_1024: 57.572] [val/loss_avg_len_512: 4.140] [val/perplexity_len_512: 62.823] +[2025-10-13 05:40:23][utils:57][INFO] [P: 67.00%] [S: 1405091840/2097152000] [T: 1:20:53] [ETA: 0:39:50] [loss: 3.965] [tokens/s: 286250.134] [batches/s: 0.136] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:40:59][utils:57][INFO] [P: 68.00%] [S: 1426063360/2097152000] [T: 1:21:28] [ETA: 0:38:20] [loss: 3.980] [tokens/s: 319120.496] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:40:59][train:194][INFO] Running validation... 
+[2025-10-13 05:42:13][logger:171][INFO] [step: 1426063360] [val/train_token_count: 1426063360] [val/train_batch_count: 680] [val/train_flop_count: 0] [val/train_total_time: 4888.829] [val/train_update_time: 2402.345] [val/loss: 3.982] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.634] [val/val_tokens_per_second: 548810.253] [val/loss_avg_len_2048: 3.982] [val/perplexity_len_2048: 53.631] [val/loss_avg_len_1024: 4.043] [val/perplexity_len_1024: 56.991] [val/loss_avg_len_512: 4.130] [val/perplexity_len_512: 62.207] +[2025-10-13 05:42:49][utils:57][INFO] [P: 69.00%] [S: 1447034880/2097152000] [T: 1:23:18] [ETA: 0:37:25] [loss: 3.989] [tokens/s: 286277.791] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:43:24][utils:57][INFO] [P: 70.00%] [S: 1468006400/2097152000] [T: 1:23:54] [ETA: 0:35:57] [loss: 3.984] [tokens/s: 319504.392] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:43:24][logger:171][INFO] [step: 1468006400] [train_eval/train_token_count: 1468006400] [train_eval/train_batch_count: 700] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5034.234] [train_eval/train_update_time: 2472.807] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.982] [train_eval/perplexity_len_2048: 53.604] [train_eval/loss_avg_len_1024: 4.040] [train_eval/perplexity_len_1024: 56.830] [train_eval/loss_avg_len_512: 4.128] [train_eval/perplexity_len_512: 62.023] +[2025-10-13 05:43:24][train:194][INFO] Running validation... 
+[2025-10-13 05:44:38][logger:171][INFO] [step: 1468006400] [val/train_token_count: 1468006400] [val/train_batch_count: 700] [val/train_flop_count: 0] [val/train_total_time: 5034.234] [val/train_update_time: 2472.807] [val/loss: 3.972] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.000] [val/val_tokens_per_second: 553510.662] [val/loss_avg_len_2048: 3.972] [val/perplexity_len_2048: 53.094] [val/loss_avg_len_1024: 4.034] [val/perplexity_len_1024: 56.502] [val/loss_avg_len_512: 4.123] [val/perplexity_len_512: 61.738] +[2025-10-13 05:44:38][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000001468006400.pt... +[2025-10-13 05:44:39][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000001468006400.pt. +[2025-10-13 05:44:39][logger:171][INFO] [step: 1468006400] [checkpoint/checkpoint_time: 0.740] +[2025-10-13 05:45:14][utils:57][INFO] [P: 71.00%] [S: 1488977920/2097152000] [T: 1:25:44] [ETA: 0:35:01] [loss: 3.991] [tokens/s: 286543.113] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:45:49][utils:57][INFO] [P: 72.00%] [S: 1509949440/2097152000] [T: 1:26:19] [ETA: 0:33:34] [loss: 3.960] [tokens/s: 319622.722] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:45:49][train:194][INFO] Running validation... 
+[2025-10-13 05:47:04][logger:171][INFO] [step: 1509949440] [val/train_token_count: 1509949440] [val/train_batch_count: 720] [val/train_flop_count: 0] [val/train_total_time: 5179.742] [val/train_update_time: 2543.273] [val/loss: 3.963] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.037] [val/val_tokens_per_second: 553237.635] [val/loss_avg_len_2048: 3.963] [val/perplexity_len_2048: 52.619] [val/loss_avg_len_1024: 4.025] [val/perplexity_len_1024: 55.989] [val/loss_avg_len_512: 4.114] [val/perplexity_len_512: 61.183] +[2025-10-13 05:47:39][utils:57][INFO] [P: 73.00%] [S: 1530920960/2097152000] [T: 1:28:09] [ETA: 0:32:36] [loss: 3.984] [tokens/s: 286928.205] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:48:14][utils:57][INFO] [P: 74.00%] [S: 1551892480/2097152000] [T: 1:28:44] [ETA: 0:31:10] [loss: 3.956] [tokens/s: 320199.551] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:48:14][train:194][INFO] Running validation... +[2025-10-13 05:49:29][logger:171][INFO] [step: 1551892480] [val/train_token_count: 1551892480] [val/train_batch_count: 740] [val/train_flop_count: 0] [val/train_total_time: 5324.525] [val/train_update_time: 2613.729] [val/loss: 3.956] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.740] [val/val_tokens_per_second: 548032.886] [val/loss_avg_len_2048: 3.956] [val/perplexity_len_2048: 52.244] [val/loss_avg_len_1024: 4.018] [val/perplexity_len_1024: 55.585] [val/loss_avg_len_512: 4.107] [val/perplexity_len_512: 60.747] +[2025-10-13 05:50:04][utils:57][INFO] [P: 75.00%] [S: 1572864000/2097152000] [T: 1:30:34] [ETA: 0:30:11] [loss: 3.952] [tokens/s: 287106.576] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:50:04][logger:171][INFO] [step: 1572864000] [train_eval/train_token_count: 1572864000] [train_eval/train_batch_count: 750] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5434.649] [train_eval/train_update_time: 2648.955] 
[train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.958] [train_eval/perplexity_len_2048: 52.350] [train_eval/loss_avg_len_1024: 4.017] [train_eval/perplexity_len_1024: 55.539] [train_eval/loss_avg_len_512: 4.106] [train_eval/perplexity_len_512: 60.687] +[2025-10-13 05:50:40][utils:57][INFO] [P: 76.00%] [S: 1593835520/2097152000] [T: 1:31:10] [ETA: 0:28:47] [loss: 3.908] [tokens/s: 320185.929] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:50:40][train:194][INFO] Running validation... +[2025-10-13 05:51:54][logger:171][INFO] [step: 1593835520] [val/train_token_count: 1593835520] [val/train_batch_count: 760] [val/train_flop_count: 0] [val/train_total_time: 5470.030] [val/train_update_time: 2684.182] [val/loss: 3.948] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.592] [val/val_tokens_per_second: 549120.775] [val/loss_avg_len_2048: 3.948] [val/perplexity_len_2048: 51.845] [val/loss_avg_len_1024: 4.011] [val/perplexity_len_1024: 55.196] [val/loss_avg_len_512: 4.100] [val/perplexity_len_512: 60.359] +[2025-10-13 05:52:30][utils:57][INFO] [P: 77.00%] [S: 1614807040/2097152000] [T: 1:32:59] [ETA: 0:27:46] [loss: 3.980] [tokens/s: 287156.125] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:53:05][utils:57][INFO] [P: 78.00%] [S: 1635778560/2097152000] [T: 1:33:35] [ETA: 0:26:23] [loss: 3.917] [tokens/s: 320216.572] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:53:05][train:194][INFO] Running validation... 
+[2025-10-13 05:54:19][logger:171][INFO] [step: 1635778560] [val/train_token_count: 1635778560] [val/train_batch_count: 780] [val/train_flop_count: 0] [val/train_total_time: 5615.375] [val/train_update_time: 2754.645] [val/loss: 3.942] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.136] [val/val_tokens_per_second: 552495.297] [val/loss_avg_len_2048: 3.942] [val/perplexity_len_2048: 51.530] [val/loss_avg_len_1024: 4.005] [val/perplexity_len_1024: 54.892] [val/loss_avg_len_512: 4.095] [val/perplexity_len_512: 60.056] +[2025-10-13 05:54:55][utils:57][INFO] [P: 79.00%] [S: 1656750080/2097152000] [T: 1:35:24] [ETA: 0:25:21] [loss: 3.934] [tokens/s: 287362.107] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:55:30][utils:57][INFO] [P: 80.00%] [S: 1677721600/2097152000] [T: 1:36:00] [ETA: 0:24:00] [loss: 3.919] [tokens/s: 320523.542] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:55:30][logger:171][INFO] [step: 1677721600] [train_eval/train_token_count: 1677721600] [train_eval/train_batch_count: 800] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5760.270] [train_eval/train_update_time: 2825.100] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.941] [train_eval/perplexity_len_2048: 51.489] [train_eval/loss_avg_len_1024: 4.003] [train_eval/perplexity_len_1024: 54.748] [train_eval/loss_avg_len_512: 4.092] [train_eval/perplexity_len_512: 59.830] +[2025-10-13 05:55:30][train:194][INFO] Running validation... 
+[2025-10-13 05:56:44][logger:171][INFO] [step: 1677721600] [val/train_token_count: 1677721600] [val/train_batch_count: 800] [val/train_flop_count: 0] [val/train_total_time: 5760.270] [val/train_update_time: 2825.100] [val/loss: 3.937] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.038] [val/val_tokens_per_second: 553225.990] [val/loss_avg_len_2048: 3.937] [val/perplexity_len_2048: 51.272] [val/loss_avg_len_1024: 4.001] [val/perplexity_len_1024: 54.634] [val/loss_avg_len_512: 4.091] [val/perplexity_len_512: 59.784] +[2025-10-13 05:56:44][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000001677721600.pt... +[2025-10-13 05:56:45][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000001677721600.pt. +[2025-10-13 05:56:45][logger:171][INFO] [step: 1677721600] [checkpoint/checkpoint_time: 0.735] +[2025-10-13 05:57:20][utils:57][INFO] [P: 81.00%] [S: 1698693120/2097152000] [T: 1:37:50] [ETA: 0:22:57] [loss: 3.897] [tokens/s: 287360.370] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:57:56][utils:57][INFO] [P: 82.00%] [S: 1719664640/2097152000] [T: 1:38:25] [ETA: 0:21:36] [loss: 3.908] [tokens/s: 320169.473] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 05:57:56][train:194][INFO] Running validation... 
+[2025-10-13 05:59:09][logger:171][INFO] [step: 1719664640] [val/train_token_count: 1719664640] [val/train_batch_count: 820] [val/train_flop_count: 0] [val/train_total_time: 5905.784] [val/train_update_time: 2895.551] [val/loss: 3.933] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 73.876] [val/val_tokens_per_second: 554444.243] [val/loss_avg_len_2048: 3.933] [val/perplexity_len_2048: 51.049] [val/loss_avg_len_1024: 3.996] [val/perplexity_len_1024: 54.401] [val/loss_avg_len_512: 4.087] [val/perplexity_len_512: 59.543] +[2025-10-13 05:59:45][utils:57][INFO] [P: 83.00%] [S: 1740636160/2097152000] [T: 1:40:15] [ETA: 0:20:31] [loss: 3.945] [tokens/s: 287418.718] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 06:00:20][utils:57][INFO] [P: 84.00%] [S: 1761607680/2097152000] [T: 1:40:50] [ETA: 0:19:12] [loss: 3.879] [tokens/s: 320586.351] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 06:00:20][train:194][INFO] Running validation... +[2025-10-13 06:01:34][logger:171][INFO] [step: 1761607680] [val/train_token_count: 1761607680] [val/train_batch_count: 840] [val/train_flop_count: 0] [val/train_total_time: 6050.427] [val/train_update_time: 2966.012] [val/loss: 3.929] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 73.964] [val/val_tokens_per_second: 553782.382] [val/loss_avg_len_2048: 3.929] [val/perplexity_len_2048: 50.862] [val/loss_avg_len_1024: 3.993] [val/perplexity_len_1024: 54.210] [val/loss_avg_len_512: 4.083] [val/perplexity_len_512: 59.342] +[2025-10-13 06:02:09][utils:57][INFO] [P: 85.00%] [S: 1782579200/2097152000] [T: 1:42:39] [ETA: 0:18:07] [loss: 3.958] [tokens/s: 287733.029] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 06:02:09][logger:171][INFO] [step: 1782579200] [train_eval/train_token_count: 1782579200] [train_eval/train_batch_count: 850] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6159.759] [train_eval/train_update_time: 3001.234] 
[train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.922] [train_eval/perplexity_len_2048: 50.512] [train_eval/loss_avg_len_1024: 3.977] [train_eval/perplexity_len_1024: 53.381] [train_eval/loss_avg_len_512: 4.067] [train_eval/perplexity_len_512: 58.378] +[2025-10-13 06:02:45][utils:57][INFO] [P: 86.00%] [S: 1803550720/2097152000] [T: 1:43:15] [ETA: 0:16:48] [loss: 3.943] [tokens/s: 320910.573] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 06:02:45][train:194][INFO] Running validation... +[2025-10-13 06:03:59][logger:171][INFO] [step: 1803550720] [val/train_token_count: 1803550720] [val/train_batch_count: 860] [val/train_flop_count: 0] [val/train_total_time: 6195.132] [val/train_update_time: 3036.460] [val/loss: 3.926] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.036] [val/val_tokens_per_second: 553245.430] [val/loss_avg_len_2048: 3.926] [val/perplexity_len_2048: 50.709] [val/loss_avg_len_1024: 3.990] [val/perplexity_len_1024: 54.070] [val/loss_avg_len_512: 4.081] [val/perplexity_len_512: 59.202] +[2025-10-13 06:04:34][utils:57][INFO] [P: 87.00%] [S: 1824522240/2097152000] [T: 1:45:04] [ETA: 0:15:42] [loss: 3.885] [tokens/s: 287953.382] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 06:05:10][utils:57][INFO] [P: 88.00%] [S: 1845493760/2097152000] [T: 1:45:39] [ETA: 0:14:24] [loss: 3.896] [tokens/s: 320956.418] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 06:05:10][train:194][INFO] Running validation... 
+[2025-10-13 06:06:24][logger:171][INFO] [step: 1845493760] [val/train_token_count: 1845493760] [val/train_batch_count: 880] [val/train_flop_count: 0] [val/train_total_time: 6339.925] [val/train_update_time: 3106.918] [val/loss: 3.924] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.078] [val/val_tokens_per_second: 552927.769] [val/loss_avg_len_2048: 3.924] [val/perplexity_len_2048: 50.581] [val/loss_avg_len_1024: 3.988] [val/perplexity_len_1024: 53.922] [val/loss_avg_len_512: 4.078] [val/perplexity_len_512: 59.038] +[2025-10-13 06:06:59][utils:57][INFO] [P: 89.00%] [S: 1866465280/2097152000] [T: 1:47:29] [ETA: 0:13:17] [loss: 3.967] [tokens/s: 287977.586] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 06:07:34][utils:57][INFO] [P: 90.00%] [S: 1887436800/2097152000] [T: 1:48:04] [ETA: 0:12:00] [loss: 3.874] [tokens/s: 321311.382] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 06:07:34][logger:171][INFO] [step: 1887436800] [train_eval/train_token_count: 1887436800] [train_eval/train_batch_count: 900] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6484.748] [train_eval/train_update_time: 3177.374] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.920] [train_eval/perplexity_len_2048: 50.392] [train_eval/loss_avg_len_1024: 3.978] [train_eval/perplexity_len_1024: 53.434] [train_eval/loss_avg_len_512: 4.068] [train_eval/perplexity_len_512: 58.462] +[2025-10-13 06:07:34][train:194][INFO] Running validation... 
+[2025-10-13 06:08:49][logger:171][INFO] [step: 1887436800] [val/train_token_count: 1887436800] [val/train_batch_count: 900] [val/train_flop_count: 0] [val/train_total_time: 6484.748] [val/train_update_time: 3177.374] [val/loss: 3.922] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.161] [val/val_tokens_per_second: 552311.224] [val/loss_avg_len_2048: 3.922] [val/perplexity_len_2048: 50.491] [val/loss_avg_len_1024: 3.986] [val/perplexity_len_1024: 53.833] [val/loss_avg_len_512: 4.077] [val/perplexity_len_512: 58.949] +[2025-10-13 06:08:49][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000001887436800.pt... +[2025-10-13 06:08:49][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_2_4_256/checkpoints/step-000001887436800.pt. +[2025-10-13 06:08:49][logger:171][INFO] [step: 1887436800] [checkpoint/checkpoint_time: 0.732] +[2025-10-13 06:09:25][utils:57][INFO] [P: 91.00%] [S: 1908408320/2097152000] [T: 1:49:55] [ETA: 0:10:52] [loss: 3.919] [tokens/s: 287928.669] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 06:10:00][utils:57][INFO] [P: 92.00%] [S: 1929379840/2097152000] [T: 1:50:30] [ETA: 0:09:36] [loss: 3.935] [tokens/s: 320793.117] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 06:10:00][train:194][INFO] Running validation... 
+[2025-10-13 06:11:14][logger:171][INFO] [step: 1929379840] [val/train_token_count: 1929379840] [val/train_batch_count: 920] [val/train_flop_count: 0] [val/train_total_time: 6630.406] [val/train_update_time: 3247.843] [val/loss: 3.920] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.370] [val/val_tokens_per_second: 550762.741] [val/loss_avg_len_2048: 3.920] [val/perplexity_len_2048: 50.421] [val/loss_avg_len_1024: 3.985] [val/perplexity_len_1024: 53.758] [val/loss_avg_len_512: 4.075] [val/perplexity_len_512: 58.867] +[2025-10-13 06:11:50][utils:57][INFO] [P: 93.00%] [S: 1950351360/2097152000] [T: 1:52:20] [ETA: 0:08:27] [loss: 3.925] [tokens/s: 287726.244] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 06:12:25][utils:57][INFO] [P: 94.00%] [S: 1971322880/2097152000] [T: 1:52:55] [ETA: 0:07:12] [loss: 3.888] [tokens/s: 320591.534] [batches/s: 0.153] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 06:12:25][train:194][INFO] Running validation... +[2025-10-13 06:13:41][logger:171][INFO] [step: 1971322880] [val/train_token_count: 1971322880] [val/train_batch_count: 940] [val/train_flop_count: 0] [val/train_total_time: 6775.544] [val/train_update_time: 3318.317] [val/loss: 3.920] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 75.727] [val/val_tokens_per_second: 540893.042] [val/loss_avg_len_2048: 3.920] [val/perplexity_len_2048: 50.377] [val/loss_avg_len_1024: 3.984] [val/perplexity_len_1024: 53.718] [val/loss_avg_len_512: 4.075] [val/perplexity_len_512: 58.828] +[2025-10-13 06:14:16][utils:57][INFO] [P: 95.00%] [S: 1992294400/2097152000] [T: 1:54:46] [ETA: 0:06:02] [loss: 3.896] [tokens/s: 287022.299] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 06:14:16][logger:171][INFO] [step: 1992294400] [train_eval/train_token_count: 1992294400] [train_eval/train_batch_count: 950] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6886.654] [train_eval/train_update_time: 3353.546] 
[train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.909] [train_eval/perplexity_len_2048: 49.853] [train_eval/loss_avg_len_1024: 3.971] [train_eval/perplexity_len_1024: 53.030] [train_eval/loss_avg_len_512: 4.060] [train_eval/perplexity_len_512: 57.986] +[2025-10-13 06:14:52][utils:57][INFO] [P: 96.00%] [S: 2013265920/2097152000] [T: 1:55:22] [ETA: 0:04:48] [loss: 3.897] [tokens/s: 319745.942] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 06:14:52][train:194][INFO] Running validation... +[2025-10-13 06:16:07][logger:171][INFO] [step: 2013265920] [val/train_token_count: 2013265920] [val/train_batch_count: 960] [val/train_flop_count: 0] [val/train_total_time: 6922.034] [val/train_update_time: 3388.776] [val/loss: 3.919] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 74.748] [val/val_tokens_per_second: 547974.082] [val/loss_avg_len_2048: 3.919] [val/perplexity_len_2048: 50.352] [val/loss_avg_len_1024: 3.983] [val/perplexity_len_1024: 53.689] [val/loss_avg_len_512: 4.074] [val/perplexity_len_512: 58.794] +[2025-10-13 06:16:42][utils:57][INFO] [P: 97.00%] [S: 2034237440/2097152000] [T: 1:57:12] [ETA: 0:03:37] [loss: 3.929] [tokens/s: 286734.887] [batches/s: 0.137] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 06:17:17][utils:57][INFO] [P: 98.00%] [S: 2055208960/2097152000] [T: 1:57:47] [ETA: 0:02:24] [loss: 3.899] [tokens/s: 319413.416] [batches/s: 0.152] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-13 06:17:17][train:194][INFO] Running validation... 
+[2025-10-13 06:18:31][logger:171][INFO] [step: 2055208960] [val/train_token_count: 2055208960] [val/train_batch_count: 980] [val/train_flop_count: 0] [val/train_total_time: 7067.550] [val/train_update_time: 3459.240] [val/loss: 3.919] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 73.896] [val/val_tokens_per_second: 554295.052] [val/loss_avg_len_2048: 3.919] [val/perplexity_len_2048: 50.339] [val/loss_avg_len_1024: 3.983] [val/perplexity_len_1024: 53.676] [val/loss_avg_len_512: 4.074] [val/perplexity_len_512: 58.781] +[2025-10-13 06:18:31][train:854][INFO] Training finished with 2055208960 tokens! diff --git a/metrics/jsonlines/checkpoint.jsonl b/metrics/jsonlines/checkpoint.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3779ac893a6028bd023be394a93182ee0b458cbc --- /dev/null +++ b/metrics/jsonlines/checkpoint.jsonl @@ -0,0 +1,9 @@ +{"step": 209715200, "checkpoint/checkpoint_time": 0.6384434180217795} +{"step": 419430400, "checkpoint/checkpoint_time": 0.626950032019522} +{"step": 629145600, "checkpoint/checkpoint_time": 0.6112130110268481} +{"step": 838860800, "checkpoint/checkpoint_time": 0.6657388239982538} +{"step": 1048576000, "checkpoint/checkpoint_time": 0.7660488990368322} +{"step": 1258291200, "checkpoint/checkpoint_time": 0.753050519968383} +{"step": 1468006400, "checkpoint/checkpoint_time": 0.7402905119815841} +{"step": 1677721600, "checkpoint/checkpoint_time": 0.7347163640079089} +{"step": 1887436800, "checkpoint/checkpoint_time": 0.7318841179949231} diff --git a/metrics/jsonlines/model_info.jsonl b/metrics/jsonlines/model_info.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..58a284393c3634b0149e1889d2b22f9901e599e1 --- /dev/null +++ b/metrics/jsonlines/model_info.jsonl @@ -0,0 +1 @@ +{"step": 0, "model_info/total_params": 27449096, "model_info/trainable_params": 27449096, "model_info/embedding_params": 12870912, "model_info/flops_per_token": 0, 
"model_info/non_embedding_params": 14578184} diff --git a/metrics/jsonlines/norm.jsonl b/metrics/jsonlines/norm.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8a27c2fde932ffb2d60f09c2f29ddcede87b0f43 --- /dev/null +++ b/metrics/jsonlines/norm.jsonl @@ -0,0 +1,98 @@ +{"step": 20971520, "pnorm/_forward_module.model.embeddings.weight": 72.05237579345703, "gnorm/_forward_module.model.embeddings.weight": 0.06038488447666168, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.969755172729492, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.001855711336247623, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 5.1563520431518555, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.003524532774463296, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 5.182712554931641, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.0035395559389144182, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.136830806732178, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.04442397132515907, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.12950325012207, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.04780397564172745, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.6290377974510193, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.007245807442814112, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.006396627519279718, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.003113751532509923, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.037145614624023, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0009169657132588327, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 12.70312213897705, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.0305657796561718, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 8.957891464233398, 
"gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.029606034979224205, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 15.993705749511719, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.001568031613714993, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 5.175420761108398, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.003415534505620599, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 5.1748738288879395, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.003269094740971923, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.174013614654541, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.05101334676146507, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 5.178831577301025, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.10206922888755798, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.6649390459060669, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.021807961165905, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.006190520711243153, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0018073201645165682, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.034954071044922, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0014780916972085834, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 12.680872917175293, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.0415254607796669, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 8.979910850524902, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.04571067541837692, "pnorm/_forward_module.model.norm.weight": 16.01601791381836, "gnorm/_forward_module.model.norm.weight": 0.0625680461525917, "pnorm/_forward_module.lm_head.weight": 71.93384552001953, "gnorm/_forward_module.lm_head.weight": 0.9843456149101257} +{"step": 41943040, "pnorm/_forward_module.model.embeddings.weight": 
73.49058532714844, "gnorm/_forward_module.model.embeddings.weight": 0.061607781797647476, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.944855690002441, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0018161119660362601, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 5.433432102203369, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.003969126846641302, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 5.461900234222412, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.003255331888794899, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.286075115203857, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.03514140844345093, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.279037952423096, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.036937784403562546, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.6584036946296692, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.007936905138194561, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.00843839906156063, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0024757529608905315, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.098344802856445, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0011970505584031343, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 13.293915748596191, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.03391759470105171, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 9.34956169128418, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.02449009194970131, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 15.989734649658203, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.001276906463317573, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 5.448599338531494, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 
0.0015077836578711867, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 5.4395928382873535, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.0016058466862887144, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.3501667976379395, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.021339863538742065, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 5.355653285980225, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.036692313849925995, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.6772328019142151, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.0033541680313646793, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.009584035724401474, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00019561999943107367, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.04805564880371, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.001032502157613635, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 13.12743854522705, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.01960146054625511, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 9.297164916992188, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.023524856194853783, "pnorm/_forward_module.model.norm.weight": 16.15458869934082, "gnorm/_forward_module.model.norm.weight": 0.07690682262182236, "pnorm/_forward_module.lm_head.weight": 80.11444091796875, "gnorm/_forward_module.lm_head.weight": 0.9487274289131165} +{"step": 62914560, "pnorm/_forward_module.model.embeddings.weight": 74.88994598388672, "gnorm/_forward_module.model.embeddings.weight": 0.07597663253545761, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.972419738769531, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.004672551527619362, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 5.755270957946777, 
"gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.0049467491917312145, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 5.780837535858154, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.004270249977707863, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.457310676574707, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.10007039457559586, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.443675994873047, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.14211130142211914, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.7366576790809631, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.009451810270547867, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.006736272014677525, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.001875121844932437, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.118322372436523, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.008479232899844646, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 13.626164436340332, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.2150479406118393, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 9.601815223693848, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.14681793749332428, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 15.972100257873535, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.008661371655762196, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 5.617506980895996, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.005076989531517029, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 5.60673189163208, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.0074254898354411125, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.43977165222168, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.1138111799955368, 
"pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 5.455945014953613, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.09058065712451935, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.7070551514625549, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.008187072351574898, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.009798632003366947, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00047067314153537154, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.024097442626953, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.004171210806816816, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 13.378179550170898, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.1020917072892189, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 9.497102737426758, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.061179760843515396, "pnorm/_forward_module.model.norm.weight": 16.303817749023438, "gnorm/_forward_module.model.norm.weight": 0.018896114081144333, "pnorm/_forward_module.lm_head.weight": 90.32892608642578, "gnorm/_forward_module.lm_head.weight": 0.19551944732666016} +{"step": 83886080, "pnorm/_forward_module.model.embeddings.weight": 76.44368743896484, "gnorm/_forward_module.model.embeddings.weight": 0.07930849492549896, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.01687240600586, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.005513214971870184, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 6.049816131591797, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.006496718619018793, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 6.073756217956543, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.006670854985713959, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.6009626388549805, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 
0.11010660231113434, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.573078155517578, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.13728652894496918, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.8040162324905396, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.015027034096419811, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.012285887263715267, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0029063045512884855, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.12519073486328, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.006780239287763834, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 13.761842727661133, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.16053345799446106, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 9.72683048248291, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.11886771023273468, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 15.95832633972168, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.006680781487375498, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 5.733181953430176, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.010307379066944122, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 5.701542377471924, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.011362992227077484, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.457224369049072, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.1043073832988739, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 5.485989570617676, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.09152856469154358, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.7329626679420471, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.008318386971950531, 
"pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.012177489697933197, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0003090408572461456, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 15.997032165527344, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.007062331773340702, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 13.46699047088623, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.15480788052082062, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 9.57370376586914, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.07746879011392593, "pnorm/_forward_module.model.norm.weight": 16.426965713500977, "gnorm/_forward_module.model.norm.weight": 0.019787102937698364, "pnorm/_forward_module.lm_head.weight": 98.42568969726562, "gnorm/_forward_module.lm_head.weight": 0.17325283586978912} +{"step": 104857600, "pnorm/_forward_module.model.embeddings.weight": 78.11286926269531, "gnorm/_forward_module.model.embeddings.weight": 0.16937188804149628, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.053125381469727, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.009213138371706009, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 6.295784950256348, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.009306230582296848, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 6.325882911682129, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.008344834670424461, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.716864109039307, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.1718684285879135, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.674732685089111, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.21003493666648865, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.829669713973999, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 
0.010988417081534863, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.017017638310790062, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0005561288562603295, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.12384605407715, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.008976465091109276, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 13.848369598388672, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.22322280704975128, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 9.823541641235352, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.15843050181865692, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 15.9498291015625, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.010773612186312675, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 5.81465482711792, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.01143778208643198, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 5.769066333770752, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.019891217350959778, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.470144748687744, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.17998896539211273, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 5.506825923919678, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.1316508799791336, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.7614491581916809, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.019970055669546127, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.016588453203439713, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0004582771216519177, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 15.980132102966309, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.00909390114247799, 
"pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 13.523462295532227, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.2083943635225296, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 9.6212739944458, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.09244253486394882, "pnorm/_forward_module.model.norm.weight": 16.54207992553711, "gnorm/_forward_module.model.norm.weight": 0.016870953142642975, "pnorm/_forward_module.lm_head.weight": 104.65542602539062, "gnorm/_forward_module.lm_head.weight": 0.12114791572093964} +{"step": 125829120, "pnorm/_forward_module.model.embeddings.weight": 79.67562866210938, "gnorm/_forward_module.model.embeddings.weight": 0.27076810598373413, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.08155059814453, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.007802056148648262, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 6.5190958976745605, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.02886628918349743, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 6.555778980255127, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.02700858935713768, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.798874378204346, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.16170728206634521, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.743980407714844, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.1717504858970642, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.8584901094436646, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.02907814458012581, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.021339694038033485, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.002385492902249098, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.12078094482422, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 
0.004597479477524757, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 13.912531852722168, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.11478620022535324, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 9.907936096191406, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.07405915856361389, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 15.948895454406738, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.004754727240651846, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 5.897298812866211, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.02250438928604126, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 5.840423583984375, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.030659666284918785, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.489763259887695, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.08445242047309875, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 5.535558223724365, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.06140395253896713, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.7939058542251587, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.04045616090297699, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.023980583995580673, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.004436281975358725, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 15.969919204711914, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0035182619467377663, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 13.57705307006836, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.07175486534833908, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 9.666468620300293, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03958219662308693, "pnorm/_forward_module.model.norm.weight": 
16.661285400390625, "gnorm/_forward_module.model.norm.weight": 0.032290779054164886, "pnorm/_forward_module.lm_head.weight": 109.74832153320312, "gnorm/_forward_module.lm_head.weight": 0.14223547279834747} +{"step": 146800640, "pnorm/_forward_module.model.embeddings.weight": 81.10012817382812, "gnorm/_forward_module.model.embeddings.weight": 0.3972567021846771, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.10307502746582, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.011317462660372257, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 6.701213359832764, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.06323665380477905, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 6.745156764984131, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.06785913556814194, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.8558349609375, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.25484606623649597, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.79291296005249, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.28168877959251404, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.894920289516449, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.10865162312984467, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.025242270901799202, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.007006607949733734, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.115859985351562, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.007040046155452728, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 13.964837074279785, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.16630573570728302, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 9.97901439666748, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.10329962521791458, 
"pnorm/_forward_module.model.layers.1.attn_norm.weight": 15.951245307922363, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.007102874107658863, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 5.955022811889648, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.029228556901216507, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 5.8880157470703125, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.041769176721572876, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.515825271606445, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.1492537409067154, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 5.5728678703308105, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.11556188762187958, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.8285028338432312, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.058868274092674255, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.03320124000310898, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00421054195612669, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 15.9663667678833, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.007122169714421034, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 13.64005184173584, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.14873653650283813, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 9.71642780303955, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.06751801818609238, "pnorm/_forward_module.model.norm.weight": 16.787813186645508, "gnorm/_forward_module.model.norm.weight": 0.02028297260403633, "pnorm/_forward_module.lm_head.weight": 114.40328216552734, "gnorm/_forward_module.lm_head.weight": 0.16569149494171143} +{"step": 167772160, "pnorm/_forward_module.model.embeddings.weight": 82.40795135498047, "gnorm/_forward_module.model.embeddings.weight": 
0.2025238573551178, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.11884880065918, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.006921234540641308, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 6.843315124511719, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.03999471291899681, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 6.889676570892334, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.04167145863175392, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.891746997833252, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.157960906624794, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.824216365814209, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.18729394674301147, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.9387774467468262, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.06175748631358147, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0299394391477108, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.004272074904292822, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.110490798950195, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.005577956326305866, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.008129119873047, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.12842847406864166, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.033875465393066, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.09589163213968277, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 15.95638656616211, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.006718477699905634, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 5.991936206817627, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.02173023670911789, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 
5.924596786499023, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.030541986227035522, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.545361042022705, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.14545053243637085, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 5.615042686462402, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.10669176280498505, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.8737874627113342, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.042894329875707626, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.04450457915663719, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.005045986268669367, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 15.966714859008789, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.006270585581660271, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 13.714472770690918, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.1320708692073822, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 9.774035453796387, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.06698741763830185, "pnorm/_forward_module.model.norm.weight": 16.91901969909668, "gnorm/_forward_module.model.norm.weight": 0.03498150035738945, "pnorm/_forward_module.lm_head.weight": 118.95142364501953, "gnorm/_forward_module.lm_head.weight": 0.13400687277317047} +{"step": 188743680, "pnorm/_forward_module.model.embeddings.weight": 83.62446594238281, "gnorm/_forward_module.model.embeddings.weight": 0.16319693624973297, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.129310607910156, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.005269177723675966, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 6.9655537605285645, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.013441052287817001, 
"pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.010619640350342, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.013712843880057335, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.911289691925049, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.1497485637664795, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.842068195343018, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.1703694462776184, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.9875470399856567, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.010587908327579498, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.03633253648877144, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0012599150650203228, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.10809326171875, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.003632865846157074, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.050271034240723, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.09012207388877869, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.078251838684082, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.06709024310112, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 15.966767311096191, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.004358288366347551, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 6.024993896484375, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.013733034953474998, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 5.951837539672852, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.026359710842370987, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.578583717346191, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.1106235533952713, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 5.663088798522949, 
"gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.09147733449935913, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.9445444345474243, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.019813520833849907, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.05606655776500702, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.002734360285103321, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 15.967580795288086, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.003997805994004011, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 13.793850898742676, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.07483652234077454, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 9.824563026428223, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.04269333928823471, "pnorm/_forward_module.model.norm.weight": 17.050506591796875, "gnorm/_forward_module.model.norm.weight": 0.024963296949863434, "pnorm/_forward_module.lm_head.weight": 123.4576187133789, "gnorm/_forward_module.lm_head.weight": 0.0917944610118866} +{"step": 209715200, "pnorm/_forward_module.model.embeddings.weight": 84.76009368896484, "gnorm/_forward_module.model.embeddings.weight": 0.21199177205562592, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.13445472717285, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.008684784173965454, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.056536674499512, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.033695612102746964, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.099299430847168, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.03227211534976959, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.919482231140137, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.24651245772838593, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 
5.849952220916748, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.29593709111213684, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.025898814201355, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.051366325467824936, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.04175814986228943, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.003684332827106118, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.103702545166016, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.005320836789906025, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.08340835571289, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.15071050822734833, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.110928535461426, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.1516065150499344, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 15.980895042419434, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.011889134533703327, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 6.048890113830566, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.03904469311237335, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 5.971392631530762, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.05655384808778763, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.6173810958862305, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.2533377707004547, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 5.716982841491699, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.17985288798809052, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.0318337678909302, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.05212240293622017, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.06668494641780853, 
"gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.005436921026557684, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 15.971988677978516, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.007158719468861818, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 13.87979507446289, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.15860435366630554, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 9.882991790771484, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.08661124855279922, "pnorm/_forward_module.model.norm.weight": 17.184024810791016, "gnorm/_forward_module.model.norm.weight": 0.015842411667108536, "pnorm/_forward_module.lm_head.weight": 128.03253173828125, "gnorm/_forward_module.lm_head.weight": 0.10422592610120773} +{"step": 230686720, "pnorm/_forward_module.model.embeddings.weight": 85.82051849365234, "gnorm/_forward_module.model.embeddings.weight": 0.1840544193983078, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.138134002685547, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.006066782400012016, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.134839057922363, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.023177703842520714, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.173806667327881, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.02293059229850769, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.921664237976074, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.18565498292446136, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.853517055511475, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.21482443809509277, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.0567618608474731, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.024464592337608337, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 
0.047183871269226074, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0026294367853552103, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.098127365112305, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.004133800510317087, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.110907554626465, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.10518231242895126, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.134795188903809, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.09959756582975388, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 15.997269630432129, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.006588327698409557, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 6.071419715881348, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.02714446745812893, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 5.989881992340088, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.04000772908329964, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.657135963439941, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.16074854135513306, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 5.769173622131348, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.12271121144294739, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.1232587099075317, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.014214995317161083, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.07616313546895981, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0014605352189391851, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 15.97897720336914, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0038810824044048786, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 13.971722602844238, 
"gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.08428943902254105, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 9.939470291137695, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.05467931926250458, "pnorm/_forward_module.model.norm.weight": 17.3179988861084, "gnorm/_forward_module.model.norm.weight": 0.028649963438510895, "pnorm/_forward_module.lm_head.weight": 132.6510009765625, "gnorm/_forward_module.lm_head.weight": 0.08684412389993668} +{"step": 251658240, "pnorm/_forward_module.model.embeddings.weight": 86.80840301513672, "gnorm/_forward_module.model.embeddings.weight": 0.16536845266819, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.142473220825195, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.007211247459053993, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.210788726806641, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.01950487121939659, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.244894981384277, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.024354835972189903, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.92273473739624, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.1915964037179947, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.855989456176758, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.2279568612575531, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.0919400453567505, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.01625225879251957, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.052082791924476624, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0016527462285012007, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.092893600463867, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0054925880394876, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.13753604888916, 
"gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.12882590293884277, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.154690742492676, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.11033093184232712, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.01178741455078, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.008818302303552628, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 6.10203218460083, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.04394018277525902, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 6.013166904449463, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.05901181325316429, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.690598011016846, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.1892859786748886, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 5.813355922698975, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.1347924768924713, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.2036337852478027, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.008893336169421673, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.08392015844583511, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0007922466029413044, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 15.98725414276123, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.004315048456192017, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 14.065461158752441, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.11211855709552765, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 9.995488166809082, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.06543305516242981, "pnorm/_forward_module.model.norm.weight": 17.45160484313965, "gnorm/_forward_module.model.norm.weight": 0.021481908857822418, 
"pnorm/_forward_module.lm_head.weight": 137.2794189453125, "gnorm/_forward_module.lm_head.weight": 0.08165688812732697} +{"step": 272629760, "pnorm/_forward_module.model.embeddings.weight": 87.73100280761719, "gnorm/_forward_module.model.embeddings.weight": 0.24232202768325806, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.145559310913086, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.010309348814189434, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.280605792999268, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.027465231716632843, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.310159206390381, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.027734674513339996, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.920651912689209, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.2892310917377472, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.855778217315674, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.3249877095222473, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.1241724491119385, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.021279318258166313, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.05778597667813301, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0024189064279198647, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.088748931884766, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.008235842920839787, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.165690422058105, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.2035948932170868, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.17336654663086, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.1687595397233963, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.025407791137695, 
"gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.010686378926038742, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 6.143698215484619, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.044652096927165985, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 6.044376850128174, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.07976394146680832, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.718111038208008, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.27623194456100464, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 5.851855278015137, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.16540488600730896, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.272892951965332, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.020671509206295013, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.09083624929189682, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00290342653170228, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 15.999034881591797, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0050935824401676655, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 14.164361000061035, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.12197272479534149, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.057548522949219, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.08286716789007187, "pnorm/_forward_module.model.norm.weight": 17.586463928222656, "gnorm/_forward_module.model.norm.weight": 0.027356814593076706, "pnorm/_forward_module.lm_head.weight": 141.8426513671875, "gnorm/_forward_module.lm_head.weight": 0.08379202336072922} +{"step": 293601280, "pnorm/_forward_module.model.embeddings.weight": 88.5937271118164, "gnorm/_forward_module.model.embeddings.weight": 0.23741649091243744, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 
16.145029067993164, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.008575675077736378, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.3357462882995605, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.03034127689898014, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.363217830657959, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.026033489033579826, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.914319038391113, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.23114928603172302, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.851653575897217, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.26766741275787354, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.1541799306869507, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.03323756903409958, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.06321863830089569, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.002141643315553665, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.087194442749023, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0049826595932245255, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.195695877075195, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.1433328092098236, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.191606521606445, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.12941396236419678, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.041637420654297, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.010541539639234543, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 6.202450275421143, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.04479234665632248, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 6.087597846984863, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 
0.07001674920320511, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.74199104309082, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.25496232509613037, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 5.888543128967285, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.15379978716373444, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.3386846780776978, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.026703402400016785, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.09742366522550583, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.002520937006920576, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.015594482421875, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.00707329111173749, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 14.271105766296387, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.1697213500738144, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.123900413513184, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.1117330938577652, "pnorm/_forward_module.model.norm.weight": 17.723388671875, "gnorm/_forward_module.model.norm.weight": 0.01274515874683857, "pnorm/_forward_module.lm_head.weight": 146.3300323486328, "gnorm/_forward_module.lm_head.weight": 0.0890781506896019} +{"step": 314572800, "pnorm/_forward_module.model.embeddings.weight": 89.4027328491211, "gnorm/_forward_module.model.embeddings.weight": 0.25835156440734863, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.147722244262695, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.01049479003995657, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.391076564788818, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.024725673720240593, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.416975498199463, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 
0.027991456910967827, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.912836074829102, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.33850523829460144, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.851629734039307, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.3602639138698578, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.1793168783187866, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.028827449306845665, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.06770110875368118, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.002850713673979044, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.083271026611328, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.00572655163705349, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.21980094909668, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.17735016345977783, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.204890251159668, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.1820315718650818, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.056869506835938, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.013876463286578655, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 6.265024662017822, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.03723054379224777, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 6.132700443267822, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.06430535763502121, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.761787414550781, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.3246208727359772, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 5.918417930603027, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.17749464511871338, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 
1.392572045326233, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.015143428929150105, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.10286345332860947, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0017571203643456101, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.03017807006836, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.005847745109349489, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 14.36701488494873, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.13090787827968597, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.178596496582031, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.09486467391252518, "pnorm/_forward_module.model.norm.weight": 17.855567932128906, "gnorm/_forward_module.model.norm.weight": 0.023925915360450745, "pnorm/_forward_module.lm_head.weight": 150.6578369140625, "gnorm/_forward_module.lm_head.weight": 0.10077065974473953} +{"step": 335544320, "pnorm/_forward_module.model.embeddings.weight": 90.16848754882812, "gnorm/_forward_module.model.embeddings.weight": 0.22858496010303497, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.14771842956543, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.007369040045887232, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.438493728637695, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.019021548330783844, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.462460041046143, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.02120693400502205, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.908205509185791, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.22637203335762024, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.848961353302002, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.2266196757555008, 
"pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.20077383518219, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.025637401267886162, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.07301612198352814, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.004814534913748503, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.08165168762207, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0036457194946706295, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.245586395263672, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.10910625010728836, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.218076705932617, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.0956534594297409, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.07406997680664, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.006100333295762539, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 6.333832740783691, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.023793961852788925, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 6.182105541229248, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.040050458163022995, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.78040885925293, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.15824654698371887, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 5.947714328765869, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.10512601584196091, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.448504090309143, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.01833057776093483, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.10900429636240005, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.002136590890586376, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 
16.04541015625, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.003999635577201843, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 14.459443092346191, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.09439331293106079, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.229787826538086, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.05881432443857193, "pnorm/_forward_module.model.norm.weight": 17.988439559936523, "gnorm/_forward_module.model.norm.weight": 0.017126301303505898, "pnorm/_forward_module.lm_head.weight": 154.8427276611328, "gnorm/_forward_module.lm_head.weight": 0.06683126837015152} +{"step": 356515840, "pnorm/_forward_module.model.embeddings.weight": 90.89396667480469, "gnorm/_forward_module.model.embeddings.weight": 0.14121636748313904, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.14615249633789, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.006318354979157448, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.479001045227051, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.018388787284493446, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.5011372566223145, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.019070573151111603, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.901999473571777, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.1914856731891632, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.84480094909668, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.2117525041103363, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2185012102127075, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.016329944133758545, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.07806332409381866, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.001482911640778184, 
"pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.079240798950195, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0033970747608691454, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.268656730651855, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.10296657681465149, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.228809356689453, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.09591896831989288, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.09156036376953, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.006628477014601231, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 6.41030216217041, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.02173665538430214, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 6.23518180847168, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.03760357201099396, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.798464775085449, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.15050970017910004, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 5.976438045501709, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.09879400581121445, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.4820362329483032, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.007722839713096619, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.11268781125545502, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0005766506073996425, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.062759399414062, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0028284413274377584, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 14.551603317260742, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.07456057518720627, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.282119750976562, 
"gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.04918181523680687, "pnorm/_forward_module.model.norm.weight": 18.122987747192383, "gnorm/_forward_module.model.norm.weight": 0.01923101767897606, "pnorm/_forward_module.lm_head.weight": 158.89418029785156, "gnorm/_forward_module.lm_head.weight": 0.0542321652173996} +{"step": 377487360, "pnorm/_forward_module.model.embeddings.weight": 91.5799789428711, "gnorm/_forward_module.model.embeddings.weight": 0.23439696431159973, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.14375877380371, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.009188073687255383, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.512201309204102, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.025147074833512306, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.532954216003418, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.028735561296343803, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.89562463760376, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.28092071413993835, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.840109348297119, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.31297871470451355, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.234147310256958, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.03071645088493824, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.08346227556467056, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0014316203305497766, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.078081130981445, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.005763879511505365, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.292433738708496, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.15703894197940826, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 
10.239374160766602, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.15120942890644073, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.108749389648438, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.011317712254822254, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 6.4871721267700195, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.03530982881784439, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 6.287702560424805, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.0693540871143341, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.814743995666504, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.2719857394695282, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.002852916717529, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.17057110369205475, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.5213823318481445, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.018462006002664566, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.11707223951816559, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0016707928152754903, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.08072853088379, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.006646385416388512, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 14.639954566955566, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.17213216423988342, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.332590103149414, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.10172130167484283, "pnorm/_forward_module.model.norm.weight": 18.256534576416016, "gnorm/_forward_module.model.norm.weight": 0.024371275678277016, "pnorm/_forward_module.lm_head.weight": 162.77151489257812, "gnorm/_forward_module.lm_head.weight": 0.09735913574695587} +{"step": 398458880, 
"pnorm/_forward_module.model.embeddings.weight": 92.2322769165039, "gnorm/_forward_module.model.embeddings.weight": 0.24870066344738007, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.142486572265625, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.009367205202579498, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.544608116149902, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.0271880142390728, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.565149307250977, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.025796176865696907, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.890977382659912, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.303481787443161, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.837011814117432, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.3376403748989105, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2450917959213257, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.03664330393075943, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.08760926127433777, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.001996302045881748, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.075708389282227, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.005454336758702993, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.311745643615723, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.16796565055847168, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.246405601501465, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.15342512726783752, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.12604522705078, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.011552946642041206, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 6.563150405883789, 
"gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.033611755818128586, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 6.337704658508301, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.06691266596317291, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.8299784660339355, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.25780341029167175, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.02716588973999, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.14455090463161469, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.5584321022033691, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.00997294019907713, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.12108348309993744, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0010004936484619975, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.094831466674805, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.004815593361854553, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 14.715835571289062, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.12684142589569092, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.372509956359863, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.08536971360445023, "pnorm/_forward_module.model.norm.weight": 18.387067794799805, "gnorm/_forward_module.model.norm.weight": 0.014719798229634762, "pnorm/_forward_module.lm_head.weight": 166.4724578857422, "gnorm/_forward_module.lm_head.weight": 0.060370054095983505} +{"step": 419430400, "pnorm/_forward_module.model.embeddings.weight": 92.85285186767578, "gnorm/_forward_module.model.embeddings.weight": 0.19071532785892487, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.13833236694336, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.006551853846758604, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 
7.574807167053223, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.03646489977836609, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.594360828399658, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.03265569731593132, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.882615566253662, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.20909512042999268, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.830440044403076, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.22340095043182373, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2534639835357666, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.04857994243502617, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.09184589236974716, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0036323009990155697, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.0740909576416, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.003265694947913289, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.330657958984375, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.1142619177699089, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.253521919250488, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.10754545778036118, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.145389556884766, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.006775465793907642, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 6.639438629150391, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.023652901872992516, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 6.387392997741699, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.040627703070640564, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.846147060394287, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 
0.17154112458229065, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.052713871002197, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.10318370163440704, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.5926802158355713, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.008532107807695866, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.12432464957237244, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0007145332056097686, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.112295150756836, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0034234696067869663, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 14.793949127197266, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.09382154792547226, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.415181159973145, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.06875316798686981, "pnorm/_forward_module.model.norm.weight": 18.51799201965332, "gnorm/_forward_module.model.norm.weight": 0.017061227932572365, "pnorm/_forward_module.lm_head.weight": 169.9793243408203, "gnorm/_forward_module.lm_head.weight": 0.051629096269607544} +{"step": 440401920, "pnorm/_forward_module.model.embeddings.weight": 93.44153594970703, "gnorm/_forward_module.model.embeddings.weight": 0.20235177874565125, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.13605308532715, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.008770341984927654, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.6050615310668945, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.024122925475239754, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.622251987457275, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.026749055832624435, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.876957893371582, 
"gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.29352521896362305, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.825939655303955, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.31368008255958557, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2623562812805176, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.027939561754465103, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.09574870020151138, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0027158609591424465, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.071765899658203, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.004416641313582659, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.348091125488281, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.15586477518081665, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.260726928710938, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.16134996712207794, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.165922164916992, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.01028165128082037, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 6.725041389465332, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.03005221113562584, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 6.4409894943237305, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.05953496694564819, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.860496997833252, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.27338314056396484, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.075718402862549, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.1508130133152008, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.6332329511642456, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 
0.015237169340252876, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.12775494158267975, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0019553194288164377, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.127439498901367, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.005100429989397526, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 14.864448547363281, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.1321902871131897, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.453117370605469, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.08750463277101517, "pnorm/_forward_module.model.norm.weight": 18.64663314819336, "gnorm/_forward_module.model.norm.weight": 0.016924038529396057, "pnorm/_forward_module.lm_head.weight": 173.29327392578125, "gnorm/_forward_module.lm_head.weight": 0.0799047201871872} +{"step": 461373440, "pnorm/_forward_module.model.embeddings.weight": 94.00251007080078, "gnorm/_forward_module.model.embeddings.weight": 0.18276256322860718, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.131418228149414, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.006807558238506317, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.629365921020508, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.027026822790503502, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.645265102386475, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.02589160017669201, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.86925745010376, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.2216644287109375, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.819730758666992, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.24460923671722412, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2663227319717407, 
"gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.04569048061966896, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.099234439432621, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0036534531973302364, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.069828033447266, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.003797166980803013, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.364992141723633, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.12680019438266754, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.268211364746094, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.10956178605556488, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.188566207885742, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.007756480481475592, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 6.8157525062561035, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.022618860006332397, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 6.496270179748535, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.047540925443172455, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.875314712524414, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.1739145815372467, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.0987701416015625, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.10531287640333176, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.6767663955688477, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.011981161311268806, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.13121762871742249, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0013962923549115658, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.14323616027832, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 
0.0035912555176764727, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 14.932988166809082, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.09816188365221024, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.490575790405273, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.06795324385166168, "pnorm/_forward_module.model.norm.weight": 18.774126052856445, "gnorm/_forward_module.model.norm.weight": 0.017625225707888603, "pnorm/_forward_module.lm_head.weight": 176.4181365966797, "gnorm/_forward_module.lm_head.weight": 0.060488589107990265} +{"step": 482344960, "pnorm/_forward_module.model.embeddings.weight": 94.53687286376953, "gnorm/_forward_module.model.embeddings.weight": 0.23293745517730713, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.12751007080078, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.00979567039757967, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.652945518493652, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.02339017391204834, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.6681342124938965, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.026332011446356773, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.862423896789551, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.33439433574676514, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.814183712005615, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.3625766336917877, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2706624269485474, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.029478365555405617, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.1021341010928154, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0034781433641910553, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.06745147705078, 
"gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0056021190248429775, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.380741119384766, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.18623214960098267, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.27484130859375, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.1691756397485733, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.212291717529297, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.011105773039162159, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 6.91453218460083, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.035617899149656296, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 6.5556511878967285, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.07290422171354294, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.889091491699219, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.30362197756767273, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.120240688323975, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.16175997257232666, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.7157282829284668, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.01843256689608097, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.13419915735721588, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0028629445005208254, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.15849494934082, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.006687602493911982, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 14.9977388381958, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.16764725744724274, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.527640342712402, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 
0.12748929858207703, "pnorm/_forward_module.model.norm.weight": 18.901926040649414, "gnorm/_forward_module.model.norm.weight": 0.020733606070280075, "pnorm/_forward_module.lm_head.weight": 179.38880920410156, "gnorm/_forward_module.lm_head.weight": 0.08528453856706619} +{"step": 503316480, "pnorm/_forward_module.model.embeddings.weight": 95.04638671875, "gnorm/_forward_module.model.embeddings.weight": 0.15532135963439941, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.122041702270508, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.006966985296458006, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.672325134277344, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.02717134915292263, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.686349868774414, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.02666059508919716, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.85491418838501, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.23515468835830688, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.807901859283447, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.2511366605758667, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2733782529830933, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.04113561660051346, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.10550642013549805, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0022787144407629967, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.065458297729492, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.003960816189646721, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.395707130432129, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.13061442971229553, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.28096866607666, 
"gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.11754515767097473, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.23788070678711, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.007954106666147709, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 7.015314102172852, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.025775950402021408, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 6.61585807800293, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.047980569303035736, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.903748512268066, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.1957806497812271, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.142185688018799, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.11624333262443542, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.7581123113632202, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.014558468945324421, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.13758933544158936, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.001041799783706665, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.173145294189453, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.004071169067174196, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.0595121383667, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.10523778945207596, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.559745788574219, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.06728274375200272, "pnorm/_forward_module.model.norm.weight": 19.026662826538086, "gnorm/_forward_module.model.norm.weight": 0.013171317987143993, "pnorm/_forward_module.lm_head.weight": 182.18899536132812, "gnorm/_forward_module.lm_head.weight": 0.05917133390903473} +{"step": 524288000, "pnorm/_forward_module.model.embeddings.weight": 
95.53457641601562, "gnorm/_forward_module.model.embeddings.weight": 0.22034896910190582, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.116127014160156, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.008337992243468761, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.689245700836182, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.03099421039223671, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.702913761138916, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.03153468668460846, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.847163677215576, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.27972444891929626, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.801424026489258, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.29661962389945984, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.276160717010498, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.03266516327857971, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.1089356318116188, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.005590545944869518, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.06358528137207, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.00514281215146184, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.4102201461792, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.1522119790315628, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.286970138549805, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.13961637020111084, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.26641845703125, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.009095106273889542, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 7.124856472015381, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.028037674725055695, 
"pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 6.680570125579834, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.06301098316907883, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.917715549468994, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.21195439994335175, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.164060115814209, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.12783999741077423, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.8040308952331543, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.01214833278208971, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.14120244979858398, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0016094680177047849, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.186891555786133, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.005583187565207481, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.116930961608887, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.1352151781320572, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.589425086975098, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.09546215087175369, "pnorm/_forward_module.model.norm.weight": 19.14858627319336, "gnorm/_forward_module.model.norm.weight": 0.017785709351301193, "pnorm/_forward_module.lm_head.weight": 184.8389434814453, "gnorm/_forward_module.lm_head.weight": 0.07949421554803848} +{"step": 545259520, "pnorm/_forward_module.model.embeddings.weight": 96.00264739990234, "gnorm/_forward_module.model.embeddings.weight": 0.20117095112800598, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.10902976989746, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0071280295960605145, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.7032856941223145, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 
0.020687788724899292, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.715653419494629, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.021103909239172935, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.838868618011475, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.23939353227615356, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.793953895568848, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.24034568667411804, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2780157327651978, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.023903880268335342, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.11243792623281479, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0019787985365837812, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.060651779174805, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0035628043115139008, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.422528266906738, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.12547312676906586, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.291552543640137, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.1071537435054779, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.298749923706055, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.007662015035748482, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 7.2497687339782715, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.02259899117052555, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 6.754214286804199, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.04498670995235443, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.932610988616943, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.17681235074996948, 
"pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.187340259552002, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.09144915640354156, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.8466691970825195, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.01845640130341053, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1444997489452362, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0028923251666128635, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.200286865234375, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0033834974747151136, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.170270919799805, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.08719339221715927, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.617233276367188, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.06491568684577942, "pnorm/_forward_module.model.norm.weight": 19.269508361816406, "gnorm/_forward_module.model.norm.weight": 0.01581449992954731, "pnorm/_forward_module.lm_head.weight": 187.33938598632812, "gnorm/_forward_module.lm_head.weight": 0.05652138963341713} +{"step": 566231040, "pnorm/_forward_module.model.embeddings.weight": 96.45067596435547, "gnorm/_forward_module.model.embeddings.weight": 0.22004501521587372, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.103315353393555, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.009375354275107384, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.717862606048584, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.025473136454820633, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.729610443115234, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.02675601840019226, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.833230018615723, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 
0.3289332389831543, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.789061069488525, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.3475860059261322, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.278484582901001, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.02552069164812565, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.11501074582338333, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.003064599819481373, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.05790901184082, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.006465013138949871, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.434786796569824, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.18133848905563354, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.296317100524902, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.16377007961273193, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.331037521362305, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.009125569835305214, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 7.376690864562988, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.03462328016757965, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 6.830111503601074, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.06784458458423615, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.947010517120361, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.2766604721546173, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.20965576171875, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.15607833862304688, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.887609601020813, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.016413796693086624, 
"pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.14778995513916016, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0019539541099220514, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.211517333984375, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.007291567046195269, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.217272758483887, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.17318810522556305, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.641464233398438, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.11966361105442047, "pnorm/_forward_module.model.norm.weight": 19.38921546936035, "gnorm/_forward_module.model.norm.weight": 0.017861122265458107, "pnorm/_forward_module.lm_head.weight": 189.70346069335938, "gnorm/_forward_module.lm_head.weight": 0.08456363528966904} +{"step": 587202560, "pnorm/_forward_module.model.embeddings.weight": 96.88018798828125, "gnorm/_forward_module.model.embeddings.weight": 0.1735384315252304, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.097389221191406, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.007479378953576088, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.726406097412109, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.022002913057804108, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.738077640533447, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.022471435368061066, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.828475475311279, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.2592656910419464, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.784884929656982, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.2785628139972687, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.278551459312439, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 
0.02430613711476326, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.11785029619932175, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.002355094300583005, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.053321838378906, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0037969087716192007, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.444052696228027, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.14658933877944946, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.299330711364746, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.12882286310195923, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.363475799560547, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.00909586250782013, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 7.5015716552734375, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.023639777675271034, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 6.903359889984131, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.05382176861166954, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.961294651031494, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.22143489122390747, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.2315993309021, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.11884113401174545, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.923274040222168, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.01303075347095728, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.15039731562137604, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0018774199998006225, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.220705032348633, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0044664801098406315, 
"pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.259578704833984, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.12267091870307922, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.66280746459961, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.09408437460660934, "pnorm/_forward_module.model.norm.weight": 19.506816864013672, "gnorm/_forward_module.model.norm.weight": 0.008999792858958244, "pnorm/_forward_module.lm_head.weight": 191.95277404785156, "gnorm/_forward_module.lm_head.weight": 0.05207791551947594} +{"step": 608174080, "pnorm/_forward_module.model.embeddings.weight": 97.29241180419922, "gnorm/_forward_module.model.embeddings.weight": 0.20382501184940338, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.090051651000977, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.007904560305178165, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.730472087860107, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.027826420962810516, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.74221134185791, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.03123570792376995, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.822511672973633, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.306417316198349, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.7796220779418945, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.335592657327652, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2775789499282837, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.023581864312291145, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.12047639489173889, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0016218681121245027, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.05027198791504, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 
0.005104148294776678, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.454352378845215, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.1775524914264679, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.303309440612793, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.1624232977628708, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.39799690246582, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.010691205970942974, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 7.634333610534668, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.029091862961649895, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 6.980170249938965, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.06323990225791931, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.974838733673096, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.25570929050445557, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.252712249755859, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.13154585659503937, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.9607007503509521, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.018456030637025833, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.15323762595653534, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00257876212708652, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.23038101196289, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.004449437838047743, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.300716400146484, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.12886248528957367, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.684063911437988, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.09636989235877991, "pnorm/_forward_module.model.norm.weight": 
19.6219425201416, "gnorm/_forward_module.model.norm.weight": 0.011629029177129269, "pnorm/_forward_module.lm_head.weight": 194.076904296875, "gnorm/_forward_module.lm_head.weight": 0.0853302851319313} +{"step": 629145600, "pnorm/_forward_module.model.embeddings.weight": 97.68669891357422, "gnorm/_forward_module.model.embeddings.weight": 0.1636805683374405, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.081573486328125, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.007223108317703009, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.732177734375, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.026001984253525734, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.744466781616211, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.027224862948060036, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.816336154937744, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.2571079134941101, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.774085998535156, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.27747640013694763, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2768009901046753, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.028504854068160057, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.12301230430603027, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.002434689551591873, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.04452133178711, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.004219732712954283, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.460796356201172, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.1512661576271057, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.30492877960205, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.13173426687717438, 
"pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.435611724853516, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.008819940499961376, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 7.773662567138672, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.024418124929070473, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 7.059475898742676, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.05566051974892616, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 5.9901018142700195, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.20991916954517365, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.2763166427612305, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.11321206390857697, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.0000133514404297, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.019851189106702805, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1561778336763382, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.002848095027729869, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.23982048034668, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.004457150120288134, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.340049743652344, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.11663961410522461, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.703718185424805, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.07951927930116653, "pnorm/_forward_module.model.norm.weight": 19.735898971557617, "gnorm/_forward_module.model.norm.weight": 0.010668852366507053, "pnorm/_forward_module.lm_head.weight": 196.0958251953125, "gnorm/_forward_module.lm_head.weight": 0.07121716439723969} +{"step": 650117120, "pnorm/_forward_module.model.embeddings.weight": 98.06478881835938, "gnorm/_forward_module.model.embeddings.weight": 
0.15938271582126617, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.07219123840332, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.006006039213389158, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.731297492980957, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.02748878486454487, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.744622707366943, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.026533981785178185, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.8102545738220215, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.20619338750839233, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.768701553344727, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.22212035953998566, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2742520570755005, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.034607067704200745, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.12589605152606964, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.003589056199416518, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.038936614990234, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.003471721662208438, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.466378211975098, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.11965934187173843, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.306475639343262, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.09877663105726242, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.475366592407227, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.006739410571753979, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 7.9197893142700195, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.021391453221440315, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 
7.140645980834961, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.05054205656051636, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.005711555480957, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.14548586308956146, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.300064563751221, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.08618379384279251, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.037837505340576, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.013110117986798286, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1591188907623291, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.001963021932169795, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.248573303222656, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.003548850305378437, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.376485824584961, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.093282550573349, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.721556663513184, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.07089327275753021, "pnorm/_forward_module.model.norm.weight": 19.848438262939453, "gnorm/_forward_module.model.norm.weight": 0.012325823307037354, "pnorm/_forward_module.lm_head.weight": 198.01234436035156, "gnorm/_forward_module.lm_head.weight": 0.050430234521627426} +{"step": 671088640, "pnorm/_forward_module.model.embeddings.weight": 98.42638397216797, "gnorm/_forward_module.model.embeddings.weight": 0.2023174911737442, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.06395721435547, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.009707758203148842, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.729985237121582, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.029261939227581024, 
"pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.743839740753174, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.02807682193815708, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.806092262268066, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.3390304744243622, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.76533317565918, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.35863423347473145, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2732795476913452, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.03094956837594509, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.1287548989057541, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0021958923898637295, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.031354904174805, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.006607855204492807, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.468613624572754, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.1986837089061737, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.30615520477295, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.1711643934249878, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.51612091064453, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.012049910612404346, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 8.071556091308594, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.02388971857726574, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 7.224215984344482, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.06019391119480133, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.020712375640869, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.2963774502277374, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.3237504959106445, 
"gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.12947744131088257, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.0721936225891113, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.013252650387585163, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16182689368724823, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0019557576160877943, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.25555419921875, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.003794329008087516, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.408302307128906, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.10745280236005783, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.736898422241211, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.07972017675638199, "pnorm/_forward_module.model.norm.weight": 19.959779739379883, "gnorm/_forward_module.model.norm.weight": 0.01226205937564373, "pnorm/_forward_module.lm_head.weight": 199.82618713378906, "gnorm/_forward_module.lm_head.weight": 0.056313905864953995} +{"step": 692060160, "pnorm/_forward_module.model.embeddings.weight": 98.77183532714844, "gnorm/_forward_module.model.embeddings.weight": 0.20573918521404266, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.05557632446289, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.007639171089977026, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.727658748626709, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.02827189303934574, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.742532730102539, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.03340359032154083, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.8024187088012695, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.27409180998802185, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 
5.762091636657715, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.3130691945552826, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2716904878616333, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.0304715633392334, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.13165368139743805, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0041915299370884895, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.024314880371094, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0062354994006454945, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.470952987670898, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.18471813201904297, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.306315422058105, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.14491578936576843, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.55607795715332, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.010120926424860954, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 8.225302696228027, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.03127526864409447, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 7.308112144470215, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.07668560743331909, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.035125255584717, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.22492751479148865, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.347119331359863, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.12363525480031967, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.1043412685394287, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.0241320189088583, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16428865492343903, 
"gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.002711901906877756, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.26156997680664, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.006169233471155167, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.43750286102295, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.14955765008926392, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.751347541809082, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.10657127946615219, "pnorm/_forward_module.model.norm.weight": 20.069307327270508, "gnorm/_forward_module.model.norm.weight": 0.014878339134156704, "pnorm/_forward_module.lm_head.weight": 201.55836486816406, "gnorm/_forward_module.lm_head.weight": 0.08472712337970734} +{"step": 713031680, "pnorm/_forward_module.model.embeddings.weight": 99.1022720336914, "gnorm/_forward_module.model.embeddings.weight": 0.22916169464588165, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.04576301574707, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.010042784735560417, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.722154140472412, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.041266195476055145, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.7379021644592285, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.04137364402413368, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.797547817230225, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.3477133810520172, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.757993221282959, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.4053873121738434, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2712900638580322, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.04868757352232933, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 
0.13438837230205536, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.003433995181694627, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.01668357849121, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.007419743575155735, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.471571922302246, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.22989535331726074, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.305148124694824, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.18718764185905457, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.597389221191406, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.011514841578900814, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 8.378793716430664, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.037647731602191925, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 7.390815258026123, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.10081840306520462, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.050039768218994, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.3397084176540375, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.371720790863037, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.18449492752552032, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.1350040435791016, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.019360367208719254, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16659072041511536, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0018619300099089742, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.268321990966797, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.010180658660829067, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.466082572937012, 
"gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.24584978818893433, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.766352653503418, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.17454735934734344, "pnorm/_forward_module.model.norm.weight": 20.177553176879883, "gnorm/_forward_module.model.norm.weight": 0.018703162670135498, "pnorm/_forward_module.lm_head.weight": 203.2186737060547, "gnorm/_forward_module.lm_head.weight": 0.11267819255590439} +{"step": 734003200, "pnorm/_forward_module.model.embeddings.weight": 99.41828155517578, "gnorm/_forward_module.model.embeddings.weight": 0.16045638918876648, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.03580665588379, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.006328540854156017, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.713886737823486, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.022718990221619606, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.730217933654785, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.02344977669417858, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.793262958526611, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.26672127842903137, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.754383087158203, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.30776122212409973, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2701600790023804, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.027197683230042458, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.13735666871070862, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0017019683727994561, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 16.00797462463379, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.005435958504676819, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 
14.469038009643555, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.17133906483650208, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.302346229553223, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.13347741961479187, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.640018463134766, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.00862868782132864, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 8.534324645996094, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.02631223015487194, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 7.474114894866943, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.06729573011398315, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.065020561218262, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.2130332589149475, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.396427631378174, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.11398731917142868, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.1629278659820557, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.01139635406434536, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16882263123989105, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0012246110709384084, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.273733139038086, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.004698020406067371, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.491894721984863, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.12458794564008713, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.77895736694336, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.09849343448877335, "pnorm/_forward_module.model.norm.weight": 20.28345489501953, "gnorm/_forward_module.model.norm.weight": 0.014805878512561321, 
"pnorm/_forward_module.lm_head.weight": 204.81576538085938, "gnorm/_forward_module.lm_head.weight": 0.06694518774747849} +{"step": 754974720, "pnorm/_forward_module.model.embeddings.weight": 99.71966552734375, "gnorm/_forward_module.model.embeddings.weight": 0.15029793977737427, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.025001525878906, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.006898676976561546, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.7045488357543945, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.02405928261578083, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.721875190734863, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.02591676451265812, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.788855075836182, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.24748513102531433, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.751063346862793, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.2838912904262543, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.269788384437561, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.025977246463298798, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.14054937660694122, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0035993424244225025, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.999175071716309, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0045345136895775795, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.466290473937988, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.15724687278270721, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.2999906539917, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.1246451810002327, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.684524536132812, 
"gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.008167837746441364, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 8.692224502563477, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.021852541714906693, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 7.55929708480835, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.05597616732120514, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.080973148345947, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.18784654140472412, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.4225172996521, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.09960131347179413, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.191275119781494, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.017015784978866577, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1711260825395584, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0026595895178616047, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.277015686035156, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.003529248759150505, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.512235641479492, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.09780817478895187, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.789041519165039, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.07379096001386642, "pnorm/_forward_module.model.norm.weight": 20.387346267700195, "gnorm/_forward_module.model.norm.weight": 0.014528797939419746, "pnorm/_forward_module.lm_head.weight": 206.3470458984375, "gnorm/_forward_module.lm_head.weight": 0.046683263033628464} +{"step": 775946240, "pnorm/_forward_module.model.embeddings.weight": 100.00704956054688, "gnorm/_forward_module.model.embeddings.weight": 0.2941301465034485, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 
16.014699935913086, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.011541558429598808, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.695476531982422, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.03180893138051033, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.7142415046691895, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.03302828222513199, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.785155296325684, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.45532307028770447, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.748058795928955, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.5084837675094604, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2693874835968018, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.029900360852479935, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.14333626627922058, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0029146773740649223, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.988802909851074, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.008055591024458408, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.461150169372559, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.278753399848938, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.29558277130127, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.23493948578834534, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.728219985961914, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.017841152846813202, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 8.846673965454102, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.036290764808654785, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 7.64326286315918, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 
0.11306578665971756, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.096703052520752, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.44159573316574097, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.448803901672363, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.173120379447937, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.2153208255767822, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.009259268641471863, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1729714721441269, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0005688453675247729, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.28045654296875, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0070902579464018345, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.531828880310059, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.19970548152923584, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.799120903015137, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.15621861815452576, "pnorm/_forward_module.model.norm.weight": 20.489822387695312, "gnorm/_forward_module.model.norm.weight": 0.008953014388680458, "pnorm/_forward_module.lm_head.weight": 207.82281494140625, "gnorm/_forward_module.lm_head.weight": 0.08722332864999771} +{"step": 796917760, "pnorm/_forward_module.model.embeddings.weight": 100.27958679199219, "gnorm/_forward_module.model.embeddings.weight": 0.1832289844751358, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 16.005470275878906, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0069400048814713955, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.685826301574707, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.03822636604309082, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.706052303314209, 
"gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.033523332327604294, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.782008647918701, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.2706996500492096, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.745550632476807, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.33682262897491455, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.268787145614624, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.05000722035765648, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.14555391669273376, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0057035330682992935, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.978593826293945, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.006318510975688696, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.455473899841309, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.2062811255455017, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.291293144226074, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.15104669332504272, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.768680572509766, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.009643254801630974, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 8.991643905639648, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.028506657108664513, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 7.719318389892578, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.07179594039916992, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.110976219177246, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.2041086107492447, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.473086357116699, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.10774476826190948, 
"pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.241485357284546, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.00873855222016573, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.17500139772891998, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0007690155762247741, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.283525466918945, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.005600798409432173, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.550729751586914, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.12909932434558868, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.808244705200195, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.1004580557346344, "pnorm/_forward_module.model.norm.weight": 20.588937759399414, "gnorm/_forward_module.model.norm.weight": 0.009729093872010708, "pnorm/_forward_module.lm_head.weight": 209.23521423339844, "gnorm/_forward_module.lm_head.weight": 0.05669531598687172} +{"step": 817889280, "pnorm/_forward_module.model.embeddings.weight": 100.53945922851562, "gnorm/_forward_module.model.embeddings.weight": 0.2048853039741516, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.994409561157227, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.009433403611183167, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.67338752746582, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.034393519163131714, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.694585800170898, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.03784063085913658, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.777283191680908, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.3501676321029663, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.741778373718262, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 
0.44901904463768005, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2697489261627197, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.04371098056435585, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.14827734231948853, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.006499612703919411, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.968451499938965, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.008888251148164272, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.448187828063965, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.2834368944168091, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.286165237426758, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.2052830010652542, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.809907913208008, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.014295912347733974, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 9.133918762207031, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.04031810536980629, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 7.7953877449035645, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.1146523505449295, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.126290321350098, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.33595117926597595, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.499974250793457, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.15768085420131683, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.259138345718384, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.01477715466171503, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.17634497582912445, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0011085477890446782, 
"pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.287715911865234, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.008569443598389626, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.570694923400879, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.2262781411409378, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.81857681274414, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.1694159209728241, "pnorm/_forward_module.model.norm.weight": 20.688167572021484, "gnorm/_forward_module.model.norm.weight": 0.007088753394782543, "pnorm/_forward_module.lm_head.weight": 210.61212158203125, "gnorm/_forward_module.lm_head.weight": 0.08050274103879929} +{"step": 838860800, "pnorm/_forward_module.model.embeddings.weight": 100.78669738769531, "gnorm/_forward_module.model.embeddings.weight": 0.1814885139465332, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.98440933227539, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.007136930711567402, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.662417888641357, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.03654519468545914, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.685204029083252, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.032343778759241104, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.773070335388184, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.26074478030204773, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.738387107849121, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.31438031792640686, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2728660106658936, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.03681902587413788, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.1507289558649063, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 
0.0008591669029556215, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.958623886108398, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0061768339946866035, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.440506935119629, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.18882790207862854, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.280240058898926, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.14375153183937073, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.848556518554688, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.009691792540252209, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 9.26003360748291, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.02757353149354458, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 7.864212989807129, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.06520722061395645, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.1426472663879395, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.2088884860277176, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.528327941894531, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.09573116898536682, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.273344039916992, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.012824106961488724, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.17733317613601685, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00122366554569453, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.28965950012207, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0032275246921926737, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.585335731506348, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.09093200415372849, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 
10.825887680053711, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.06821323931217194, "pnorm/_forward_module.model.norm.weight": 20.784013748168945, "gnorm/_forward_module.model.norm.weight": 0.010386320762336254, "pnorm/_forward_module.lm_head.weight": 211.93174743652344, "gnorm/_forward_module.lm_head.weight": 0.0450613759458065} +{"step": 859832320, "pnorm/_forward_module.model.embeddings.weight": 101.0207748413086, "gnorm/_forward_module.model.embeddings.weight": 0.18346385657787323, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.974037170410156, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.007925605401396751, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.650774002075195, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.029444964602589607, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.67523717880249, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.03195954114198685, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.768524169921875, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.3616202771663666, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.73456335067749, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.48957133293151855, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2762458324432373, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.03917951136827469, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.15295782685279846, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.004500327631831169, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.948342323303223, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.011855985037982464, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.431108474731445, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.3272835612297058, 
"pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.273822784423828, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.22896456718444824, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.886762619018555, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.012894567102193832, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 9.379981994628906, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.04459884762763977, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 7.930870056152344, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.11406629532575607, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.158458709716797, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.2992749810218811, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.5570197105407715, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.15945610404014587, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.2892327308654785, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.016015561297535896, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.17842204868793488, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0021379359532147646, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.292537689208984, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.00965035893023014, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.601076126098633, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.23617495596408844, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.83443832397461, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.17591136693954468, "pnorm/_forward_module.model.norm.weight": 20.879352569580078, "gnorm/_forward_module.model.norm.weight": 0.017016399651765823, "pnorm/_forward_module.lm_head.weight": 213.21156311035156, "gnorm/_forward_module.lm_head.weight": 
0.0802975669503212} +{"step": 880803840, "pnorm/_forward_module.model.embeddings.weight": 101.24177551269531, "gnorm/_forward_module.model.embeddings.weight": 0.16789402067661285, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.964214324951172, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0070615303702652454, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.640183448791504, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.026845300570130348, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.66603422164917, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.026619650423526764, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.763975143432617, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.25907203555107117, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.730916976928711, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.32100245356559753, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2792272567749023, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.023407800123095512, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.15512621402740479, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.002280850661918521, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.939168930053711, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.006012500263750553, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.422830581665039, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.20594081282615662, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.26800537109375, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.15959183871746063, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.922321319580078, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.010323814116418362, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 
9.49052906036377, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.026982789859175682, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 7.991828918457031, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.06679588556289673, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.173741817474365, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.20969340205192566, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.584741592407227, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.09153863787651062, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3061506748199463, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.01397742424160242, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.17956143617630005, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0021234408486634493, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.294673919677734, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0033097751438617706, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.614846229553223, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.09369784593582153, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.8408784866333, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.06982149928808212, "pnorm/_forward_module.model.norm.weight": 20.97102165222168, "gnorm/_forward_module.model.norm.weight": 0.010389507748186588, "pnorm/_forward_module.lm_head.weight": 214.4483642578125, "gnorm/_forward_module.lm_head.weight": 0.0400870181620121} +{"step": 901775360, "pnorm/_forward_module.model.embeddings.weight": 101.44911193847656, "gnorm/_forward_module.model.embeddings.weight": 0.20068015158176422, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.955327987670898, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.009524423629045486, 
"pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.629510402679443, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.0245619248598814, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.6568474769592285, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.02525559812784195, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.759893894195557, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.3585737943649292, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.727430820465088, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.4296553134918213, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2833362817764282, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.025101590901613235, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.15728673338890076, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0018407750176265836, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.93018627166748, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.009933617897331715, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.413580894470215, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.2766164243221283, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.26155948638916, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.21431373059749603, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.95285415649414, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.01485259085893631, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 9.58083438873291, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.04534691199660301, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.044313430786133, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.09238847345113754, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.187687873840332, 
"gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.3050750195980072, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.611330032348633, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.13075096905231476, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.316879987716675, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.017985261976718903, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18027234077453613, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0025518012698739767, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.29717254638672, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.00512923626229167, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.628528594970703, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.13240854442119598, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.848450660705566, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.09399112313985825, "pnorm/_forward_module.model.norm.weight": 21.06087875366211, "gnorm/_forward_module.model.norm.weight": 0.011922747828066349, "pnorm/_forward_module.lm_head.weight": 215.62326049804688, "gnorm/_forward_module.lm_head.weight": 0.05497278273105621} +{"step": 922746880, "pnorm/_forward_module.model.embeddings.weight": 101.64248657226562, "gnorm/_forward_module.model.embeddings.weight": 0.19082310795783997, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.947284698486328, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.007863345555961132, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.6219658851623535, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.021198788657784462, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.6508097648620605, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.021913129836320877, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 
5.755426406860352, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.3009895980358124, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.723777770996094, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.381212055683136, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2862111330032349, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.02379879727959633, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.15912340581417084, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0016216224757954478, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.921552658081055, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.006690130103379488, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.403554916381836, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.24088314175605774, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.254583358764648, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.1815287172794342, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 16.98207664489746, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.01192416436970234, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 9.66530704498291, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.035881854593753815, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.093770027160645, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.09240707755088806, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.20151948928833, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.289766401052475, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.638235569000244, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.13748212158679962, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.326849937438965, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 
0.013054713606834412, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18095125257968903, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0018752665491774678, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.299930572509766, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0070016393437981606, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.642023086547852, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.17514893412590027, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.855536460876465, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.13465356826782227, "pnorm/_forward_module.model.norm.weight": 21.14870834350586, "gnorm/_forward_module.model.norm.weight": 0.007644960191100836, "pnorm/_forward_module.lm_head.weight": 216.75759887695312, "gnorm/_forward_module.lm_head.weight": 0.06925247609615326} +{"step": 943718400, "pnorm/_forward_module.model.embeddings.weight": 101.82350158691406, "gnorm/_forward_module.model.embeddings.weight": 0.17356620728969574, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.93988037109375, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.007010822184383869, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.614771366119385, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.027681749314069748, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.644596099853516, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.027988221496343613, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.751139163970947, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.2861108183860779, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.720247745513916, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.351108193397522, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2911932468414307, 
"gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.032926637679338455, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.16129165887832642, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.002102445112541318, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.91360092163086, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.005902737844735384, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.394438743591309, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.2197238802909851, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.248025894165039, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.17487017810344696, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.009201049804688, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.01172678917646408, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 9.74010181427002, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.03258282691240311, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.137377738952637, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.07780226320028305, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.214691638946533, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.2573513090610504, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.6639604568481445, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.11335118114948273, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3389036655426025, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.012234704568982124, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18159765005111694, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.001714679878205061, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.30173110961914, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.005596610717475414, 
"pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.65324592590332, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.14213071763515472, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.861074447631836, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.10636811703443527, "pnorm/_forward_module.model.norm.weight": 21.23310661315918, "gnorm/_forward_module.model.norm.weight": 0.007538175676018, "pnorm/_forward_module.lm_head.weight": 217.8425750732422, "gnorm/_forward_module.lm_head.weight": 0.05098746716976166} +{"step": 964689920, "pnorm/_forward_module.model.embeddings.weight": 101.9930648803711, "gnorm/_forward_module.model.embeddings.weight": 0.20876120030879974, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.932652473449707, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.007864234037697315, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.609504699707031, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.03434189409017563, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.640050888061523, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.03374037146568298, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.746485233306885, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.33770278096199036, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.716505527496338, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.42687711119651794, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2944157123565674, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.03733007609844208, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.1632675677537918, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.006102441344410181, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.906389236450195, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.00912489090114832, 
"pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.385597229003906, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.2765856385231018, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.241890907287598, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.20960266888141632, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.034765243530273, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.01435005385428667, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 9.810948371887207, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.04217510297894478, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.178857803344727, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.12021086364984512, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.226593017578125, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.2998025119304657, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.688065052032471, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.13232867419719696, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3504257202148438, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.023664681240916252, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18241749703884125, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.003028205130249262, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.304214477539062, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.005487241316586733, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.664689064025879, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.15205077826976776, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.86729907989502, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.11423949152231216, "pnorm/_forward_module.model.norm.weight": 21.31633949279785, 
"gnorm/_forward_module.model.norm.weight": 0.013164620846509933, "pnorm/_forward_module.lm_head.weight": 218.8914337158203, "gnorm/_forward_module.lm_head.weight": 0.09171262383460999} +{"step": 985661440, "pnorm/_forward_module.model.embeddings.weight": 102.14974212646484, "gnorm/_forward_module.model.embeddings.weight": 0.16487039625644684, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.927387237548828, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.007290668785572052, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.604660511016846, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.026181118562817574, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.635951995849609, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.023724831640720367, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.743817329406738, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.29293951392173767, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.714225769042969, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.4008455276489258, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.2981969118118286, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.03799102082848549, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.16524747014045715, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.004527045879513025, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.90011215209961, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.010372592136263847, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.37763786315918, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.30828744173049927, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.236249923706055, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.22842682898044586, 
"pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.054792404174805, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.013232003897428513, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 9.866479873657227, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.05138188228011131, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.213126182556152, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.10030994564294815, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.236813068389893, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.23962000012397766, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.708548545837402, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.0963435173034668, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3546767234802246, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.011565030552446842, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18273603916168213, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0012512424727901816, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.305910110473633, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0034745591692626476, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.674751281738281, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.1003837138414383, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.871770858764648, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.08384432643651962, "pnorm/_forward_module.model.norm.weight": 21.39535903930664, "gnorm/_forward_module.model.norm.weight": 0.00875948742032051, "pnorm/_forward_module.lm_head.weight": 219.88034057617188, "gnorm/_forward_module.lm_head.weight": 0.04462669789791107} +{"step": 1006632960, "pnorm/_forward_module.model.embeddings.weight": 102.29605865478516, "gnorm/_forward_module.model.embeddings.weight": 
0.17986048758029938, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.92068099975586, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.007733121979981661, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.598412990570068, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.027450526133179665, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.630649566650391, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.026769110932946205, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.739149570465088, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.3397288918495178, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.71005392074585, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.4974135160446167, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3009029626846313, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.04087759554386139, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.16694039106369019, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.006430420093238354, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.893454551696777, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.015873372554779053, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.36810302734375, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.40317636728286743, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.229598045349121, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.27948319911956787, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.076663970947266, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.016966352239251137, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 9.922720909118652, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.06768778711557388, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 
8.246980667114258, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.14271363615989685, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.248295783996582, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.26796334981918335, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.732372760772705, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.12933741509914398, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.361959218978882, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.019832491874694824, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18320026993751526, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0037869815714657307, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.308120727539062, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.008340614847838879, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.684333801269531, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.19745224714279175, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.877388000488281, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.14926695823669434, "pnorm/_forward_module.model.norm.weight": 21.473039627075195, "gnorm/_forward_module.model.norm.weight": 0.01367971207946539, "pnorm/_forward_module.lm_head.weight": 220.82443237304688, "gnorm/_forward_module.lm_head.weight": 0.05586986988782883} +{"step": 1027604480, "pnorm/_forward_module.model.embeddings.weight": 102.4328842163086, "gnorm/_forward_module.model.embeddings.weight": 0.14153797924518585, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.9141206741333, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0053754993714392185, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.593119144439697, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.020610470324754715, 
"pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.626016616821289, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.02075980231165886, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.733950138092041, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.2275284230709076, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.705578804016113, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.31673505902290344, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3042224645614624, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.0268938560038805, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.16871759295463562, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0026769316755235195, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.88742733001709, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.007082389667630196, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.359513282775879, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.21420371532440186, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.223776817321777, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.14997003972530365, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.098005294799805, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.00915715005248785, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 9.975911140441895, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.027297521010041237, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.279853820800781, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.07506737858057022, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.25955057144165, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.21294289827346802, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.756106853485107, 
"gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.09636084735393524, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3693838119506836, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.011870460584759712, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18359726667404175, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0014974857913330197, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.30974578857422, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.004895792808383703, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.692460060119629, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.12879852950572968, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.881689071655273, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.09795743972063065, "pnorm/_forward_module.model.norm.weight": 21.548620223999023, "gnorm/_forward_module.model.norm.weight": 0.0111366156488657, "pnorm/_forward_module.lm_head.weight": 221.7332763671875, "gnorm/_forward_module.lm_head.weight": 0.0535142719745636} +{"step": 1048576000, "pnorm/_forward_module.model.embeddings.weight": 102.56021118164062, "gnorm/_forward_module.model.embeddings.weight": 0.1693047136068344, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.90796947479248, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.007335268892347813, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.590385437011719, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.020377231761813164, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.623666763305664, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.02136346697807312, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.728721618652344, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.31203457713127136, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 
5.70127534866333, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.41276323795318604, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3064018487930298, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.024862375110387802, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.17037126421928406, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0019539035856723785, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.881587982177734, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.01187108550220728, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.35114860534668, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.319108247756958, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.218084335327148, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.2408890724182129, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.11786460876465, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.016008788719773293, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.025984764099121, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.06185256689786911, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.311248779296875, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.11557826399803162, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.270578384399414, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.30105558037757874, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.779449462890625, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.11350803822278976, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3762025833129883, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.014851836487650871, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18399572372436523, 
"gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.002400873461738229, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.31165313720703, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.004692550748586655, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.701004981994629, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.13482093811035156, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.886446952819824, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.1011568009853363, "pnorm/_forward_module.model.norm.weight": 21.62242317199707, "gnorm/_forward_module.model.norm.weight": 0.00772079499438405, "pnorm/_forward_module.lm_head.weight": 222.6094207763672, "gnorm/_forward_module.lm_head.weight": 0.04369160160422325} +{"step": 1069547520, "pnorm/_forward_module.model.embeddings.weight": 102.67849731445312, "gnorm/_forward_module.model.embeddings.weight": 0.19082877039909363, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.90213680267334, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0069014085456728935, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.5854268074035645, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.029337555170059204, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.619521141052246, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.02587662823498249, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.724156379699707, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.29395750164985657, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.697343349456787, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.3541196882724762, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.308714747428894, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.03794652968645096, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 
0.17158685624599457, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.001444804249331355, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.875810623168945, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.006337857339531183, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.342477798461914, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.23104946315288544, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.212176322937012, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.18747903406620026, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.136703491210938, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.011965814977884293, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.071950912475586, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.027294037863612175, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.340375900268555, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.07525932043790817, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.280647277832031, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.29078295826911926, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.801392555236816, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.10596686601638794, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3830528259277344, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.0188151765614748, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18432655930519104, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0025812797248363495, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.313566207885742, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0043296655640006065, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.708402633666992, 
"gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.1137683093547821, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.8912935256958, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.08484566956758499, "pnorm/_forward_module.model.norm.weight": 21.694154739379883, "gnorm/_forward_module.model.norm.weight": 0.010983165353536606, "pnorm/_forward_module.lm_head.weight": 223.4422149658203, "gnorm/_forward_module.lm_head.weight": 0.05825776234269142} +{"step": 1090519040, "pnorm/_forward_module.model.embeddings.weight": 102.78712463378906, "gnorm/_forward_module.model.embeddings.weight": 0.20217010378837585, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.895978927612305, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0085111940279603, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.581046104431152, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.025045957416296005, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.615750312805176, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.02590840682387352, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.719194412231445, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.33764567971229553, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.693104267120361, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.5232690572738647, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.310657024383545, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.026938248425722122, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.17312565445899963, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.003408517688512802, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.870555877685547, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.018705032765865326, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 
14.33444595336914, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.49030977487564087, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.206950187683105, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.3770294785499573, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.155467987060547, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.023661073297262192, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.116619110107422, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.11396446079015732, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.367951393127441, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.20313650369644165, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.29062032699585, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.2700331211090088, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.823119163513184, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.11225222796201706, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.388913869857788, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.02665448561310768, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1846686750650406, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.004765696823596954, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.315032958984375, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.006634844932705164, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.715108871459961, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.15584735572338104, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.895450592041016, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.1280859261751175, "pnorm/_forward_module.model.norm.weight": 21.763442993164062, "gnorm/_forward_module.model.norm.weight": 0.012544789351522923, 
"pnorm/_forward_module.lm_head.weight": 224.2252655029297, "gnorm/_forward_module.lm_head.weight": 0.06585974246263504} +{"step": 1111490560, "pnorm/_forward_module.model.embeddings.weight": 102.88382720947266, "gnorm/_forward_module.model.embeddings.weight": 0.2501932978630066, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.893465995788574, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.01106497272849083, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.581554412841797, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.03750867024064064, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.616550922393799, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.03368225321173668, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.7172017097473145, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.44433897733688354, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.691493988037109, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.5391713976860046, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3124750852584839, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.049982912838459015, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.1742357760667801, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.00895225815474987, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.866301536560059, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.011020827107131481, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.327747344970703, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.3501618206501007, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.202301025390625, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.2728751599788666, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.168426513671875, 
"gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.017109055072069168, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.14897632598877, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.05087071284651756, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.388800621032715, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.1033000573515892, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.297657012939453, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.3988055884838104, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.837750434875488, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.15205317735671997, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3945491313934326, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.018211262300610542, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18504619598388672, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0021102491300553083, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.3171443939209, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.00656572449952364, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.723258018493652, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.1738688349723816, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.899774551391602, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.12663249671459198, "pnorm/_forward_module.model.norm.weight": 21.825292587280273, "gnorm/_forward_module.model.norm.weight": 0.007236046716570854, "pnorm/_forward_module.lm_head.weight": 224.94003295898438, "gnorm/_forward_module.lm_head.weight": 0.06637335568666458} +{"step": 1132462080, "pnorm/_forward_module.model.embeddings.weight": 102.96873474121094, "gnorm/_forward_module.model.embeddings.weight": 0.1559503972530365, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 
15.891016006469727, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.006524207070469856, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.580652713775635, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.019349709153175354, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.61602258682251, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.01936171017587185, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.715234279632568, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.2740393579006195, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.689601421356201, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.34906846284866333, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3143556118011475, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.023494109511375427, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.17545589804649353, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0011950285406783223, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.861599922180176, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.009083227254450321, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.320484161376953, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.2716090679168701, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.197393417358398, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.20954608917236328, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.182083129882812, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.011893347837030888, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.180588722229004, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.04656878113746643, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.40943717956543, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 
0.08488254249095917, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.304362773895264, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.24403584003448486, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.852314472198486, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.09432264417409897, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.400144100189209, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.014670162461698055, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18529288470745087, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.002170954365283251, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.318035125732422, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.003032667562365532, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.728605270385742, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.09854593127965927, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.902604103088379, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.07503321021795273, "pnorm/_forward_module.model.norm.weight": 21.886613845825195, "gnorm/_forward_module.model.norm.weight": 0.008443300612270832, "pnorm/_forward_module.lm_head.weight": 225.6268310546875, "gnorm/_forward_module.lm_head.weight": 0.04605492204427719} +{"step": 1153433600, "pnorm/_forward_module.model.embeddings.weight": 103.05156707763672, "gnorm/_forward_module.model.embeddings.weight": 0.15612585842609406, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.887085914611816, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.005690640304237604, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.577545642852783, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.019731732085347176, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.613215923309326, 
"gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.02038753405213356, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.711871147155762, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.25409597158432007, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.6867146492004395, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.36728808283805847, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3163366317749023, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.027507316321134567, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.17658911645412445, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.003947122022509575, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.857359886169434, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.01193216722458601, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.313462257385254, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.3103000521659851, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.192708015441895, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.22500650584697723, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.195913314819336, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.013041067868471146, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.211369514465332, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.05031440779566765, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.428926467895508, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.1023046001791954, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.3117995262146, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.25228843092918396, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.868342399597168, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.09457438439130783, 
"pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.406120538711548, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.012620446272194386, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18558672070503235, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0019405399216338992, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.319320678710938, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.004176177550107241, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.733711242675781, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.11090563237667084, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.90592098236084, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.0983099490404129, "pnorm/_forward_module.model.norm.weight": 21.9486083984375, "gnorm/_forward_module.model.norm.weight": 0.009846093133091927, "pnorm/_forward_module.lm_head.weight": 226.3025665283203, "gnorm/_forward_module.lm_head.weight": 0.04789487645030022} +{"step": 1174405120, "pnorm/_forward_module.model.embeddings.weight": 103.12883758544922, "gnorm/_forward_module.model.embeddings.weight": 0.1419372856616974, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.882060050964355, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.004603615030646324, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.57413387298584, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.02060488611459732, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.610162734985352, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.020885169506072998, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.707035064697266, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.23249083757400513, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.682711124420166, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 
0.3073248565196991, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3179574012756348, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.027879172936081886, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.17760980129241943, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.004162895958870649, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.852840423583984, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.00832675863057375, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.306145668029785, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.22844967246055603, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.18792724609375, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.17454609274864197, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.210737228393555, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.009796379134058952, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.24425983428955, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.03274759277701378, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.449557304382324, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.07083716988563538, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.319865703582764, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.21880894899368286, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.8865966796875, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.07812142372131348, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4112679958343506, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.009993444196879864, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18579219281673431, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0012294561602175236, 
"pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.32147979736328, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0026528793387115, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.739933967590332, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.08591285347938538, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.90973949432373, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.06370110809803009, "pnorm/_forward_module.model.norm.weight": 22.009458541870117, "gnorm/_forward_module.model.norm.weight": 0.009036269970238209, "pnorm/_forward_module.lm_head.weight": 226.94886779785156, "gnorm/_forward_module.lm_head.weight": 0.035757578909397125} +{"step": 1195376640, "pnorm/_forward_module.model.embeddings.weight": 103.19895935058594, "gnorm/_forward_module.model.embeddings.weight": 0.16093100607395172, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.877222061157227, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.006269319914281368, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.572152137756348, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.018578652292490005, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.608633995056152, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.018949970602989197, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.702335357666016, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.26014575362205505, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.6786885261535645, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.33814769983291626, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3184679746627808, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.022070901468396187, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.17837318778038025, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 
0.0020456404890865088, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.848605155944824, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.006695673801004887, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.298807144165039, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.23964223265647888, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.18325138092041, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.17733590304851532, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.224916458129883, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.011893405579030514, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.275548934936523, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.028149278834462166, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.469714164733887, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.07910269498825073, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.327990531921387, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.24589358270168304, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.904640197753906, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.09415413439273834, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.415374994277954, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.019618002697825432, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18593524396419525, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.002801851835101843, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.323211669921875, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.005821161903440952, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.745012283325195, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.1347561776638031, 
"pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.913315773010254, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.11064442992210388, "pnorm/_forward_module.model.norm.weight": 22.06816291809082, "gnorm/_forward_module.model.norm.weight": 0.006446824874728918, "pnorm/_forward_module.lm_head.weight": 227.56544494628906, "gnorm/_forward_module.lm_head.weight": 0.0471494160592556} +{"step": 1216348160, "pnorm/_forward_module.model.embeddings.weight": 103.26216888427734, "gnorm/_forward_module.model.embeddings.weight": 0.19601409137248993, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.8729887008667, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.007215828634798527, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.570526123046875, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.023747660219669342, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.607322692871094, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.025270530954003334, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.698278427124023, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.320171594619751, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.67525053024292, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.4679461121559143, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3188250064849854, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.02741520293056965, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.179341122508049, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.003914662171155214, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.844778060913086, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.014351869933307171, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.292243957519531, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 
0.4204959571361542, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.179034233093262, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.32159262895584106, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.237720489501953, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.021546470001339912, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.304530143737793, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.09579608589410782, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.489147186279297, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.17237195372581482, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.335422039031982, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.2735241949558258, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.920714378356934, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.0979171022772789, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.420008897781372, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.018355663865804672, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18619240820407867, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.003074074862524867, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.32512664794922, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.003838050877675414, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.750205993652344, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.1107853353023529, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.916634559631348, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.08300897479057312, "pnorm/_forward_module.model.norm.weight": 22.124393463134766, "gnorm/_forward_module.model.norm.weight": 0.007111974060535431, "pnorm/_forward_module.lm_head.weight": 228.14535522460938, 
"gnorm/_forward_module.lm_head.weight": 0.04150500148534775} +{"step": 1237319680, "pnorm/_forward_module.model.embeddings.weight": 103.31824493408203, "gnorm/_forward_module.model.embeddings.weight": 0.15249167382717133, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.869119644165039, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.006876194849610329, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.568167209625244, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.01645149663090706, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.605643272399902, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.017080901190638542, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.694540023803711, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.24109135568141937, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.672000408172607, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.36109429597854614, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3198449611663818, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.019504515454173088, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.18012242019176483, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0027866805903613567, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.840935707092285, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.012135584838688374, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.2854585647583, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.3442583680152893, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.174476623535156, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.2656064033508301, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.24998664855957, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.01733585260808468, 
"pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.331338882446289, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.07172558456659317, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.507621765136719, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.13316218554973602, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.342595100402832, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.21800048649311066, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.936733245849609, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.07379638403654099, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4240949153900146, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.010898538865149021, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1863754689693451, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.001796509837731719, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.32634735107422, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.002839357126504183, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.754243850708008, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.07558145374059677, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.919464111328125, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.062425851821899414, "pnorm/_forward_module.model.norm.weight": 22.178955078125, "gnorm/_forward_module.model.norm.weight": 0.007467786315828562, "pnorm/_forward_module.lm_head.weight": 228.6957550048828, "gnorm/_forward_module.lm_head.weight": 0.035143572837114334} +{"step": 1258291200, "pnorm/_forward_module.model.embeddings.weight": 103.36893463134766, "gnorm/_forward_module.model.embeddings.weight": 0.14352966845035553, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.865558624267578, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 
0.00502143707126379, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.566891193389893, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.02071889117360115, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.604807376861572, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.02091052196919918, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.690766334533691, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.22926384210586548, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.6687469482421875, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.29022783041000366, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.32082998752594, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.025885822251439095, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.1808725893497467, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.003613131120800972, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.83779239654541, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0060124825686216354, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.279828071594238, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.21348963677883148, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.1707124710083, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.161245197057724, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.260967254638672, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.008908871561288834, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.355624198913574, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.03161897882819176, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.523452758789062, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.06385143101215363, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 
6.3487396240234375, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.2205084264278412, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.950456142425537, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.08015234023332596, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.427696943283081, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.006300655659288168, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18649475276470184, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00040469635860063136, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.32834243774414, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0026505901478230953, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.759354591369629, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.08500144630670547, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.922650337219238, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.06311117112636566, "pnorm/_forward_module.model.norm.weight": 22.230451583862305, "gnorm/_forward_module.model.norm.weight": 0.006500617600977421, "pnorm/_forward_module.lm_head.weight": 229.21385192871094, "gnorm/_forward_module.lm_head.weight": 0.039977580308914185} +{"step": 1279262720, "pnorm/_forward_module.model.embeddings.weight": 103.41455841064453, "gnorm/_forward_module.model.embeddings.weight": 0.13748426735401154, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.86188793182373, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.005250784568488598, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.56583309173584, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.020701607689261436, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.603922367095947, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.02087489701807499, 
"pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.686760902404785, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.20490768551826477, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.665287017822266, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.26018866896629333, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3220733404159546, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.02551218681037426, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.18181048333644867, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0037790341302752495, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.834797859191895, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.005377883091568947, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.274264335632324, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.16394460201263428, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.167102813720703, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.13686539232730865, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.2718448638916, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.008800630457699299, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.377555847167969, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.02308077923953533, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.537931442260742, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.05162964388728142, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.354912757873535, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.2003174126148224, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.964374542236328, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.07412245124578476, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4310851097106934, 
"gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.011934884823858738, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18663400411605835, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0015136540168896317, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.329374313354492, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.003168401075527072, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.76226806640625, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.0838395208120346, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.924881935119629, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.06583615392446518, "pnorm/_forward_module.model.norm.weight": 22.279447555541992, "gnorm/_forward_module.model.norm.weight": 0.008883069269359112, "pnorm/_forward_module.lm_head.weight": 229.70384216308594, "gnorm/_forward_module.lm_head.weight": 0.033489953726530075} +{"step": 1300234240, "pnorm/_forward_module.model.embeddings.weight": 103.45471954345703, "gnorm/_forward_module.model.embeddings.weight": 0.15033960342407227, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.8580322265625, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.005870689172297716, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.564122676849365, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.019962724298238754, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.602458953857422, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.02021753042936325, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.682665824890137, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.2305290848016739, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.661867141723633, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.36229875683784485, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 
1.3226439952850342, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.02106187306344509, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.18249118328094482, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0012514633126556873, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.831594467163086, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.014070906676352024, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.268144607543945, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.36959943175315857, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.16309642791748, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.2901357412338257, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.28300666809082, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.017655396834015846, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.400958061218262, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.07704304158687592, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.553579330444336, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.1399868279695511, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.361227035522461, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.19879098236560822, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.979035377502441, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.07693865150213242, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4343628883361816, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.014110813848674297, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18670450150966644, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0024068886414170265, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.330856323242188, 
"gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0026676191482692957, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.765628814697266, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.08448169380426407, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.927449226379395, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.06665822118520737, "pnorm/_forward_module.model.norm.weight": 22.327350616455078, "gnorm/_forward_module.model.norm.weight": 0.007984393276274204, "pnorm/_forward_module.lm_head.weight": 230.17123413085938, "gnorm/_forward_module.lm_head.weight": 0.044610850512981415} +{"step": 1321205760, "pnorm/_forward_module.model.embeddings.weight": 103.48985290527344, "gnorm/_forward_module.model.embeddings.weight": 0.1586742103099823, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.85388469696045, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.006464751437306404, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.56231164932251, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.01832476072013378, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.601312637329102, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.01903490535914898, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.678244113922119, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.28747138381004333, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.6582255363464355, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.49656909704208374, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.323103666305542, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.026671158149838448, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.1831931620836258, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.002754044719040394, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 
15.82864761352539, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.02379566803574562, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.262556076049805, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.5773115158081055, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.159608840942383, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.4283662736415863, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.29410743713379, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.024433070793747902, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.423687934875488, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.12433648854494095, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.568618774414062, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.21799218654632568, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.367772102355957, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.20335431396961212, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 6.993528842926025, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.07518452405929565, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4371323585510254, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.021668538451194763, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18678848445415497, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.004277435131371021, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.332304000854492, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.003717394545674324, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.7689790725708, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.09699637442827225, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.929998397827148, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 
0.07333303242921829, "pnorm/_forward_module.model.norm.weight": 22.37343406677246, "gnorm/_forward_module.model.norm.weight": 0.005692000966519117, "pnorm/_forward_module.lm_head.weight": 230.6092071533203, "gnorm/_forward_module.lm_head.weight": 0.0297714713960886} +{"step": 1342177280, "pnorm/_forward_module.model.embeddings.weight": 103.51941680908203, "gnorm/_forward_module.model.embeddings.weight": 0.16822947561740875, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.851408958435059, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.005851294379681349, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.56101655960083, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.020686153322458267, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.600114345550537, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.02084706351161003, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.675510406494141, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.25735533237457275, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.655917644500732, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.34024468064308167, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3244823217391968, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.025718094781041145, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.18398837745189667, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.002761024981737137, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.82624626159668, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.007572263013571501, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.257830619812012, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.2481101006269455, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.156598091125488, 
"gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.19244922697544098, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.30255699157715, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.010616090148687363, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.442078590393066, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.03832879662513733, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.581401824951172, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.07564505189657211, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.373035430908203, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.2591738998889923, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.004525184631348, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.08417227864265442, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4397401809692383, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.00829541590064764, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18690553307533264, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0010298852575942874, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.333314895629883, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0032268352806568146, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.771574974060059, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.09218841046094894, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.932046890258789, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.07652594149112701, "pnorm/_forward_module.model.norm.weight": 22.41591453552246, "gnorm/_forward_module.model.norm.weight": 0.006820782087743282, "pnorm/_forward_module.lm_head.weight": 231.01124572753906, "gnorm/_forward_module.lm_head.weight": 0.04575946554541588} +{"step": 1363148800, 
"pnorm/_forward_module.model.embeddings.weight": 103.54528045654297, "gnorm/_forward_module.model.embeddings.weight": 0.14128082990646362, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.848926544189453, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0052526528015732765, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.560962200164795, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.019917353987693787, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.600122928619385, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.02231457643210888, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.672504425048828, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.22419096529483795, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.65335750579834, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.2977455258369446, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3254438638687134, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.021818142384290695, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.1846310794353485, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.002565359231084585, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.823596000671387, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.00634473143145442, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.252552032470703, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.20930215716362, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.15310287475586, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.15206699073314667, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.31096649169922, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.008776779286563396, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.460115432739258, 
"gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.02610628493130207, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.593746185302734, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.05865367129445076, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.377760887145996, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.22314226627349854, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.01533317565918, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.08555981516838074, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4421350955963135, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.010611223056912422, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18695023655891418, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0013683864381164312, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.334497451782227, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.003641231684014201, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.77426815032959, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.10380375385284424, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.93415355682373, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.07822990417480469, "pnorm/_forward_module.model.norm.weight": 22.45749282836914, "gnorm/_forward_module.model.norm.weight": 0.008198102936148643, "pnorm/_forward_module.lm_head.weight": 231.3944091796875, "gnorm/_forward_module.lm_head.weight": 0.041718531399965286} +{"step": 1384120320, "pnorm/_forward_module.model.embeddings.weight": 103.56842803955078, "gnorm/_forward_module.model.embeddings.weight": 0.13557375967502594, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.845648765563965, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.004298862535506487, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 
7.5607476234436035, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.014816117472946644, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.600045204162598, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.014881886541843414, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.668822288513184, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.19502411782741547, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.650341033935547, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.27465611696243286, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3259904384613037, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.016865991055965424, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.18528246879577637, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.001418713596649468, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.821333885192871, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0084452573210001, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.248059272766113, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.24259352684020996, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.1503267288208, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.19863130152225494, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.320009231567383, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.012293925508856773, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.478866577148438, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.05207940563559532, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.606327056884766, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.09338826686143875, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.382414817810059, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 
0.20947520434856415, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.026443958282471, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.06892281025648117, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4450769424438477, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.009700861759483814, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18709808588027954, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0013212142512202263, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.33608627319336, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.00239845784381032, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.777440071105957, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.07506823539733887, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.936391830444336, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.05984492227435112, "pnorm/_forward_module.model.norm.weight": 22.497509002685547, "gnorm/_forward_module.model.norm.weight": 0.0077978926710784435, "pnorm/_forward_module.lm_head.weight": 231.75697326660156, "gnorm/_forward_module.lm_head.weight": 0.029190029948949814} +{"step": 1405091840, "pnorm/_forward_module.model.embeddings.weight": 103.58805847167969, "gnorm/_forward_module.model.embeddings.weight": 0.17704421281814575, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.842683792114258, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.006420380901545286, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.560210227966309, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.018901217728853226, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.599890232086182, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.01837713085114956, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.665344715118408, 
"gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.3167971670627594, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.647508144378662, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.49669933319091797, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3270504474639893, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.029238147661089897, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.18592149019241333, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.004887334071099758, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.818957328796387, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.021291321143507957, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.24338436126709, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.544226348400116, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.147345542907715, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.4028022587299347, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.328561782836914, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.023983793333172798, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.496402740478516, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.1121957078576088, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.618083000183105, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.1950874626636505, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.386783123016357, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.2843721807003021, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.036723613739014, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.08969858288764954, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4479660987854004, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.015464667230844498, 
"pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1871987283229828, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0028115359600633383, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.337688446044922, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.004104888066649437, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.780496597290039, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.11050142347812653, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.938718795776367, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.08558712154626846, "pnorm/_forward_module.model.norm.weight": 22.535064697265625, "gnorm/_forward_module.model.norm.weight": 0.005655745044350624, "pnorm/_forward_module.lm_head.weight": 232.09449768066406, "gnorm/_forward_module.lm_head.weight": 0.03340433910489082} +{"step": 1426063360, "pnorm/_forward_module.model.embeddings.weight": 103.60350036621094, "gnorm/_forward_module.model.embeddings.weight": 0.16416800022125244, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.840807914733887, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.005914686713367701, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.5612335205078125, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.020249679684638977, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.601037979125977, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.019892461597919464, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.662753105163574, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.265220046043396, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.64537239074707, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.42388075590133667, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3276550769805908, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 
0.027132300660014153, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.1863531917333603, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.007123937364667654, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.817111015319824, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.020887959748506546, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.239326477050781, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.49789759516716003, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.144819259643555, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.38030514121055603, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.33542251586914, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.021844755858182907, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.51185417175293, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.10156505554914474, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.628463745117188, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.1782480627298355, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.390498161315918, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.19144092500209808, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.045079708099365, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.076044462621212, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4502856731414795, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.022066041827201843, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18725645542144775, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.003930238541215658, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.338775634765625, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.003231297479942441, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 
15.782342910766602, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.08561021089553833, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.940041542053223, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.06499693542718887, "pnorm/_forward_module.model.norm.weight": 22.570106506347656, "gnorm/_forward_module.model.norm.weight": 0.005072867032140493, "pnorm/_forward_module.lm_head.weight": 232.4082794189453, "gnorm/_forward_module.lm_head.weight": 0.043206341564655304} +{"step": 1447034880, "pnorm/_forward_module.model.embeddings.weight": 103.61613464355469, "gnorm/_forward_module.model.embeddings.weight": 0.12512144446372986, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.838805198669434, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.004742810036987066, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.561387062072754, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.01727762073278427, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.601365566253662, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.01790538802742958, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.660131454467773, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.21169213950634003, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.643191337585449, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.29025211930274963, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.328096866607666, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.022128432989120483, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.18677259981632233, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0026070408057421446, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.815399169921875, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.007420595269650221, 
"pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.235641479492188, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.2268880307674408, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.142457008361816, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.17601577937602997, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.34183120727539, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.010919305495917797, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.525917053222656, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.04469917714595795, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.638195037841797, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.08313810080289841, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.394107341766357, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.1991826891899109, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.053093910217285, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.07231798768043518, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4527997970581055, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.010432718321681023, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18736669421195984, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0016865036450326443, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.34016227722168, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0024731147568672895, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.784544944763184, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.0804297924041748, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.941776275634766, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.0575227215886116, "pnorm/_forward_module.model.norm.weight": 22.603429794311523, 
"gnorm/_forward_module.model.norm.weight": 0.006987396162003279, "pnorm/_forward_module.lm_head.weight": 232.70248413085938, "gnorm/_forward_module.lm_head.weight": 0.03319404274225235} +{"step": 1468006400, "pnorm/_forward_module.model.embeddings.weight": 103.6266098022461, "gnorm/_forward_module.model.embeddings.weight": 0.11784376204013824, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.836238861083984, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.004072663839906454, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.5607476234436035, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.017260944470763206, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.600841999053955, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.01741897128522396, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.65709114074707, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.1874098777770996, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.640650749206543, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.2448129653930664, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3283668756484985, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.018170541152358055, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.18723586201667786, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.001684588030911982, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.813399314880371, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0047325328923761845, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.231249809265137, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.166819229722023, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.139720916748047, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.13513806462287903, 
"pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.349088668823242, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.00793854147195816, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.541336059570312, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.025467896834015846, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.648601531982422, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.04703597351908684, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.398200988769531, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.18604539334774017, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.062459945678711, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.06106140464544296, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.455012321472168, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.006485276389867067, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18737132847309113, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0004363137704785913, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.341188430786133, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0018950958037748933, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.786161422729492, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.06448686122894287, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.943428993225098, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.05087348073720932, "pnorm/_forward_module.model.norm.weight": 22.635757446289062, "gnorm/_forward_module.model.norm.weight": 0.006375256460160017, "pnorm/_forward_module.lm_head.weight": 232.97764587402344, "gnorm/_forward_module.lm_head.weight": 0.02495025098323822} +{"step": 1488977920, "pnorm/_forward_module.model.embeddings.weight": 103.63502502441406, "gnorm/_forward_module.model.embeddings.weight": 
0.12525595724582672, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.833374977111816, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.005176465958356857, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.5603928565979, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.016463054344058037, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.6004509925842285, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.01690272055566311, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.653639316558838, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.21497507393360138, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.637834548950195, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.3260742425918579, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3284919261932373, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.02111223340034485, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.18760965764522552, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0019422966288402677, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.81173324584961, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.010615016333758831, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.227628707885742, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.3044610023498535, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.1375093460083, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.20368997752666473, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.356645584106445, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.011516961269080639, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.557284355163574, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.044732850044965744, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 
8.659327507019043, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.09295615553855896, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.402152061462402, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.1934778392314911, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.071730613708496, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.08009745180606842, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.457697629928589, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.007293563801795244, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18748490512371063, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0009519033483229578, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.342748641967773, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.00477993069216609, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.788857460021973, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.11772514879703522, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.945601463317871, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.09803363680839539, "pnorm/_forward_module.model.norm.weight": 22.666662216186523, "gnorm/_forward_module.model.norm.weight": 0.005294814705848694, "pnorm/_forward_module.lm_head.weight": 233.23519897460938, "gnorm/_forward_module.lm_head.weight": 0.03734014183282852} +{"step": 1509949440, "pnorm/_forward_module.model.embeddings.weight": 103.64129638671875, "gnorm/_forward_module.model.embeddings.weight": 0.10579540580511093, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.830851554870605, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.003935625310987234, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.559988975524902, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.015345334075391293, 
"pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.599981784820557, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.015547151677310467, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.65059757232666, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.17402905225753784, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.635280609130859, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.23006419837474823, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3289215564727783, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.018574625253677368, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.187955841422081, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0017780576599761844, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.809961318969727, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.005724478978663683, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.224075317382812, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.18553602695465088, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.135333061218262, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.13858506083488464, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.363773345947266, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.007398849818855524, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.572041511535645, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.02210908755660057, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.669160842895508, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.04836197569966316, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.405993938446045, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.17138950526714325, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.080352783203125, 
"gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.06152492016553879, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.459491491317749, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.008609456941485405, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1874905526638031, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0007781560416333377, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.343778610229492, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0023288200609385967, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.79022216796875, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.0680466890335083, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.947285652160645, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.05617404356598854, "pnorm/_forward_module.model.norm.weight": 22.695589065551758, "gnorm/_forward_module.model.norm.weight": 0.00654611736536026, "pnorm/_forward_module.lm_head.weight": 233.4747772216797, "gnorm/_forward_module.lm_head.weight": 0.03232118487358093} +{"step": 1530920960, "pnorm/_forward_module.model.embeddings.weight": 103.64556121826172, "gnorm/_forward_module.model.embeddings.weight": 0.1086268499493599, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.828282356262207, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.004230834078043699, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.559451103210449, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.016282713040709496, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.599457263946533, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.016602661460638046, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.647617340087891, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.181414395570755, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 
5.63282585144043, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.24864788353443146, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3292105197906494, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.019502364099025726, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.1884487271308899, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.004221632145345211, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.808218002319336, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.00768211530521512, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.220748901367188, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.20852912962436676, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.133333206176758, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.159249410033226, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.371028900146484, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.007205495145171881, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.587486267089844, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.02962147630751133, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.679491996765137, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.0605367049574852, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.409473896026611, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.16743820905685425, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.088663101196289, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.06256183981895447, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4615681171417236, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.010568764992058277, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18757808208465576, 
"gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0014556868700310588, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.345083236694336, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.003678097389638424, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.792156219482422, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.08591969311237335, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.94920825958252, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.0739172175526619, "pnorm/_forward_module.model.norm.weight": 22.722774505615234, "gnorm/_forward_module.model.norm.weight": 0.00814593955874443, "pnorm/_forward_module.lm_head.weight": 233.69757080078125, "gnorm/_forward_module.lm_head.weight": 0.031673118472099304} +{"step": 1551892480, "pnorm/_forward_module.model.embeddings.weight": 103.64827728271484, "gnorm/_forward_module.model.embeddings.weight": 0.16430118680000305, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.826019287109375, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.007219821680337191, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.55959939956665, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.019968243315815926, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.599719047546387, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.019668856635689735, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.644834995269775, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.27380499243736267, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.630554676055908, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.4796332120895386, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3293273448944092, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.03253737464547157, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 
0.18879783153533936, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.008070887066423893, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.806293487548828, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.022655915468931198, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.217239379882812, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.5860257148742676, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.131220817565918, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.4501757025718689, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.378009796142578, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.02529250644147396, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.601571083068848, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.12166523933410645, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.68887710571289, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.22139380872249603, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.41292667388916, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.2025134563446045, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.096667766571045, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.06709547340869904, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4638869762420654, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.02186456508934498, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18761523067951202, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0037069516256451607, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.346445083618164, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.002364952117204666, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.794303894042969, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 
0.07117582112550735, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.951177597045898, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.05694448947906494, "pnorm/_forward_module.model.norm.weight": 22.7482967376709, "gnorm/_forward_module.model.norm.weight": 0.006440988276153803, "pnorm/_forward_module.lm_head.weight": 233.9044647216797, "gnorm/_forward_module.lm_head.weight": 0.024666497483849525} +{"step": 1572864000, "pnorm/_forward_module.model.embeddings.weight": 103.6497573852539, "gnorm/_forward_module.model.embeddings.weight": 0.09805700927972794, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.823989868164062, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.003167147282510996, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.559340953826904, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.01673746295273304, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.599497318267822, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.015958191826939583, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.642392158508301, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.15308383107185364, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.628547191619873, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.21155616641044617, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3295456171035767, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.017618736252188683, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.18909648060798645, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0012121128384023905, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.805100440979004, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.005179642699658871, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.214725494384766, 
"gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.1794663965702057, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.129681587219238, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.12364791333675385, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.38364601135254, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0057256221771240234, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.613346099853516, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.02637496218085289, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.697161674499512, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.045990683138370514, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.416005611419678, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.1349646896123886, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.103688716888428, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.05720939859747887, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.465440273284912, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.0069733960554003716, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18765072524547577, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0006730034365318716, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.347442626953125, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.002268356503918767, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.79577922821045, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.06721076369285583, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.952780723571777, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.05392918735742569, "pnorm/_forward_module.model.norm.weight": 22.772043228149414, "gnorm/_forward_module.model.norm.weight": 0.007601771969348192, 
"pnorm/_forward_module.lm_head.weight": 234.09507751464844, "gnorm/_forward_module.lm_head.weight": 0.03114181011915207} +{"step": 1593835520, "pnorm/_forward_module.model.embeddings.weight": 103.65003967285156, "gnorm/_forward_module.model.embeddings.weight": 0.0958380326628685, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.822370529174805, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0033017899841070175, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.5591959953308105, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.0152738681063056, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.599560737609863, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.015353784896433353, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.640135288238525, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.15097065269947052, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.626777648925781, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.21059705317020416, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.330203652381897, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.018223002552986145, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.1895022988319397, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0007130720769055188, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.80378246307373, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.005082730669528246, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.211938858032227, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.1786067634820938, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.127945899963379, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.13163448870182037, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.388927459716797, 
"gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.007344543002545834, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.624950408935547, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.029046395793557167, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.705277442932129, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.05787286534905434, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.418876647949219, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.12829141318798065, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.109954357147217, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.054928310215473175, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4665684700012207, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.0048477305099368095, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1876564621925354, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0005236894357949495, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.34846305847168, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0027034906670451164, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.797205924987793, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.06717079877853394, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.954315185546875, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.055271293967962265, "pnorm/_forward_module.model.norm.weight": 22.794109344482422, "gnorm/_forward_module.model.norm.weight": 0.005736818537116051, "pnorm/_forward_module.lm_head.weight": 234.2694549560547, "gnorm/_forward_module.lm_head.weight": 0.027238210663199425} +{"step": 1614807040, "pnorm/_forward_module.model.embeddings.weight": 103.64924621582031, "gnorm/_forward_module.model.embeddings.weight": 0.09193377941846848, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 
15.820491790771484, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.003798572113737464, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.5592498779296875, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.013003661297261715, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.599783420562744, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.013490861281752586, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.637657165527344, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.15514330565929413, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.624810695648193, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.2248162031173706, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3303935527801514, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.014461982063949108, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.18967504799365997, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0015853133518248796, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.802528381347656, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.007419890724122524, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.20942211151123, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.2101893275976181, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.126388549804688, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.15857809782028198, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.394420623779297, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.008675494231283665, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.636146545410156, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.04257776215672493, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.713037490844727, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 
0.07534719258546829, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.421795845031738, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.1400366723537445, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.11634635925293, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.05608321353793144, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4684157371520996, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.0074734860099852085, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1877119094133377, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0011649065418168902, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.349689483642578, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0023951921612024307, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.798954963684082, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.06642499566078186, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.955988883972168, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.05295579880475998, "pnorm/_forward_module.model.norm.weight": 22.814533233642578, "gnorm/_forward_module.model.norm.weight": 0.005907109938561916, "pnorm/_forward_module.lm_head.weight": 234.429443359375, "gnorm/_forward_module.lm_head.weight": 0.028542593121528625} +{"step": 1635778560, "pnorm/_forward_module.model.embeddings.weight": 103.64778137207031, "gnorm/_forward_module.model.embeddings.weight": 0.10122296214103699, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.818528175354004, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0032898082863539457, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.559138298034668, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.015721391886472702, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.599639415740967, 
"gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.014972168952226639, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.635383605957031, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.15135514736175537, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.622967720031738, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.22233644127845764, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3303550481796265, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.017947262153029442, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.18984106183052063, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0016764559550210834, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.801271438598633, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.005142096430063248, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.206975936889648, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.17686349153518677, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.124924659729004, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.12599635124206543, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.40010643005371, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.007029845844954252, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.647396087646484, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.025393398478627205, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.720939636230469, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.05388186126947403, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.42457914352417, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.15268011391162872, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.1225738525390625, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 
0.054625801742076874, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4702341556549072, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.00843189936131239, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18776527047157288, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.001091569778509438, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.350820541381836, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.00209855567663908, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.800604820251465, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.06219576671719551, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.957615852355957, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.048637814819812775, "pnorm/_forward_module.model.norm.weight": 22.833410263061523, "gnorm/_forward_module.model.norm.weight": 0.006782717537134886, "pnorm/_forward_module.lm_head.weight": 234.5764923095703, "gnorm/_forward_module.lm_head.weight": 0.02602948434650898} +{"step": 1656750080, "pnorm/_forward_module.model.embeddings.weight": 103.64561462402344, "gnorm/_forward_module.model.embeddings.weight": 0.09801327437162399, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.816635131835938, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0034742476418614388, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.558749198913574, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.014659232459962368, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.599276542663574, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.014489964582026005, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.633196830749512, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.15535297989845276, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.621146202087402, 
"gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.22696048021316528, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3305662870407104, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.018990658223628998, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.19013448059558868, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.003102600108832121, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.80008602142334, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.007771460339426994, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.204726219177246, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.21997599303722382, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.123587608337402, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.1731049120426178, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.405380249023438, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.010205782018601894, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.658011436462402, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.042745549231767654, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.72813606262207, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.08022693544626236, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.4272027015686035, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.13395126163959503, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.128351211547852, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.04947648197412491, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4718940258026123, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.00734157906845212, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18779070675373077, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 
0.001089382218196988, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.3515682220459, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0015134315472096205, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.80158519744873, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.05429365858435631, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.958773612976074, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.04362107440829277, "pnorm/_forward_module.model.norm.weight": 22.85065269470215, "gnorm/_forward_module.model.norm.weight": 0.006834856234490871, "pnorm/_forward_module.lm_head.weight": 234.70916748046875, "gnorm/_forward_module.lm_head.weight": 0.02419815957546234} +{"step": 1677721600, "pnorm/_forward_module.model.embeddings.weight": 103.6429443359375, "gnorm/_forward_module.model.embeddings.weight": 0.09187097102403641, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.815103530883789, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0030010004993528128, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.559019565582275, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.013558976352214813, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.599661350250244, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.013690986670553684, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.631175518035889, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.14171825349330902, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.619488716125488, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.18821211159229279, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3308366537094116, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.015133887529373169, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.1903391182422638, 
"gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0011568169575184584, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.798959732055664, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0041819303296506405, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.20253849029541, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.13850583136081696, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.122276306152344, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.10360332578420639, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.409862518310547, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.006079886574298143, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.666851997375488, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.019848648458719254, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.734127044677734, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.03596179187297821, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.429611682891846, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.1346137374639511, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.133304595947266, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.04960794374346733, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.472990036010742, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.005139954853802919, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.187814861536026, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00042486831080168486, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.352628707885742, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0017375074094161391, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.802957534790039, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 
0.05495809391140938, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.960158348083496, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.044986542314291, "pnorm/_forward_module.model.norm.weight": 22.866422653198242, "gnorm/_forward_module.model.norm.weight": 0.00649062916636467, "pnorm/_forward_module.lm_head.weight": 234.8295135498047, "gnorm/_forward_module.lm_head.weight": 0.026147395372390747} +{"step": 1698693120, "pnorm/_forward_module.model.embeddings.weight": 103.63983154296875, "gnorm/_forward_module.model.embeddings.weight": 0.08289172500371933, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.813396453857422, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.003003776539117098, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.558685302734375, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.013911940157413483, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.599318027496338, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.013745193369686604, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.629131317138672, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.13471059501171112, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.617843151092529, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.20130059123039246, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.330912709236145, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.017752328887581825, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.19047978520393372, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.002492929343134165, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.798121452331543, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0073151253163814545, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.200970649719238, 
"gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.20506422221660614, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.121321678161621, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.15628722310066223, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.414316177368164, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.008878161199390888, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.676041603088379, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.038248687982559204, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.740294456481934, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.06784962862730026, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.431838035583496, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.1065603494644165, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.137962818145752, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.043197136372327805, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4741923809051514, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.006480084266513586, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18788619339466095, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.000964281614869833, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.353567123413086, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0014282825868576765, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.804049491882324, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.04804975911974907, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.961276054382324, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.04032106325030327, "pnorm/_forward_module.model.norm.weight": 22.880794525146484, "gnorm/_forward_module.model.norm.weight": 0.006505224853754044, 
"pnorm/_forward_module.lm_head.weight": 234.9394989013672, "gnorm/_forward_module.lm_head.weight": 0.02262982167303562} +{"step": 1719664640, "pnorm/_forward_module.model.embeddings.weight": 103.63652801513672, "gnorm/_forward_module.model.embeddings.weight": 0.08616653829813004, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.811971664428711, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.002701058518141508, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.558584690093994, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.013308503665030003, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.599186897277832, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.013127843849360943, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.627389907836914, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.12540312111377716, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.616451263427734, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.17013874650001526, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3311599493026733, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.014318534173071384, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.19065962731838226, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0008342101355083287, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.797104835510254, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.003616980044171214, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.199065208435059, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.13767307996749878, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.120231628417969, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.10778291523456573, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.4185733795166, 
"gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.005613071843981743, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.684854507446289, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.024209287017583847, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.746406555175781, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.0433703288435936, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.433850288391113, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.10258380323648453, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.142332553863525, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.04153333231806755, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4753644466400146, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.004316170699894428, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18791471421718597, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0002737058384809643, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.354385375976562, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0017775761662051082, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.804932594299316, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.05081931874155998, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.962381362915039, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.04282099008560181, "pnorm/_forward_module.model.norm.weight": 22.893892288208008, "gnorm/_forward_module.model.norm.weight": 0.005970904603600502, "pnorm/_forward_module.lm_head.weight": 235.0376434326172, "gnorm/_forward_module.lm_head.weight": 0.02442299760878086} +{"step": 1740636160, "pnorm/_forward_module.model.embeddings.weight": 103.63321685791016, "gnorm/_forward_module.model.embeddings.weight": 0.08034508675336838, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 
15.810554504394531, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.002530248137190938, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.558361530303955, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.01239805854856968, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.599029541015625, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.012525309808552265, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.62567663192749, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.12018518149852753, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.615057468414307, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.15995703637599945, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3314000368118286, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.01195178646594286, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.1908220499753952, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0006484670448116958, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.796159744262695, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0033453747164458036, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.197332382202148, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.12037700414657593, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.11922836303711, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.08912774175405502, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.42261505126953, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.004254516214132309, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.692900657653809, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.016869550570845604, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.751951217651367, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 
0.0318860299885273, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.435722827911377, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.09951648861169815, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.146397590637207, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.04257418215274811, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4766547679901123, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.004464464262127876, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18795162439346313, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0005283643840812147, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.355382919311523, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0015788457822054625, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.806079864501953, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.04727236554026604, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.963574409484863, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.040075283497571945, "pnorm/_forward_module.model.norm.weight": 22.905685424804688, "gnorm/_forward_module.model.norm.weight": 0.006686368957161903, "pnorm/_forward_module.lm_head.weight": 235.12477111816406, "gnorm/_forward_module.lm_head.weight": 0.022243894636631012} +{"step": 1761607680, "pnorm/_forward_module.model.embeddings.weight": 103.62994384765625, "gnorm/_forward_module.model.embeddings.weight": 0.08266036212444305, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.809195518493652, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.003152470337226987, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.558084487915039, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.013165130279958248, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.5987958908081055, 
"gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.01368713565170765, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.624079704284668, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.1293638050556183, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.613790988922119, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.1849217414855957, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.331654667854309, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.01663072407245636, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.19101576507091522, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0023895364720374346, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.795228004455566, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.005826966371387243, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.195724487304688, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.163690984249115, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.118237495422363, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.1211843490600586, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.42642593383789, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.00721492525190115, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.700177192687988, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.03144155442714691, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.756881713867188, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.056844890117645264, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.4376912117004395, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.1133141741156578, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.150362968444824, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.045510418713092804, 
"pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.477745771408081, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.0078266067430377, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18798556923866272, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0010078295599669218, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.356163024902344, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0017375126481056213, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.806928634643555, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.05109648033976555, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.964497566223145, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.042152296751737595, "pnorm/_forward_module.model.norm.weight": 22.91620445251465, "gnorm/_forward_module.model.norm.weight": 0.006398769095540047, "pnorm/_forward_module.lm_head.weight": 235.2021484375, "gnorm/_forward_module.lm_head.weight": 0.023734936490654945} +{"step": 1782579200, "pnorm/_forward_module.model.embeddings.weight": 103.62677764892578, "gnorm/_forward_module.model.embeddings.weight": 0.0847160667181015, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.807971000671387, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0026793747674673796, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.558176040649414, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.012961114756762981, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.598979949951172, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.013274489901959896, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.622526168823242, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.12791311740875244, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.612554550170898, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 
0.17203935980796814, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3315733671188354, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.013847611844539642, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.19108846783638, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.000871551688760519, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.794537544250488, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.004626357927918434, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.194501876831055, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.13932938873767853, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.11755657196045, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.1003355085849762, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.429855346679688, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.005592132918536663, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.706772804260254, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.022851044312119484, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.76142692565918, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.0442795492708683, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.43936014175415, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.09573189914226532, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.153636932373047, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.042640820145606995, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4790565967559814, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.00881385337561369, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18806280195713043, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0012303710682317615, 
"pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.356739044189453, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0012383325956761837, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.807534217834473, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.046876709908246994, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.965326309204102, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03693182393908501, "pnorm/_forward_module.model.norm.weight": 22.925498962402344, "gnorm/_forward_module.model.norm.weight": 0.00626926077529788, "pnorm/_forward_module.lm_head.weight": 235.2700653076172, "gnorm/_forward_module.lm_head.weight": 0.0240127295255661} +{"step": 1803550720, "pnorm/_forward_module.model.embeddings.weight": 103.62367248535156, "gnorm/_forward_module.model.embeddings.weight": 0.08644232153892517, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.80698013305664, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.002761433832347393, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.558069229125977, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.013077820651233196, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.598880767822266, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.012820112518966198, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.621295928955078, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.12843948602676392, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.61152458190918, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.17604312300682068, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.331707239151001, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.014287545345723629, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.19121916592121124, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 
0.000933428353164345, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.793768882751465, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.004748815204948187, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.19313907623291, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.1508481800556183, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.116776466369629, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.11394888907670975, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.432863235473633, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.006815133150666952, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.712421417236328, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.02883102372288704, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.765277862548828, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.05277755856513977, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.440796375274658, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.10800641775131226, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.156469345092773, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.04153113439679146, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4798941612243652, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.005681001581251621, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1880849152803421, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0008494788780808449, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.357341766357422, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0015076639829203486, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.808149337768555, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.04718569666147232, 
"pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.966064453125, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03755795583128929, "pnorm/_forward_module.model.norm.weight": 22.93364143371582, "gnorm/_forward_module.model.norm.weight": 0.006403029430657625, "pnorm/_forward_module.lm_head.weight": 235.32948303222656, "gnorm/_forward_module.lm_head.weight": 0.023064803332090378} +{"step": 1824522240, "pnorm/_forward_module.model.embeddings.weight": 103.62081909179688, "gnorm/_forward_module.model.embeddings.weight": 0.07803239673376083, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.806035041809082, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.002666782820597291, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.557705402374268, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.012948189862072468, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.598526954650879, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.013323846273124218, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.620242595672607, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.11938741058111191, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.610647201538086, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.1609494388103485, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3318712711334229, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.016500135883688927, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.1913725882768631, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0027964776381850243, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.793034553527832, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.002890744712203741, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.191913604736328, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 
0.11881794035434723, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.116046905517578, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.08902058750391006, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.43550682067871, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.004788435064256191, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.717662811279297, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.020493770018219948, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.76888656616211, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.03535052016377449, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.442033290863037, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.09765446931123734, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.158902168273926, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.042876023799180984, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.480487108230591, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.0042640226893126965, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18808166682720184, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00037256808718666434, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.357912063598633, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0013814476551488042, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.808700561523438, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.047342877835035324, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.96677017211914, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03658328950405121, "pnorm/_forward_module.model.norm.weight": 22.940780639648438, "gnorm/_forward_module.model.norm.weight": 0.006682043895125389, "pnorm/_forward_module.lm_head.weight": 235.38096618652344, 
"gnorm/_forward_module.lm_head.weight": 0.022811653092503548} +{"step": 1845493760, "pnorm/_forward_module.model.embeddings.weight": 103.61817932128906, "gnorm/_forward_module.model.embeddings.weight": 0.07782446593046188, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.80517578125, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.002504454692825675, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.557565689086914, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.01340517494827509, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.59840726852417, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.013442954048514366, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.619232177734375, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.11152467131614685, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.60983419418335, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.14752493798732758, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3319146633148193, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.015880784019827843, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.19147954881191254, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0012307025026530027, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.792512893676758, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.003648537676781416, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.190972328186035, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.12230085581541061, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.115557670593262, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.08983058482408524, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.43797492980957, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.00516059435904026, 
"pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.72248649597168, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.023044586181640625, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.772146224975586, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.04150168597698212, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.443068027496338, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.08055692911148071, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.161004066467285, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.03693225607275963, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.481091022491455, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.005622027907520533, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.188117116689682, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0007476043538190424, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.358348846435547, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.00130397395696491, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.809069633483887, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.04187049716711044, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.967272758483887, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.033786118030548096, "pnorm/_forward_module.model.norm.weight": 22.946928024291992, "gnorm/_forward_module.model.norm.weight": 0.006158347241580486, "pnorm/_forward_module.lm_head.weight": 235.42483520507812, "gnorm/_forward_module.lm_head.weight": 0.021438485011458397} +{"step": 1866465280, "pnorm/_forward_module.model.embeddings.weight": 103.61577606201172, "gnorm/_forward_module.model.embeddings.weight": 0.07993628829717636, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.80434799194336, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 
0.0028138342313468456, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.557397842407227, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.01365803461521864, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.598264694213867, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.013576099649071693, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.618297100067139, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.12042068690061569, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.6091084480285645, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.18978054821491241, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3319056034088135, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.016098957508802414, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.19156663119792938, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0025589519646018744, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.792078018188477, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.007882699370384216, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.190186500549316, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.20941570401191711, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.115150451660156, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.15496282279491425, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.440139770507812, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0076581635512411594, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.726560592651367, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.041278716176748276, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.774916648864746, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.07510469108819962, 
"pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.444084167480469, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.07891583442687988, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.163023471832275, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.03708941861987114, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4814817905426025, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.005036697257310152, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18812641501426697, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0007062721415422857, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.358779907226562, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.001254483824595809, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.809483528137207, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.041627656668424606, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.967809677124023, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03262992948293686, "pnorm/_forward_module.model.norm.weight": 22.952165603637695, "gnorm/_forward_module.model.norm.weight": 0.005531106609851122, "pnorm/_forward_module.lm_head.weight": 235.46200561523438, "gnorm/_forward_module.lm_head.weight": 0.020652441307902336} +{"step": 1887436800, "pnorm/_forward_module.model.embeddings.weight": 103.61371612548828, "gnorm/_forward_module.model.embeddings.weight": 0.0787525326013565, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.80378532409668, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0024498358834534883, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.557407855987549, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.012692399322986603, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.5982584953308105, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 
0.012882711365818977, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.617563724517822, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.11572905629873276, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.608509063720703, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.15476872026920319, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3319425582885742, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.014326036907732487, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.19161003828048706, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0007170020253397524, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.79173469543457, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0031240398529917, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.1895751953125, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.11988247185945511, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.114808082580566, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.08904101699590683, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.441884994506836, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0050697787664830685, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.729778289794922, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.022221332415938377, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.777134895324707, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.041143305599689484, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.444902420043945, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.0933777242898941, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.164642810821533, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.03839138522744179, 
"pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.481956958770752, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.005228027235716581, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18815241754055023, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0005156538682058454, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.35912322998047, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0014631884405389428, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.809797286987305, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.04516245424747467, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.968246459960938, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.037380848079919815, "pnorm/_forward_module.model.norm.weight": 22.95655059814453, "gnorm/_forward_module.model.norm.weight": 0.006503382232040167, "pnorm/_forward_module.lm_head.weight": 235.49302673339844, "gnorm/_forward_module.lm_head.weight": 0.022430241107940674} +{"step": 1908408320, "pnorm/_forward_module.model.embeddings.weight": 103.6119155883789, "gnorm/_forward_module.model.embeddings.weight": 0.061752982437610626, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.803231239318848, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0020764567889273167, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.557451248168945, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.011782156303524971, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.598294734954834, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.012392633594572544, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.616882801055908, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.09227524697780609, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.607944488525391, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 
0.12089045345783234, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.331900954246521, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.01159113459289074, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.1916564553976059, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0011744694784283638, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.791444778442383, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.002493108855560422, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.189059257507324, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.0896855816245079, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.114507675170898, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.06430362164974213, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.443410873413086, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.003353995969519019, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.732558250427246, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.01345506589859724, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.778997421264648, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.02587984688580036, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.445626258850098, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.06454334408044815, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.1660308837890625, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.032875481992959976, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4823720455169678, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.005053477827459574, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18816712498664856, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0005001687677577138, 
"pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.359472274780273, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.001185974688269198, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.810127258300781, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.036627646535634995, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.968609809875488, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.030427148565649986, "pnorm/_forward_module.model.norm.weight": 22.960145950317383, "gnorm/_forward_module.model.norm.weight": 0.006145712919533253, "pnorm/_forward_module.lm_head.weight": 235.5187530517578, "gnorm/_forward_module.lm_head.weight": 0.02056948095560074} +{"step": 1929379840, "pnorm/_forward_module.model.embeddings.weight": 103.6103744506836, "gnorm/_forward_module.model.embeddings.weight": 0.06788242608308792, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.802740097045898, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0021449842024594545, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.557320594787598, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.01187476608902216, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.598170280456543, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.012246665544807911, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.616305828094482, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.0958091989159584, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.607468605041504, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.1319165974855423, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3320486545562744, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.011768150143325329, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.19176477193832397, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 
0.00030450208578258753, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.791230201721191, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.003427924122661352, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.188652992248535, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.1199691891670227, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.114307403564453, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.08750760555267334, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.444684982299805, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0047916523180902, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.734820365905762, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.019402174279093742, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.780488967895508, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.03621654585003853, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.446170806884766, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.06654224544763565, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.16710090637207, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.033729538321495056, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4825797080993652, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.004324703477323055, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18817363679409027, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0005243554478511214, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.35974884033203, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.001221492770127952, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.810369491577148, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.03856181353330612, 
"pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.968941688537598, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.031818464398384094, "pnorm/_forward_module.model.norm.weight": 22.96303367614746, "gnorm/_forward_module.model.norm.weight": 0.0060455938801169395, "pnorm/_forward_module.lm_head.weight": 235.5392303466797, "gnorm/_forward_module.lm_head.weight": 0.02077570930123329} +{"step": 1950351360, "pnorm/_forward_module.model.embeddings.weight": 103.609130859375, "gnorm/_forward_module.model.embeddings.weight": 0.06362485885620117, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.802382469177246, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0020950071047991514, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.557267189025879, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.011534147895872593, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.598130702972412, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.012018506415188313, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.615850448608398, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.08980406820774078, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.6070942878723145, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.12246444821357727, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.332080364227295, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.012496226467192173, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.19180820882320404, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0007995864725671709, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.790985107421875, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0024598930031061172, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.188214302062988, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 
0.09649191051721573, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.114068031311035, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.06951024383306503, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.445756912231445, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0035883313976228237, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.73677921295166, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.01609273999929428, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.781790733337402, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.029874471947550774, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.446654319763184, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.06144121661782265, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.168004035949707, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.032119497656822205, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4828810691833496, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.005290155299007893, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1881972998380661, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0005442544352263212, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.359966278076172, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0010231242049485445, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.810540199279785, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.03668520972132683, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.9691743850708, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.028973815962672234, "pnorm/_forward_module.model.norm.weight": 22.96527862548828, "gnorm/_forward_module.model.norm.weight": 0.006038870196789503, "pnorm/_forward_module.lm_head.weight": 235.55499267578125, 
"gnorm/_forward_module.lm_head.weight": 0.020749123767018318} +{"step": 1971322880, "pnorm/_forward_module.model.embeddings.weight": 103.60818481445312, "gnorm/_forward_module.model.embeddings.weight": 0.06367079168558121, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.802074432373047, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.002049713861197233, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.557317733764648, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.011804136447608471, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.598179817199707, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.012074603699147701, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.6154704093933105, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.09126151353120804, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.606782913208008, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.12453517317771912, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.332045316696167, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.015808310359716415, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.19182562828063965, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0029649233911186457, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.790837287902832, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0029986808076500893, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.187959671020508, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.10028615593910217, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.113934516906738, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.06933270394802094, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.446596145629883, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.004000569693744183, 
"pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.738297462463379, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.014700490050017834, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.782800674438477, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.029828723520040512, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.447035789489746, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.062005415558815, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.168659687042236, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.03214764967560768, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.483037233352661, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.0042708683758974075, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18819640576839447, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00044440108467824757, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.360212326049805, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0011343928053975105, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.810802459716797, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.03723415359854698, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.969441413879395, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03177588805556297, "pnorm/_forward_module.model.norm.weight": 22.96697998046875, "gnorm/_forward_module.model.norm.weight": 0.006746912375092506, "pnorm/_forward_module.lm_head.weight": 235.56678771972656, "gnorm/_forward_module.lm_head.weight": 0.020924439653754234} +{"step": 1992294400, "pnorm/_forward_module.model.embeddings.weight": 103.60746765136719, "gnorm/_forward_module.model.embeddings.weight": 0.06185843050479889, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.801860809326172, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 
0.001964928349480033, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.557354927062988, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.011929317377507687, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.598210334777832, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.012258412316441536, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.615202903747559, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.09008197486400604, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.606560707092285, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.12035718560218811, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.332025170326233, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.013991420157253742, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.19182667136192322, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.001999249681830406, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.790685653686523, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.002769144019111991, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.187726020812988, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.09968473017215729, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.11380386352539, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.07081795483827591, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.44725227355957, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.004237509798258543, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.73953914642334, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.016942961141467094, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.7836275100708, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.032075539231300354, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 
6.447328567504883, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.0639648288488388, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.169172763824463, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.031597595661878586, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4831483364105225, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.0039054376538842916, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18819762766361237, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0004114302573725581, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.36037254333496, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0010444376384839416, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.810956001281738, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.0369117334485054, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.969614028930664, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03093593195080757, "pnorm/_forward_module.model.norm.weight": 22.96820831298828, "gnorm/_forward_module.model.norm.weight": 0.006395900622010231, "pnorm/_forward_module.lm_head.weight": 235.57528686523438, "gnorm/_forward_module.lm_head.weight": 0.02055657096207142} +{"step": 2013265920, "pnorm/_forward_module.model.embeddings.weight": 103.60697937011719, "gnorm/_forward_module.model.embeddings.weight": 0.06173189729452133, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.80172061920166, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0019464956130832434, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.557343482971191, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.01215201523154974, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.598194122314453, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.012105308473110199, 
"pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.615021228790283, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.08785746991634369, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.606417179107666, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.11702943593263626, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3320688009262085, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.012953666038811207, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.19185538589954376, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0013159217778593302, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.790581703186035, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0023759861942380667, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.187564849853516, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.0904967188835144, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.113709449768066, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.05973035469651222, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.44770622253418, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0035065789707005024, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.740375518798828, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.012835507281124592, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.784173965454102, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.026509685441851616, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.447534084320068, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.05482323095202446, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.169529914855957, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.031052442267537117, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 
2.4832701683044434, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.005124165676534176, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18820630013942719, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00047484718379564583, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.36043357849121, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.001084566698409617, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.810989379882812, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.036283109337091446, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.969686508178711, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.030498795211315155, "pnorm/_forward_module.model.norm.weight": 22.96903419494629, "gnorm/_forward_module.model.norm.weight": 0.006242914590984583, "pnorm/_forward_module.lm_head.weight": 235.58099365234375, "gnorm/_forward_module.lm_head.weight": 0.020710887387394905} +{"step": 2034237440, "pnorm/_forward_module.model.embeddings.weight": 103.60668182373047, "gnorm/_forward_module.model.embeddings.weight": 0.06604793667793274, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.801628112792969, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0020495743956416845, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.557305812835693, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.011613378301262856, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.5981526374816895, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.012156281620264053, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.614917755126953, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.09165652841329575, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.606335639953613, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.1242016851902008, 
"pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3320958614349365, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.014143591746687889, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.19187231361865997, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.001801070524379611, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.790509223937988, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0035877081099897623, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.187442779541016, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.11307763308286667, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.113639831542969, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.08118963986635208, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.447982788085938, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.00433358084410429, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.740877151489258, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.019295724108815193, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.784507751464844, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.033615726977586746, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.447664260864258, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.054559558629989624, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.169749736785889, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.03081623837351799, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.483349561691284, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.006042997352778912, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18821482360363007, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0008770317072048783, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 
16.360490798950195, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0009369998006150126, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.811039924621582, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.03558822721242905, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 10.969745635986328, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.028734931722283363, "pnorm/_forward_module.model.norm.weight": 22.96953582763672, "gnorm/_forward_module.model.norm.weight": 0.006650272291153669, "pnorm/_forward_module.lm_head.weight": 235.58450317382812, "gnorm/_forward_module.lm_head.weight": 0.020221102982759476} +{"step": 2055208960, "pnorm/_forward_module.model.embeddings.weight": 103.60652160644531, "gnorm/_forward_module.model.embeddings.weight": 0.05936156585812569, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 15.8015775680542, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0018662511138245463, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 7.557305812835693, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.012016385793685913, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 7.598154067993164, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.011950550600886345, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 5.614856719970703, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.08681917935609818, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 5.60628604888916, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.11506971716880798, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.3321036100387573, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.011788624338805676, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.19187764823436737, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0011800781358033419, 
"pnorm/_forward_module.model.layers.0.mlp_norm.weight": 15.79047966003418, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0023262440226972103, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 14.187394142150879, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.08802345395088196, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 10.113612174987793, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.05851119011640549, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 17.448129653930664, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0032797076273709536, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.741141319274902, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.013045825064182281, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 8.784686088562012, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.025647467002272606, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 6.447733402252197, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.05667996034026146, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 7.169867038726807, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.03115263767540455, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.4833855628967285, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.003922324161976576, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.18821613490581512, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00038530403980985284, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 16.36051368713379, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0009817547397688031, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 15.811050415039062, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.035792771726846695, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 
10.969770431518555, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.028753137215971947, "pnorm/_forward_module.model.norm.weight": 22.969799041748047, "gnorm/_forward_module.model.norm.weight": 0.006390242371708155, "pnorm/_forward_module.lm_head.weight": 235.5863037109375, "gnorm/_forward_module.lm_head.weight": 0.020662635564804077} diff --git a/metrics/jsonlines/resume.jsonl b/metrics/jsonlines/resume.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..92733e5e5f3807c0d022c001b1110b1f10275c5c --- /dev/null +++ b/metrics/jsonlines/resume.jsonl @@ -0,0 +1 @@ +{"step": 0, "resume/resume_step": 0} diff --git a/metrics/jsonlines/throughput.jsonl b/metrics/jsonlines/throughput.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..72685aa8c80ea19e9e30cd15b020079e99eafbd8 --- /dev/null +++ b/metrics/jsonlines/throughput.jsonl @@ -0,0 +1,98 @@ +{"step": 20971520, "throughput/token_count": 20971520, "throughput/batch_count": 10, "throughput/flop_count": 0, "throughput/total_time": 41.97735304199159, "throughput/update_time": 41.75535514205694, "throughput/token_count_per_second_total_recent": 591486.4633138517, "throughput/token_count_per_second_total_cum": 499591.29102355184, "throughput/token_count_per_second_update_recent": 594516.7768200949, "throughput/token_count_per_second_update_cum": 502247.434578206, "throughput/batch_count_per_second_total_recent": 0.282042724282194, "throughput/batch_count_per_second_total_cum": 0.23822369147470085, "throughput/batch_count_per_second_update_recent": 0.28348769036297555, "throughput/batch_count_per_second_update_cum": 0.23949023941908168, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, 
"throughput/mfu_update_cum": 0.0} +{"step": 41943040, "throughput/token_count": 41943040, "throughput/batch_count": 20, "throughput/flop_count": 0, "throughput/total_time": 77.36710998200579, "throughput/update_time": 76.98762350907782, "throughput/token_count_per_second_total_recent": 592065.3330984612, "throughput/token_count_per_second_total_cum": 542130.1120043802, "throughput/token_count_per_second_update_recent": 594895.1807018473, "throughput/token_count_per_second_update_cum": 544802.3732678848, "throughput/batch_count_per_second_total_recent": 0.2823187509052568, "throughput/batch_count_per_second_total_cum": 0.2585077819845105, "throughput/batch_count_per_second_update_recent": 0.2836681273946034, "throughput/batch_count_per_second_update_cum": 0.25978201545137636, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 62914560, "throughput/token_count": 62914560, "throughput/batch_count": 30, "throughput/flop_count": 0, "throughput/total_time": 188.23102539300453, "throughput/update_time": 112.21908622107003, "throughput/token_count_per_second_total_recent": 341356.8379230524, "throughput/token_count_per_second_total_cum": 334241.17978766625, "throughput/token_count_per_second_update_recent": 595017.4043987896, "throughput/token_count_per_second_update_cum": 560640.4589327986, "throughput/batch_count_per_second_total_recent": 0.16277162452843302, "throughput/batch_count_per_second_total_cum": 0.15937861432441056, "throughput/batch_count_per_second_update_recent": 0.28372640819491846, "throughput/batch_count_per_second_update_cum": 0.2673342032112115, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, 
"throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 83886080, "throughput/token_count": 83886080, "throughput/batch_count": 40, "throughput/flop_count": 0, "throughput/total_time": 223.6157359869685, "throughput/update_time": 147.44638093106914, "throughput/token_count_per_second_total_recent": 382999.41323640116, "throughput/token_count_per_second_total_cum": 375134.9592181141, "throughput/token_count_per_second_update_recent": 595095.0139669307, "throughput/token_count_per_second_update_cum": 568926.0019153442, "throughput/batch_count_per_second_total_recent": 0.18262835180110987, "throughput/batch_count_per_second_total_cum": 0.1788782878962107, "throughput/batch_count_per_second_update_recent": 0.2837634153208402, "throughput/batch_count_per_second_update_cum": 0.27128505798117836, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 104857600, "throughput/token_count": 104857600, "throughput/batch_count": 50, "throughput/flop_count": 0, "throughput/total_time": 333.5220514299581, "throughput/update_time": 182.68076597800246, "throughput/token_count_per_second_total_recent": 317696.51124845445, "throughput/token_count_per_second_total_cum": 314394.80403298256, "throughput/token_count_per_second_update_recent": 595116.5178628893, "throughput/token_count_per_second_update_cum": 573993.6519240699, "throughput/batch_count_per_second_total_recent": 0.15148950159476016, "throughput/batch_count_per_second_total_cum": 0.1499151249089158, 
"throughput/batch_count_per_second_update_recent": 0.2837736691774794, "throughput/batch_count_per_second_update_cum": 0.27370150181010716, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 125829120, "throughput/token_count": 125829120, "throughput/batch_count": 60, "throughput/flop_count": 0, "throughput/total_time": 368.9031989739742, "throughput/update_time": 217.90960857702885, "throughput/token_count_per_second_total_recent": 344814.90359686426, "throughput/token_count_per_second_total_cum": 341089.8044526774, "throughput/token_count_per_second_update_recent": 595146.5987387104, "throughput/token_count_per_second_update_cum": 577437.2264797157, "throughput/batch_count_per_second_total_recent": 0.16442055873721326, "throughput/batch_count_per_second_total_cum": 0.1626442930472743, "throughput/batch_count_per_second_update_recent": 0.2837880128568222, "throughput/batch_count_per_second_update_cum": 0.2753435261152819, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 146800640, "throughput/token_count": 146800640, "throughput/batch_count": 70, "throughput/flop_count": 0, "throughput/total_time": 478.86176590196555, "throughput/update_time": 253.13642803102266, "throughput/token_count_per_second_total_recent": 308671.4896522468, "throughput/token_count_per_second_total_cum": 306561.62269186805, "throughput/token_count_per_second_update_recent": 
595172.9149378286, "throughput/token_count_per_second_update_cum": 579926.9632658684, "throughput/batch_count_per_second_total_recent": 0.14718603594410268, "throughput/batch_count_per_second_total_cum": 0.146179972978529, "throughput/batch_count_per_second_update_recent": 0.2838005613984244, "throughput/batch_count_per_second_update_cum": 0.27653072512906474, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 167772160, "throughput/token_count": 167772160, "throughput/batch_count": 80, "throughput/flop_count": 0, "throughput/total_time": 514.2439901359612, "throughput/update_time": 288.3617576470133, "throughput/token_count_per_second_total_recent": 328605.0490856572, "throughput/token_count_per_second_total_cum": 326250.1132111289, "throughput/token_count_per_second_update_recent": 595195.7559858554, "throughput/token_count_per_second_update_cum": 581811.4072025171, "throughput/batch_count_per_second_total_recent": 0.1566910977772032, "throughput/batch_count_per_second_total_cum": 0.15556817684704252, "throughput/batch_count_per_second_update_recent": 0.2838114528588559, "throughput/batch_count_per_second_update_cum": 0.27742929802060945, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 188743680, "throughput/token_count": 188743680, "throughput/batch_count": 90, "throughput/flop_count": 0, "throughput/total_time": 624.0890151349595, 
"throughput/update_time": 323.589894320874, "throughput/token_count_per_second_total_recent": 303973.8197501924, "throughput/token_count_per_second_total_cum": 302430.70367002714, "throughput/token_count_per_second_update_recent": 595208.1373121269, "throughput/token_count_per_second_update_cum": 583280.5143563614, "throughput/batch_count_per_second_total_recent": 0.1449460123778307, "throughput/batch_count_per_second_total_cum": 0.14421019729138715, "throughput/batch_count_per_second_update_recent": 0.28381735673529, "throughput/batch_count_per_second_update_cum": 0.2781298229009444, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 209715200, "throughput/token_count": 209715200, "throughput/batch_count": 100, "throughput/flop_count": 0, "throughput/total_time": 659.4609651200008, "throughput/update_time": 358.814332414011, "throughput/token_count_per_second_total_recent": 319710.61193747685, "throughput/token_count_per_second_total_cum": 318010.02802620555, "throughput/token_count_per_second_update_recent": 595224.3291449603, "throughput/token_count_per_second_update_cum": 584467.1771862897, "throughput/batch_count_per_second_total_recent": 0.15244989964364855, "throughput/batch_count_per_second_total_cum": 0.151638998044112, "throughput/batch_count_per_second_update_recent": 0.28382507760284437, "throughput/batch_count_per_second_update_cum": 0.2786956678325127, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, 
"throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 230686720, "throughput/token_count": 230686720, "throughput/batch_count": 110, "throughput/flop_count": 0, "throughput/total_time": 770.1425092020072, "throughput/update_time": 394.0443641850143, "throughput/token_count_per_second_total_recent": 286519.1112306368, "throughput/token_count_per_second_total_cum": 299537.7053514796, "throughput/token_count_per_second_update_recent": 595292.2396901635, "throughput/token_count_per_second_update_cum": 585433.3698620963, "throughput/batch_count_per_second_total_recent": 0.13662295877010192, "throughput/batch_count_per_second_total_cum": 0.1428307081944845, "throughput/batch_count_per_second_update_recent": 0.2838574598742311, "throughput/batch_count_per_second_update_cum": 0.279156384402321, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 251658240, "throughput/token_count": 251658240, "throughput/batch_count": 120, "throughput/flop_count": 0, "throughput/total_time": 805.5059107720153, "throughput/update_time": 429.2700012290734, "throughput/token_count_per_second_total_recent": 319846.01833327446, "throughput/token_count_per_second_total_cum": 312422.58639518236, "throughput/token_count_per_second_update_recent": 595304.2528225371, "throughput/token_count_per_second_update_cum": 586246.975748269, "throughput/batch_count_per_second_total_recent": 0.15251446644462321, "throughput/batch_count_per_second_total_cum": 0.14897469825514906, "throughput/batch_count_per_second_update_recent": 0.28386318818213324, "throughput/batch_count_per_second_update_cum": 0.27954434192098093, "throughput/flop_count_per_second_total_recent": 
0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 272629760, "throughput/token_count": 272629760, "throughput/batch_count": 130, "throughput/flop_count": 0, "throughput/total_time": 916.1022540619597, "throughput/update_time": 464.49939665506827, "throughput/token_count_per_second_total_recent": 286634.7610557263, "throughput/token_count_per_second_total_cum": 297597.5212277569, "throughput/token_count_per_second_update_recent": 595306.6901521117, "throughput/token_count_per_second_update_cum": 586932.4308346768, "throughput/batch_count_per_second_total_recent": 0.1366781049040443, "throughput/batch_count_per_second_total_cum": 0.14190555631053778, "throughput/batch_count_per_second_update_recent": 0.2838643503914412, "throughput/batch_count_per_second_update_cum": 0.2798711923764595, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 293601280, "throughput/token_count": 293601280, "throughput/batch_count": 140, "throughput/flop_count": 0, "throughput/total_time": 951.4780202080146, "throughput/update_time": 499.73381894716294, "throughput/token_count_per_second_total_recent": 319513.5223273322, "throughput/token_count_per_second_total_cum": 308573.8963636933, "throughput/token_count_per_second_update_recent": 595296.4823191583, "throughput/token_count_per_second_update_cum": 587515.330898673, "throughput/batch_count_per_second_total_recent": 0.1523559199940358, 
"throughput/batch_count_per_second_total_cum": 0.14713949983772912, "throughput/batch_count_per_second_update_recent": 0.2838594829173843, "throughput/batch_count_per_second_update_cum": 0.2801491407864919, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 314572800, "throughput/token_count": 314572800, "throughput/batch_count": 150, "throughput/flop_count": 0, "throughput/total_time": 1061.4905991089763, "throughput/update_time": 534.9619731742423, "throughput/token_count_per_second_total_recent": 286597.03462072957, "throughput/token_count_per_second_total_cum": 296350.05742307554, "throughput/token_count_per_second_update_recent": 595303.9863618565, "throughput/token_count_per_second_update_cum": 588028.338039535, "throughput/batch_count_per_second_total_recent": 0.13666011553799132, "throughput/batch_count_per_second_total_cum": 0.14131071921495225, "throughput/batch_count_per_second_update_recent": 0.2838630611237795, "throughput/batch_count_per_second_update_cum": 0.2803937616536784, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 335544320, "throughput/token_count": 335544320, "throughput/batch_count": 160, "throughput/flop_count": 0, "throughput/total_time": 1096.8764924500138, "throughput/update_time": 570.1910730601521, "throughput/token_count_per_second_total_recent": 319485.02441097674, "throughput/token_count_per_second_total_cum": 
305908.93533557176, "throughput/token_count_per_second_update_recent": 595305.5521538941, "throughput/token_count_per_second_update_cum": 588476.9787769053, "throughput/batch_count_per_second_total_recent": 0.15234233112858617, "throughput/batch_count_per_second_total_cum": 0.14586874739435757, "throughput/batch_count_per_second_update_recent": 0.2838638077516051, "throughput/batch_count_per_second_update_cum": 0.28060769022794024, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 356515840, "throughput/token_count": 356515840, "throughput/batch_count": 170, "throughput/flop_count": 0, "throughput/total_time": 1206.983663285966, "throughput/update_time": 605.4215191720868, "throughput/token_count_per_second_total_recent": 286535.92763456627, "throughput/token_count_per_second_total_cum": 295377.51905390294, "throughput/token_count_per_second_update_recent": 595296.7634330908, "throughput/token_count_per_second_update_cum": 588872.0977205022, "throughput/batch_count_per_second_total_recent": 0.13663097745636285, "throughput/batch_count_per_second_total_cum": 0.1408469767827525, "throughput/batch_count_per_second_update_recent": 0.283859616962953, "throughput/batch_count_per_second_update_cum": 0.2807960976221572, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 377487360, "throughput/token_count": 377487360, "throughput/batch_count": 180, 
"throughput/flop_count": 0, "throughput/total_time": 1242.364284639014, "throughput/update_time": 640.6497024221462, "throughput/token_count_per_second_total_recent": 319360.9599071597, "throughput/token_count_per_second_total_cum": 303845.9529683632, "throughput/token_count_per_second_update_recent": 595294.0372351639, "throughput/token_count_per_second_update_cum": 589225.8414743796, "throughput/batch_count_per_second_total_recent": 0.15228317256315216, "throughput/batch_count_per_second_total_cum": 0.14488504074495467, "throughput/batch_count_per_second_update_recent": 0.2838583170104808, "throughput/batch_count_per_second_update_cum": 0.2809647757884882, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 398458880, "throughput/token_count": 398458880, "throughput/batch_count": 190, "throughput/flop_count": 0, "throughput/total_time": 1352.8268752049771, "throughput/update_time": 675.885561926174, "throughput/token_count_per_second_total_recent": 286291.56150809745, "throughput/token_count_per_second_total_cum": 294537.96882888395, "throughput/token_count_per_second_update_recent": 595278.4021794002, "throughput/token_count_per_second_update_cum": 589536.013854847, "throughput/batch_count_per_second_total_recent": 0.13651445460705636, "throughput/batch_count_per_second_total_cum": 0.14044664803928564, "throughput/batch_count_per_second_update_recent": 0.2838508616349221, "throughput/batch_count_per_second_update_cum": 0.2811126775049434, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 
0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 419430400, "throughput/token_count": 419430400, "throughput/batch_count": 200, "throughput/flop_count": 0, "throughput/total_time": 1388.2071847780026, "throughput/update_time": 711.1109357241658, "throughput/token_count_per_second_total_recent": 319462.9790955568, "throughput/token_count_per_second_total_cum": 302138.1855670729, "throughput/token_count_per_second_update_recent": 595280.2158439511, "throughput/token_count_per_second_update_cum": 589824.1454729838, "throughput/batch_count_per_second_total_recent": 0.15233181910302962, "throughput/batch_count_per_second_total_cum": 0.14407071379045147, "throughput/batch_count_per_second_update_recent": 0.2838517264575725, "throughput/batch_count_per_second_update_cum": 0.2812500693669242, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 440401920, "throughput/token_count": 440401920, "throughput/batch_count": 210, "throughput/flop_count": 0, "throughput/total_time": 1498.7065829180065, "throughput/update_time": 746.3382132861298, "throughput/token_count_per_second_total_recent": 286360.79841749644, "throughput/token_count_per_second_total_cum": 293854.6644283968, "throughput/token_count_per_second_update_recent": 595281.7891878844, "throughput/token_count_per_second_update_cum": 590083.5735864425, "throughput/batch_count_per_second_total_recent": 0.13654746933817694, "throughput/batch_count_per_second_total_cum": 0.14012082311076965, "throughput/batch_count_per_second_update_recent": 0.28385247668642255, "throughput/batch_count_per_second_update_cum": 0.28137377433130384, 
"throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 461373440, "throughput/token_count": 461373440, "throughput/batch_count": 220, "throughput/flop_count": 0, "throughput/total_time": 1534.1124866969767, "throughput/update_time": 781.5830888972268, "throughput/token_count_per_second_total_recent": 319487.13914319506, "throughput/token_count_per_second_total_cum": 300742.9011893129, "throughput/token_count_per_second_update_recent": 595250.3004978693, "throughput/token_count_per_second_update_cum": 590306.323862475, "throughput/batch_count_per_second_total_recent": 0.1523433395114875, "throughput/batch_count_per_second_total_cum": 0.1434053903528752, "throughput/batch_count_per_second_update_recent": 0.2838374617089602, "throughput/batch_count_per_second_update_cum": 0.28147998993991613, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 482344960, "throughput/token_count": 482344960, "throughput/batch_count": 230, "throughput/flop_count": 0, "throughput/total_time": 1645.136862017971, "throughput/update_time": 816.84872851416, "throughput/token_count_per_second_total_recent": 286176.3658199966, "throughput/token_count_per_second_total_cum": 293194.4272456105, "throughput/token_count_per_second_update_recent": 595194.3755195029, "throughput/token_count_per_second_update_cum": 590494.8409204001, "throughput/batch_count_per_second_total_recent": 
0.13645952502250508, "throughput/batch_count_per_second_total_cum": 0.1398059974887898, "throughput/batch_count_per_second_update_recent": 0.2838107946012034, "throughput/batch_count_per_second_update_cum": 0.28156988187808996, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 503316480, "throughput/token_count": 503316480, "throughput/batch_count": 240, "throughput/flop_count": 0, "throughput/total_time": 1680.500316324993, "throughput/update_time": 852.0965187721886, "throughput/token_count_per_second_total_recent": 318991.8799716411, "throughput/token_count_per_second_total_cum": 299503.94838406163, "throughput/token_count_per_second_update_recent": 595165.2237325924, "throughput/token_count_per_second_update_cum": 590680.127088471, "throughput/batch_count_per_second_total_recent": 0.1521071815355497, "throughput/batch_count_per_second_total_cum": 0.14281461161807138, "throughput/batch_count_per_second_update_recent": 0.28379689394597646, "throughput/batch_count_per_second_update_cum": 0.2816582332079272, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 524288000, "throughput/token_count": 524288000, "throughput/batch_count": 250, "throughput/flop_count": 0, "throughput/total_time": 1792.9284177150112, "throughput/update_time": 887.3467938142712, "throughput/token_count_per_second_total_recent": 285233.107720737, 
"throughput/token_count_per_second_total_cum": 292419.9286596039, "throughput/token_count_per_second_update_recent": 595127.3214809344, "throughput/token_count_per_second_update_cum": 590849.0385662426, "throughput/batch_count_per_second_total_recent": 0.13600974451100206, "throughput/batch_count_per_second_total_cum": 0.13943668778400606, "throughput/batch_count_per_second_update_recent": 0.28377882074400634, "throughput/batch_count_per_second_update_cum": 0.2817387764769757, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 545259520, "throughput/token_count": 545259520, "throughput/batch_count": 260, "throughput/flop_count": 0, "throughput/total_time": 1828.322631730989, "throughput/update_time": 922.6015984143014, "throughput/token_count_per_second_total_recent": 317860.7811650038, "throughput/token_count_per_second_total_cum": 298229.37731934554, "throughput/token_count_per_second_update_recent": 595084.2355823057, "throughput/token_count_per_second_update_cum": 591002.1410510791, "throughput/batch_count_per_second_total_recent": 0.1515678315949458, "throughput/batch_count_per_second_total_cum": 0.14220684877364423, "throughput/batch_count_per_second_update_recent": 0.2837582757865456, "throughput/batch_count_per_second_update_cum": 0.281811781430759, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 566231040, "throughput/token_count": 566231040, 
"throughput/batch_count": 270, "throughput/flop_count": 0, "throughput/total_time": 1940.5543955019675, "throughput/update_time": 957.8616747342749, "throughput/token_count_per_second_total_recent": 284395.9741555774, "throughput/token_count_per_second_total_cum": 291788.28550875623, "throughput/token_count_per_second_update_recent": 595032.7344551204, "throughput/token_count_per_second_update_cum": 591140.7199344111, "throughput/batch_count_per_second_total_recent": 0.13561056812075492, "throughput/batch_count_per_second_total_cum": 0.1391354968589574, "throughput/batch_count_per_second_update_recent": 0.2837337181354143, "throughput/batch_count_per_second_update_cum": 0.281877860991674, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 587202560, "throughput/token_count": 587202560, "throughput/batch_count": 280, "throughput/flop_count": 0, "throughput/total_time": 1975.955018882989, "throughput/update_time": 993.1225579883321, "throughput/token_count_per_second_total_recent": 316987.08283694665, "throughput/token_count_per_second_total_cum": 297174.0522372552, "throughput/token_count_per_second_update_recent": 594981.4261858589, "throughput/token_count_per_second_update_cum": 591268.9781102514, "throughput/batch_count_per_second_total_recent": 0.15115121976706822, "throughput/batch_count_per_second_total_cum": 0.1417036305605198, "throughput/batch_count_per_second_update_recent": 0.28370925244610734, "throughput/batch_count_per_second_update_cum": 0.281939019255758, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, 
"throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 608174080, "throughput/token_count": 608174080, "throughput/batch_count": 290, "throughput/flop_count": 0, "throughput/total_time": 2087.7305572130135, "throughput/update_time": 1028.3721907203435, "throughput/token_count_per_second_total_recent": 283879.05885318376, "throughput/token_count_per_second_total_cum": 291308.70259995305, "throughput/token_count_per_second_update_recent": 594952.7761751487, "throughput/token_count_per_second_update_cum": 591394.910799749, "throughput/batch_count_per_second_total_recent": 0.13536408369692982, "throughput/batch_count_per_second_total_cum": 0.1389068139076009, "throughput/batch_count_per_second_update_recent": 0.2836955910564178, "throughput/batch_count_per_second_update_cum": 0.2819990686415429, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 629145600, "throughput/token_count": 629145600, "throughput/batch_count": 300, "throughput/flop_count": 0, "throughput/total_time": 2123.1104451339925, "throughput/update_time": 1063.6117942532874, "throughput/token_count_per_second_total_recent": 316371.0048831023, "throughput/token_count_per_second_total_cum": 296332.0167549238, "throughput/token_count_per_second_update_recent": 594929.0460062356, "throughput/token_count_per_second_update_cum": 591518.0739808306, "throughput/batch_count_per_second_total_recent": 0.1508574509063255, "throughput/batch_count_per_second_total_cum": 0.1413021167540187, "throughput/batch_count_per_second_update_recent": 0.2836842756301096, 
"throughput/batch_count_per_second_update_cum": 0.2820577974228051, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 650117120, "throughput/token_count": 650117120, "throughput/batch_count": 310, "throughput/flop_count": 0, "throughput/total_time": 2233.8338464839617, "throughput/update_time": 1098.8516714693396, "throughput/token_count_per_second_total_recent": 283793.7978348482, "throughput/token_count_per_second_total_cum": 291031.9946236286, "throughput/token_count_per_second_update_recent": 594906.4754702086, "throughput/token_count_per_second_update_cum": 591633.1902473151, "throughput/batch_count_per_second_total_recent": 0.1353234280752412, "throughput/batch_count_per_second_total_cum": 0.13877486926251822, "throughput/batch_count_per_second_update_recent": 0.28367351315985134, "throughput/batch_count_per_second_update_cum": 0.28211268913617854, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 671088640, "throughput/token_count": 671088640, "throughput/batch_count": 320, "throughput/flop_count": 0, "throughput/total_time": 2269.22137821099, "throughput/update_time": 1134.096047840314, "throughput/token_count_per_second_total_recent": 316521.28867066826, "throughput/token_count_per_second_total_cum": 295735.2008242903, "throughput/token_count_per_second_update_recent": 594911.1034106904, "throughput/token_count_per_second_update_cum": 
591738.8049080764, "throughput/batch_count_per_second_total_recent": 0.15092911180051244, "throughput/batch_count_per_second_total_cum": 0.14101753274168505, "throughput/batch_count_per_second_update_recent": 0.28367571993383905, "throughput/batch_count_per_second_update_cum": 0.28216305013088055, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 692060160, "throughput/token_count": 692060160, "throughput/batch_count": 330, "throughput/flop_count": 0, "throughput/total_time": 2379.2475059829885, "throughput/update_time": 1169.336060541391, "throughput/token_count_per_second_total_recent": 284187.0479358176, "throughput/token_count_per_second_total_cum": 290873.5464720282, "throughput/token_count_per_second_update_recent": 594954.7488372319, "throughput/token_count_per_second_update_cum": 591840.2616263994, "throughput/batch_count_per_second_total_recent": 0.13551094433585054, "throughput/batch_count_per_second_total_cum": 0.13869931529618654, "throughput/batch_count_per_second_update_recent": 0.28369653169499964, "throughput/batch_count_per_second_update_cum": 0.2822114284641263, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 713031680, "throughput/token_count": 713031680, "throughput/batch_count": 340, "throughput/flop_count": 0, "throughput/total_time": 2414.640490865975, "throughput/update_time": 1204.5815564935328, 
"throughput/token_count_per_second_total_recent": 317673.7988759598, "throughput/token_count_per_second_total_cum": 295295.17238579964, "throughput/token_count_per_second_update_recent": 594959.0403712426, "throughput/token_count_per_second_update_cum": 591933.0875989783, "throughput/batch_count_per_second_total_recent": 0.15147867149160377, "throughput/batch_count_per_second_total_cum": 0.1408077108315466, "throughput/batch_count_per_second_update_recent": 0.2836985780578816, "throughput/batch_count_per_second_update_cum": 0.28225569133709827, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 734003200, "throughput/token_count": 734003200, "throughput/batch_count": 350, "throughput/flop_count": 0, "throughput/total_time": 2525.280265790003, "throughput/update_time": 1239.831252818578, "throughput/token_count_per_second_total_recent": 284873.4167550294, "throughput/token_count_per_second_total_cum": 290662.0742036235, "throughput/token_count_per_second_update_recent": 594960.0426968696, "throughput/token_count_per_second_update_cum": 592018.6302219349, "throughput/batch_count_per_second_total_recent": 0.13583823049308272, "throughput/batch_count_per_second_total_cum": 0.13859847746068168, "throughput/batch_count_per_second_update_recent": 0.2836990560039852, "throughput/batch_count_per_second_update_cum": 0.2822964812383341, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, 
"throughput/mfu_update_cum": 0.0} +{"step": 754974720, "throughput/token_count": 754974720, "throughput/batch_count": 360, "throughput/flop_count": 0, "throughput/total_time": 2560.696846612962, "throughput/update_time": 1275.0923811426037, "throughput/token_count_per_second_total_recent": 318435.27043063816, "throughput/token_count_per_second_total_cum": 294831.74511602434, "throughput/token_count_per_second_update_recent": 594955.699911866, "throughput/token_count_per_second_update_cum": 592094.1346410298, "throughput/batch_count_per_second_total_recent": 0.1518417694237891, "throughput/batch_count_per_second_total_cum": 0.14058673148919312, "throughput/batch_count_per_second_update_recent": 0.2836969852027254, "throughput/batch_count_per_second_update_cum": 0.2823324845509671, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 775946240, "throughput/token_count": 775946240, "throughput/batch_count": 370, "throughput/flop_count": 0, "throughput/total_time": 2670.3778264659923, "throughput/update_time": 1310.3158643786446, "throughput/token_count_per_second_total_recent": 285864.8917642585, "throughput/token_count_per_second_total_cum": 290575.45052600134, "throughput/token_count_per_second_update_recent": 595011.9416942113, "throughput/token_count_per_second_update_cum": 592182.5882555088, "throughput/batch_count_per_second_total_recent": 0.13631100261891294, "throughput/batch_count_per_second_total_cum": 0.13855717207241122, "throughput/batch_count_per_second_update_recent": 0.28372380337439124, "throughput/batch_count_per_second_update_cum": 0.2823746625211281, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, 
"throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 796917760, "throughput/token_count": 796917760, "throughput/batch_count": 380, "throughput/flop_count": 0, "throughput/total_time": 2705.7711879389826, "throughput/update_time": 1345.540644572582, "throughput/token_count_per_second_total_recent": 319466.0937625695, "throughput/token_count_per_second_total_cum": 294525.1851125747, "throughput/token_count_per_second_update_recent": 595072.7919875617, "throughput/token_count_per_second_update_cum": 592265.8399168202, "throughput/batch_count_per_second_total_recent": 0.1523333042919967, "throughput/batch_count_per_second_total_cum": 0.14044055228832947, "throughput/batch_count_per_second_update_recent": 0.28375281905534827, "throughput/batch_count_per_second_update_cum": 0.2824143600067235, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 817889280, "throughput/token_count": 817889280, "throughput/batch_count": 390, "throughput/flop_count": 0, "throughput/total_time": 2815.635458876961, "throughput/update_time": 1380.7573516704724, "throughput/token_count_per_second_total_recent": 286621.67400053504, "throughput/token_count_per_second_total_cum": 290481.2401837778, "throughput/token_count_per_second_update_recent": 595128.9088324043, "throughput/token_count_per_second_update_cum": 592348.3072608656, "throughput/batch_count_per_second_total_recent": 0.13667186450983765, "throughput/batch_count_per_second_total_cum": 0.138512249080552, 
"throughput/batch_count_per_second_update_recent": 0.2837795776521703, "throughput/batch_count_per_second_update_cum": 0.28245368350070266, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 838860800, "throughput/token_count": 838860800, "throughput/batch_count": 400, "throughput/flop_count": 0, "throughput/total_time": 2851.024827848014, "throughput/update_time": 1415.9805577184306, "throughput/token_count_per_second_total_recent": 319887.6747051427, "throughput/token_count_per_second_total_cum": 294231.32054349093, "throughput/token_count_per_second_update_recent": 595155.691496797, "throughput/token_count_per_second_update_cum": 592423.953441604, "throughput/batch_count_per_second_total_recent": 0.1525343297506059, "throughput/batch_count_per_second_total_cum": 0.14030042674231097, "throughput/batch_count_per_second_update_recent": 0.28379234862174846, "throughput/batch_count_per_second_update_cum": 0.2824897544105549, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 859832320, "throughput/token_count": 859832320, "throughput/batch_count": 410, "throughput/flop_count": 0, "throughput/total_time": 2961.242350918008, "throughput/update_time": 1451.2064463463612, "throughput/token_count_per_second_total_recent": 286820.189007782, "throughput/token_count_per_second_total_cum": 290362.0231331101, "throughput/token_count_per_second_update_recent": 
595182.4147182646, "throughput/token_count_per_second_update_cum": 592494.832258196, "throughput/batch_count_per_second_total_recent": 0.13676652384175397, "throughput/batch_count_per_second_total_cum": 0.13845540196090225, "throughput/batch_count_per_second_update_recent": 0.2838050912467311, "throughput/batch_count_per_second_update_cum": 0.2825235520640354, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 880803840, "throughput/token_count": 880803840, "throughput/batch_count": 420, "throughput/flop_count": 0, "throughput/total_time": 2996.6310340749915, "throughput/update_time": 1486.4294549234328, "throughput/token_count_per_second_total_recent": 319788.3268183603, "throughput/token_count_per_second_total_cum": 293931.36158048536, "throughput/token_count_per_second_update_recent": 595215.5289590794, "throughput/token_count_per_second_update_cum": 592563.4997897502, "throughput/batch_count_per_second_total_recent": 0.15248695698659911, "throughput/batch_count_per_second_total_cum": 0.14015739516281384, "throughput/batch_count_per_second_update_recent": 0.2838208813472173, "throughput/batch_count_per_second_update_cum": 0.28255629529464255, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 901775360, "throughput/token_count": 901775360, "throughput/batch_count": 430, "throughput/flop_count": 0, "throughput/total_time": 3107.195180976996, 
"throughput/update_time": 1521.6597315414692, "throughput/token_count_per_second_total_recent": 286605.1593876383, "throughput/token_count_per_second_total_cum": 290221.66535301285, "throughput/token_count_per_second_update_recent": 595233.4996634347, "throughput/token_count_per_second_update_cum": 592626.1576801307, "throughput/batch_count_per_second_total_recent": 0.13666398972875515, "throughput/batch_count_per_second_total_cum": 0.138388474155909, "throughput/batch_count_per_second_update_recent": 0.28382945044681296, "throughput/batch_count_per_second_update_cum": 0.2825861729050306, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 922746880, "throughput/token_count": 922746880, "throughput/batch_count": 440, "throughput/flop_count": 0, "throughput/total_time": 3142.596222635999, "throughput/update_time": 1556.8869810854085, "throughput/token_count_per_second_total_recent": 319833.5394862985, "throughput/token_count_per_second_total_cum": 293625.6568226901, "throughput/token_count_per_second_update_recent": 595264.0730032647, "throughput/token_count_per_second_update_cum": 592687.1322134715, "throughput/batch_count_per_second_total_recent": 0.15250851606669355, "throughput/batch_count_per_second_total_cum": 0.14001162377485757, "throughput/batch_count_per_second_update_recent": 0.2838440289512943, "throughput/batch_count_per_second_update_cum": 0.28261524782823155, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, 
"throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 943718400, "throughput/token_count": 943718400, "throughput/batch_count": 450, "throughput/flop_count": 0, "throughput/total_time": 3252.3157649660134, "throughput/update_time": 1592.1117133093649, "throughput/token_count_per_second_total_recent": 286965.71797701943, "throughput/token_count_per_second_total_cum": 290168.1350149781, "throughput/token_count_per_second_update_recent": 595310.1057275194, "throughput/token_count_per_second_update_cum": 592746.3456935356, "throughput/batch_count_per_second_total_recent": 0.13683591746188137, "throughput/batch_count_per_second_total_cum": 0.13836294890164286, "throughput/batch_count_per_second_update_recent": 0.2838659790647122, "throughput/batch_count_per_second_update_cum": 0.2826434830157927, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 964689920, "throughput/token_count": 964689920, "throughput/batch_count": 460, "throughput/flop_count": 0, "throughput/total_time": 3287.7011176730157, "throughput/update_time": 1627.3387761343038, "throughput/token_count_per_second_total_recent": 319821.5098093481, "throughput/token_count_per_second_total_cum": 293423.8501226026, "throughput/token_count_per_second_update_recent": 595364.803504357, "throughput/token_count_per_second_update_cum": 592802.1467611023, "throughput/batch_count_per_second_total_recent": 0.1525027798697224, "throughput/batch_count_per_second_total_cum": 0.1399153948414815, "throughput/batch_count_per_second_update_recent": 0.2838920609971795, "throughput/batch_count_per_second_update_cum": 0.28267009103827584, 
"throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 985661440, "throughput/token_count": 985661440, "throughput/batch_count": 470, "throughput/flop_count": 0, "throughput/total_time": 3397.5104132720153, "throughput/update_time": 1662.5627456933726, "throughput/token_count_per_second_total_recent": 286927.90893671697, "throughput/token_count_per_second_total_cum": 290112.853267386, "throughput/token_count_per_second_update_recent": 595362.1753914726, "throughput/token_count_per_second_update_cum": 592856.6861931755, "throughput/batch_count_per_second_total_recent": 0.13681788870654915, "throughput/batch_count_per_second_total_cum": 0.13833658851022052, "throughput/batch_count_per_second_update_recent": 0.2838908078153003, "throughput/batch_count_per_second_update_cum": 0.28269609746607566, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1006632960, "throughput/token_count": 1006632960, "throughput/batch_count": 480, "throughput/flop_count": 0, "throughput/total_time": 3432.890017031983, "throughput/update_time": 1697.7829640183481, "throughput/token_count_per_second_total_recent": 319855.0313718802, "throughput/token_count_per_second_total_cum": 293231.9284933915, "throughput/token_count_per_second_update_recent": 595369.8346379318, "throughput/token_count_per_second_update_cum": 592910.2725930763, "throughput/batch_count_per_second_total_recent": 
0.15251876419633875, "throughput/batch_count_per_second_total_cum": 0.13982387947721076, "throughput/batch_count_per_second_update_recent": 0.28389446002861585, "throughput/batch_count_per_second_update_cum": 0.2827216494527227, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1027604480, "throughput/token_count": 1027604480, "throughput/batch_count": 490, "throughput/flop_count": 0, "throughput/total_time": 3542.651402093994, "throughput/update_time": 1733.0092777723912, "throughput/token_count_per_second_total_recent": 286974.5471350274, "throughput/token_count_per_second_total_cum": 290066.4963514622, "throughput/token_count_per_second_update_recent": 595352.5996938106, "throughput/token_count_per_second_update_cum": 592959.595300541, "throughput/batch_count_per_second_total_recent": 0.13684012753249522, "throughput/batch_count_per_second_total_cum": 0.13831448381016836, "throughput/batch_count_per_second_update_recent": 0.2838862417668393, "throughput/batch_count_per_second_update_cum": 0.28274516835238506, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1048576000, "throughput/token_count": 1048576000, "throughput/batch_count": 500, "throughput/flop_count": 0, "throughput/total_time": 3578.0405803910107, "throughput/update_time": 1768.2423270724248, "throughput/token_count_per_second_total_recent": 320079.251584883, 
"throughput/token_count_per_second_total_cum": 293058.72206888464, "throughput/token_count_per_second_update_recent": 595337.6213132192, "throughput/token_count_per_second_update_cum": 593004.6939528169, "throughput/batch_count_per_second_total_recent": 0.15262568072551871, "throughput/batch_count_per_second_total_cum": 0.1397412882179664, "throughput/batch_count_per_second_update_recent": 0.2838790995184036, "throughput/batch_count_per_second_update_cum": 0.28276667306557507, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1069547520, "throughput/token_count": 1069547520, "throughput/batch_count": 510, "throughput/flop_count": 0, "throughput/total_time": 3688.537352617015, "throughput/update_time": 1803.4673421693733, "throughput/token_count_per_second_total_recent": 286864.35444967396, "throughput/token_count_per_second_total_cum": 289965.2132412748, "throughput/token_count_per_second_update_recent": 595339.235934773, "throughput/token_count_per_second_update_cum": 593050.6724415934, "throughput/batch_count_per_second_total_recent": 0.1367875835655565, "throughput/batch_count_per_second_total_cum": 0.13826618825973264, "throughput/batch_count_per_second_update_recent": 0.2838798694299569, "throughput/batch_count_per_second_update_cum": 0.2827885973175017, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1090519040, "throughput/token_count": 
1090519040, "throughput/batch_count": 520, "throughput/flop_count": 0, "throughput/total_time": 3723.92296086601, "throughput/update_time": 1838.693349173409, "throughput/token_count_per_second_total_recent": 320109.7331048894, "throughput/token_count_per_second_total_cum": 292841.4608626588, "throughput/token_count_per_second_update_recent": 595333.0535020924, "throughput/token_count_per_second_update_cum": 593094.5692984894, "throughput/batch_count_per_second_total_recent": 0.15264021544689627, "throughput/batch_count_per_second_total_cum": 0.13963769000180187, "throughput/batch_count_per_second_update_recent": 0.2838769214163267, "throughput/batch_count_per_second_update_cum": 0.28280952896999806, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1111490560, "throughput/token_count": 1111490560, "throughput/batch_count": 530, "throughput/flop_count": 0, "throughput/total_time": 3833.784007055976, "throughput/update_time": 1873.921633931459, "throughput/token_count_per_second_total_recent": 287144.8858864552, "throughput/token_count_per_second_total_cum": 289919.97409200197, "throughput/token_count_per_second_update_recent": 595336.4657895181, "throughput/token_count_per_second_update_cum": 593136.0948473122, "throughput/batch_count_per_second_total_recent": 0.1369213513786579, "throughput/batch_count_per_second_total_cum": 0.13824461655235384, "throughput/batch_count_per_second_update_recent": 0.28387854852176575, "throughput/batch_count_per_second_update_cum": 0.28282932989469156, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, 
"throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1132462080, "throughput/token_count": 1132462080, "throughput/batch_count": 540, "throughput/flop_count": 0, "throughput/total_time": 3869.1690179569996, "throughput/update_time": 1909.1470028955955, "throughput/token_count_per_second_total_recent": 320047.1728768683, "throughput/token_count_per_second_total_cum": 292688.7077675307, "throughput/token_count_per_second_update_recent": 595339.1545270239, "throughput/token_count_per_second_update_cum": 593176.9938524375, "throughput/batch_count_per_second_total_recent": 0.15261038440555014, "throughput/batch_count_per_second_total_cum": 0.13956485165001425, "throughput/batch_count_per_second_update_recent": 0.2838798306117172, "throughput/batch_count_per_second_update_cum": 0.28284883206006883, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1153433600, "throughput/token_count": 1153433600, "throughput/batch_count": 550, "throughput/flop_count": 0, "throughput/total_time": 3979.0184022249887, "throughput/update_time": 1944.3775492714485, "throughput/token_count_per_second_total_recent": 287099.6087215058, "throughput/token_count_per_second_total_cum": 289878.93078228104, "throughput/token_count_per_second_update_recent": 595329.5261256426, "throughput/token_count_per_second_update_cum": 593214.8313645092, "throughput/batch_count_per_second_total_recent": 0.13689976154399194, "throughput/batch_count_per_second_total_cum": 0.13822504557718326, "throughput/batch_count_per_second_update_recent": 0.2838752394321645, 
"throughput/batch_count_per_second_update_cum": 0.2828668743917986, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1174405120, "throughput/token_count": 1174405120, "throughput/batch_count": 560, "throughput/flop_count": 0, "throughput/total_time": 4014.40222099697, "throughput/update_time": 1979.6119488844415, "throughput/token_count_per_second_total_recent": 320036.8457429895, "throughput/token_count_per_second_total_cum": 292547.94496111514, "throughput/token_count_per_second_update_recent": 595317.1917205523, "throughput/token_count_per_second_update_cum": 593250.1673683093, "throughput/batch_count_per_second_total_recent": 0.152605460044379, "throughput/batch_count_per_second_total_cum": 0.13949773071342236, "throughput/batch_count_per_second_update_recent": 0.2838693579294931, "throughput/batch_count_per_second_update_cum": 0.2828837239114329, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1195376640, "throughput/token_count": 1195376640, "throughput/batch_count": 570, "throughput/flop_count": 0, "throughput/total_time": 4124.598098332004, "throughput/update_time": 2014.8426124633406, "throughput/token_count_per_second_total_recent": 286946.23615193326, "throughput/token_count_per_second_total_cum": 289816.5133915503, "throughput/token_count_per_second_update_recent": 595305.2381260183, "throughput/token_count_per_second_update_cum": 
593285.3676042398, "throughput/batch_count_per_second_total_recent": 0.13682662780377067, "throughput/batch_count_per_second_total_cum": 0.13819528264596476, "throughput/batch_count_per_second_update_recent": 0.2838636580114452, "throughput/batch_count_per_second_update_cum": 0.2829005086919021, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1216348160, "throughput/token_count": 1216348160, "throughput/batch_count": 580, "throughput/flop_count": 0, "throughput/total_time": 4159.974407147965, "throughput/update_time": 2050.0642815562896, "throughput/token_count_per_second_total_recent": 319822.6265514626, "throughput/token_count_per_second_total_cum": 292393.18345564423, "throughput/token_count_per_second_update_recent": 595303.4015338363, "throughput/token_count_per_second_update_cum": 593321.9611419302, "throughput/batch_count_per_second_total_recent": 0.15250331237385875, "throughput/batch_count_per_second_total_cum": 0.13942393467695438, "throughput/batch_count_per_second_update_recent": 0.28386278225604833, "throughput/batch_count_per_second_update_cum": 0.28291795785042295, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1237319680, "throughput/token_count": 1237319680, "throughput/batch_count": 590, "throughput/flop_count": 0, "throughput/total_time": 4270.076159979973, "throughput/update_time": 2085.291462272231, 
"throughput/token_count_per_second_total_recent": 286811.3154753876, "throughput/token_count_per_second_total_cum": 289765.24859121087, "throughput/token_count_per_second_update_recent": 595301.4199040746, "throughput/token_count_per_second_update_cum": 593355.7502085386, "throughput/batch_count_per_second_total_recent": 0.1367622926117838, "throughput/batch_count_per_second_total_cum": 0.13817083768425506, "throughput/batch_count_per_second_update_recent": 0.2838618373413442, "throughput/batch_count_per_second_update_cum": 0.28293406973292284, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1258291200, "throughput/token_count": 1258291200, "throughput/batch_count": 600, "throughput/flop_count": 0, "throughput/total_time": 4305.46517852298, "throughput/update_time": 2120.5240447262186, "throughput/token_count_per_second_total_recent": 320020.00963279215, "throughput/token_count_per_second_total_cum": 292254.4133620576, "throughput/token_count_per_second_update_recent": 595303.263989338, "throughput/token_count_per_second_update_cum": 593386.9050574517, "throughput/batch_count_per_second_total_recent": 0.1525974319614373, "throughput/batch_count_per_second_total_cum": 0.13935776393988494, "throughput/batch_count_per_second_update_recent": 0.2838627166697206, "throughput/batch_count_per_second_update_cum": 0.2829489255225428, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, 
"throughput/mfu_update_cum": 0.0} +{"step": 1279262720, "throughput/token_count": 1279262720, "throughput/batch_count": 610, "throughput/flop_count": 0, "throughput/total_time": 4416.261696751986, "throughput/update_time": 2155.7523099503014, "throughput/token_count_per_second_total_recent": 286692.59308566153, "throughput/token_count_per_second_total_cum": 289670.9497403325, "throughput/token_count_per_second_update_recent": 595296.6956362277, "throughput/token_count_per_second_update_cum": 593418.2299586597, "throughput/batch_count_per_second_total_recent": 0.13670568136485173, "throughput/batch_count_per_second_total_cum": 0.1381258724881804, "throughput/batch_count_per_second_update_recent": 0.2838595846348895, "throughput/batch_count_per_second_update_cum": 0.28296386239941584, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1300234240, "throughput/token_count": 1300234240, "throughput/batch_count": 620, "throughput/flop_count": 0, "throughput/total_time": 4451.645092452003, "throughput/update_time": 2190.979344193125, "throughput/token_count_per_second_total_recent": 319554.0677439271, "throughput/token_count_per_second_total_cum": 292079.49263624253, "throughput/token_count_per_second_update_recent": 595295.7328357437, "throughput/token_count_per_second_update_cum": 593448.880951837, "throughput/batch_count_per_second_total_recent": 0.1523752535552631, "throughput/batch_count_per_second_total_cum": 0.13927435523807646, "throughput/batch_count_per_second_update_recent": 0.28385912553584275, "throughput/batch_count_per_second_update_cum": 0.2829784779318986, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, 
"throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1321205760, "throughput/token_count": 1321205760, "throughput/batch_count": 630, "throughput/flop_count": 0, "throughput/total_time": 4562.018341306015, "throughput/update_time": 2226.215099543275, "throughput/token_count_per_second_total_recent": 286491.4882416739, "throughput/token_count_per_second_total_cum": 289609.91849536606, "throughput/token_count_per_second_update_recent": 595281.8743760437, "throughput/token_count_per_second_update_cum": 593476.2369867384, "throughput/batch_count_per_second_total_recent": 0.1366097871025438, "throughput/batch_count_per_second_total_cum": 0.138096770522769, "throughput/batch_count_per_second_update_recent": 0.2838525173073023, "throughput/batch_count_per_second_update_cum": 0.2829915223058407, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1342177280, "throughput/token_count": 1342177280, "throughput/batch_count": 640, "throughput/flop_count": 0, "throughput/total_time": 4597.405511869991, "throughput/update_time": 2261.444451206189, "throughput/token_count_per_second_total_recent": 319301.82449310325, "throughput/token_count_per_second_total_cum": 291942.3306329292, "throughput/token_count_per_second_update_recent": 595276.4514250555, "throughput/token_count_per_second_update_cum": 593504.4211606089, "throughput/batch_count_per_second_total_recent": 0.15225497460036433, "throughput/batch_count_per_second_total_cum": 0.1392089512982031, 
"throughput/batch_count_per_second_update_recent": 0.28384993144276405, "throughput/batch_count_per_second_update_cum": 0.2830049615672154, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1363148800, "throughput/token_count": 1363148800, "throughput/batch_count": 650, "throughput/flop_count": 0, "throughput/total_time": 4707.963436296966, "throughput/update_time": 2296.670486221323, "throughput/token_count_per_second_total_recent": 286210.85490317375, "throughput/token_count_per_second_total_cum": 289541.07618817454, "throughput/token_count_per_second_update_recent": 595284.4433388836, "throughput/token_count_per_second_update_cum": 593532.5978097834, "throughput/batch_count_per_second_total_recent": 0.13647597069891632, "throughput/batch_count_per_second_total_cum": 0.13806394395264365, "throughput/batch_count_per_second_update_recent": 0.2838537422842424, "throughput/batch_count_per_second_update_cum": 0.28301839724053546, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1384120320, "throughput/token_count": 1384120320, "throughput/batch_count": 660, "throughput/flop_count": 0, "throughput/total_time": 4743.360288269003, "throughput/update_time": 2331.894208611455, "throughput/token_count_per_second_total_recent": 319110.52026537637, "throughput/token_count_per_second_total_cum": 291801.64184937085, 
"throughput/token_count_per_second_update_recent": 595309.4594602975, "throughput/token_count_per_second_update_cum": 593560.5118313603, "throughput/batch_count_per_second_total_recent": 0.1521637536360628, "throughput/batch_count_per_second_total_cum": 0.13914186565846007, "throughput/batch_count_per_second_update_recent": 0.2838656709004867, "throughput/batch_count_per_second_update_cum": 0.2830317076832582, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1405091840, "throughput/token_count": 1405091840, "throughput/batch_count": 670, "throughput/flop_count": 0, "throughput/total_time": 4853.4431308630155, "throughput/update_time": 2367.121892035415, "throughput/token_count_per_second_total_recent": 286250.1339840854, "throughput/token_count_per_second_total_cum": 289504.131832725, "throughput/token_count_per_second_update_recent": 595307.723665768, "throughput/token_count_per_second_update_cum": 593586.6018254788, "throughput/batch_count_per_second_total_recent": 0.13649470042423506, "throughput/batch_count_per_second_total_cum": 0.13804632751117946, "throughput/batch_count_per_second_update_recent": 0.28386484320915606, "throughput/batch_count_per_second_update_cum": 0.2830441483619112, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1426063360, "throughput/token_count": 1426063360, "throughput/batch_count": 680, "throughput/flop_count": 0, 
"throughput/total_time": 4888.828857862973, "throughput/update_time": 2402.3454629034386, "throughput/token_count_per_second_total_recent": 319120.49608261324, "throughput/token_count_per_second_total_cum": 291698.35996741505, "throughput/token_count_per_second_update_recent": 595303.1806521083, "throughput/token_count_per_second_update_cum": 593612.9428598005, "throughput/batch_count_per_second_total_recent": 0.15216851047640478, "throughput/batch_count_per_second_total_cum": 0.1390926170193744, "throughput/batch_count_per_second_update_recent": 0.28386267693143286, "throughput/batch_count_per_second_update_cum": 0.2830567087458613, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1447034880, "throughput/token_count": 1447034880, "throughput/batch_count": 690, "throughput/flop_count": 0, "throughput/total_time": 4998.851999056002, "throughput/update_time": 2437.5750640144106, "throughput/token_count_per_second_total_recent": 286277.7910064956, "throughput/token_count_per_second_total_cum": 289473.43915628275, "throughput/token_count_per_second_update_recent": 595300.6752630881, "throughput/token_count_per_second_update_cum": 593637.0540388188, "throughput/batch_count_per_second_total_recent": 0.1365078883202055, "throughput/batch_count_per_second_total_cum": 0.13803169210256708, "throughput/batch_count_per_second_update_recent": 0.28386148226885227, "throughput/batch_count_per_second_update_cum": 0.2830682058519453, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, 
"throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1468006400, "throughput/token_count": 1468006400, "throughput/batch_count": 700, "throughput/flop_count": 0, "throughput/total_time": 5034.233835026971, "throughput/update_time": 2472.8065605463926, "throughput/token_count_per_second_total_recent": 319504.39242521033, "throughput/token_count_per_second_total_cum": 291604.73035359813, "throughput/token_count_per_second_update_recent": 595301.8366201824, "throughput/token_count_per_second_update_cum": 593660.0231583131, "throughput/batch_count_per_second_total_recent": 0.15235156651745335, "throughput/batch_count_per_second_total_cum": 0.13904797094039828, "throughput/batch_count_per_second_update_recent": 0.2838620360470688, "throughput/batch_count_per_second_update_cum": 0.2830791583816114, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1488977920, "throughput/token_count": 1488977920, "throughput/batch_count": 710, "throughput/flop_count": 0, "throughput/total_time": 5144.364876898995, "throughput/update_time": 2508.037551375397, "throughput/token_count_per_second_total_recent": 286543.1133607748, "throughput/token_count_per_second_total_cum": 289438.6295743374, "throughput/token_count_per_second_update_recent": 595297.2183242893, "throughput/token_count_per_second_update_cum": 593682.4666693889, "throughput/batch_count_per_second_total_recent": 0.13663440387762776, "throughput/batch_count_per_second_total_cum": 0.138015093600434, "throughput/batch_count_per_second_update_recent": 0.2838598338719794, "throughput/batch_count_per_second_update_cum": 0.2830898602816529, 
"throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1509949440, "throughput/token_count": 1509949440, "throughput/batch_count": 720, "throughput/flop_count": 0, "throughput/total_time": 5179.741817264992, "throughput/update_time": 2543.2725221014116, "throughput/token_count_per_second_total_recent": 319622.7222986866, "throughput/token_count_per_second_total_cum": 291510.56042350846, "throughput/token_count_per_second_update_recent": 595286.4315954153, "throughput/token_count_per_second_update_cum": 593703.3593051148, "throughput/batch_count_per_second_total_recent": 0.15240799059805232, "throughput/batch_count_per_second_total_cum": 0.13900306721854613, "throughput/batch_count_per_second_update_recent": 0.2838546903588368, "throughput/batch_count_per_second_update_cum": 0.2830998226666998, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1530920960, "throughput/token_count": 1530920960, "throughput/batch_count": 730, "throughput/flop_count": 0, "throughput/total_time": 5289.151538159, "throughput/update_time": 2578.4985305793816, "throughput/token_count_per_second_total_recent": 286928.20548304514, "throughput/token_count_per_second_total_cum": 289445.47134924197, "throughput/token_count_per_second_update_recent": 595299.9054239372, "throughput/token_count_per_second_update_cum": 593725.7445929225, "throughput/batch_count_per_second_total_recent": 
0.13681803011085755, "throughput/batch_count_per_second_total_cum": 0.13801835601293658, "throughput/batch_count_per_second_update_recent": 0.2838611151809393, "throughput/batch_count_per_second_update_cum": 0.2831104968037236, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1551892480, "throughput/token_count": 1551892480, "throughput/batch_count": 740, "throughput/flop_count": 0, "throughput/total_time": 5324.524823835993, "throughput/update_time": 2613.7285769192385, "throughput/token_count_per_second_total_recent": 320199.55050546443, "throughput/token_count_per_second_total_cum": 291461.216041802, "throughput/token_count_per_second_update_recent": 595297.7357206044, "throughput/token_count_per_second_update_cum": 593746.6092325439, "throughput/batch_count_per_second_total_recent": 0.15268304372094366, "throughput/batch_count_per_second_total_cum": 0.13897953798379994, "throughput/batch_count_per_second_update_recent": 0.28386008058576795, "throughput/batch_count_per_second_update_cum": 0.28312044583918755, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1572864000, "throughput/token_count": 1572864000, "throughput/batch_count": 750, "throughput/flop_count": 0, "throughput/total_time": 5434.649355244997, "throughput/update_time": 2648.9550557791954, "throughput/token_count_per_second_total_recent": 287106.57558374084, 
"throughput/token_count_per_second_total_cum": 289414.07203797315, "throughput/token_count_per_second_update_recent": 595297.3760949532, "throughput/token_count_per_second_update_cum": 593767.7185456584, "throughput/batch_count_per_second_total_recent": 0.1369030836027817, "throughput/batch_count_per_second_total_cum": 0.13800338365458162, "throughput/batch_count_per_second_update_recent": 0.2838599091028944, "throughput/batch_count_per_second_update_cum": 0.2831305115440647, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1593835520, "throughput/token_count": 1593835520, "throughput/batch_count": 760, "throughput/flop_count": 0, "throughput/total_time": 5470.030363440979, "throughput/update_time": 2684.182179984171, "throughput/token_count_per_second_total_recent": 320185.9291884688, "throughput/token_count_per_second_total_cum": 291375.99137518887, "throughput/token_count_per_second_update_recent": 595290.2896123148, "throughput/token_count_per_second_update_cum": 593788.1310311803, "throughput/batch_count_per_second_total_recent": 0.15267654857085647, "throughput/batch_count_per_second_total_cum": 0.13893889969596332, "throughput/batch_count_per_second_update_recent": 0.28385653000465144, "throughput/batch_count_per_second_update_cum": 0.2831402449756528, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1614807040, "throughput/token_count": 
1614807040, "throughput/batch_count": 770, "throughput/flop_count": 0, "throughput/total_time": 5579.999253949965, "throughput/update_time": 2719.416021748213, "throughput/token_count_per_second_total_recent": 287156.12521573994, "throughput/token_count_per_second_total_cum": 289391.9813442469, "throughput/token_count_per_second_update_recent": 595280.8345585034, "throughput/token_count_per_second_update_cum": 593806.54783445, "throughput/batch_count_per_second_total_recent": 0.13692671070849416, "throughput/batch_count_per_second_total_cum": 0.13799284999096245, "throughput/batch_count_per_second_update_recent": 0.28385202148366134, "throughput/batch_count_per_second_update_cum": 0.28314902679178716, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1635778560, "throughput/token_count": 1635778560, "throughput/batch_count": 780, "throughput/flop_count": 0, "throughput/total_time": 5615.37508792599, "throughput/update_time": 2754.6453251581406, "throughput/token_count_per_second_total_recent": 320216.5724799191, "throughput/token_count_per_second_total_cum": 291303.5254790373, "throughput/token_count_per_second_update_recent": 595271.3303561785, "throughput/token_count_per_second_update_cum": 593825.4718531113, "throughput/batch_count_per_second_total_recent": 0.15269116043086964, "throughput/batch_count_per_second_total_cum": 0.13890434526397577, "throughput/batch_count_per_second_update_recent": 0.28384748952683375, "throughput/batch_count_per_second_update_cum": 0.2831580504670674, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, 
"throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1656750080, "throughput/token_count": 1656750080, "throughput/batch_count": 790, "throughput/flop_count": 0, "throughput/total_time": 5724.889530220011, "throughput/update_time": 2789.872535758186, "throughput/token_count_per_second_total_recent": 287362.10652755736, "throughput/token_count_per_second_total_cum": 289394.2444224474, "throughput/token_count_per_second_update_recent": 595276.1190335038, "throughput/token_count_per_second_update_cum": 593844.3634127376, "throughput/batch_count_per_second_total_recent": 0.13702493025186413, "throughput/batch_count_per_second_total_cum": 0.13799392911074038, "throughput/batch_count_per_second_update_recent": 0.2838497729461211, "throughput/batch_count_per_second_update_cum": 0.2831670586646736, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1677721600, "throughput/token_count": 1677721600, "throughput/batch_count": 800, "throughput/flop_count": 0, "throughput/total_time": 5760.269799852977, "throughput/update_time": 2825.100109060295, "throughput/token_count_per_second_total_recent": 320523.54177099414, "throughput/token_count_per_second_total_cum": 291257.4685378143, "throughput/token_count_per_second_update_recent": 595282.228384882, "throughput/token_count_per_second_update_cum": 593862.7075973091, "throughput/batch_count_per_second_total_recent": 0.15283753479528148, "throughput/batch_count_per_second_total_cum": 0.13888238360300745, "throughput/batch_count_per_second_update_recent": 0.28385268611187076, 
"throughput/batch_count_per_second_update_cum": 0.28317580585351426, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1698693120, "throughput/token_count": 1698693120, "throughput/batch_count": 810, "throughput/flop_count": 0, "throughput/total_time": 5870.409012118005, "throughput/update_time": 2860.3205174013856, "throughput/token_count_per_second_total_recent": 287360.3696961762, "throughput/token_count_per_second_total_cum": 289365.3775219868, "throughput/token_count_per_second_update_recent": 595304.7902771322, "throughput/token_count_per_second_update_cum": 593882.0875722244, "throughput/batch_count_per_second_total_recent": 0.13702410206612406, "throughput/batch_count_per_second_total_cum": 0.1379801642999586, "throughput/batch_count_per_second_update_recent": 0.28386344446045503, "throughput/batch_count_per_second_update_cum": 0.2831850469456789, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1719664640, "throughput/token_count": 1719664640, "throughput/batch_count": 820, "throughput/flop_count": 0, "throughput/total_time": 5905.784063692961, "throughput/update_time": 2895.5508844144642, "throughput/token_count_per_second_total_recent": 320169.47303043323, "throughput/token_count_per_second_total_cum": 291183.1217419541, "throughput/token_count_per_second_update_recent": 595306.458886581, "throughput/token_count_per_second_update_cum": 
593898.953479365, "throughput/batch_count_per_second_total_recent": 0.15266870166322385, "throughput/batch_count_per_second_total_cum": 0.13884693228814798, "throughput/batch_count_per_second_update_recent": 0.28386424011544276, "throughput/batch_count_per_second_update_cum": 0.28319308923691033, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1740636160, "throughput/token_count": 1740636160, "throughput/batch_count": 830, "throughput/flop_count": 0, "throughput/total_time": 6015.045884961961, "throughput/update_time": 2930.7809853444924, "throughput/token_count_per_second_total_recent": 287418.7182619779, "throughput/token_count_per_second_total_cum": 289380.3627253639, "throughput/token_count_per_second_update_recent": 595301.7283410962, "throughput/token_count_per_second_update_cum": 593915.4678238096, "throughput/batch_count_per_second_total_recent": 0.13705192483042616, "throughput/batch_count_per_second_total_cum": 0.13798730980175206, "throughput/batch_count_per_second_update_recent": 0.28386198441557703, "throughput/batch_count_per_second_update_cum": 0.2832009638899849, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1761607680, "throughput/token_count": 1761607680, "throughput/batch_count": 840, "throughput/flop_count": 0, "throughput/total_time": 6050.427405752998, "throughput/update_time": 2966.012128848466, 
"throughput/token_count_per_second_total_recent": 320586.3509958243, "throughput/token_count_per_second_total_cum": 291154.2543796146, "throughput/token_count_per_second_update_recent": 595298.2090235355, "throughput/token_count_per_second_update_cum": 593931.3810843829, "throughput/batch_count_per_second_total_recent": 0.15286748456755844, "throughput/batch_count_per_second_total_cum": 0.1388331672571252, "throughput/batch_count_per_second_update_recent": 0.28386030627419256, "throughput/batch_count_per_second_update_cum": 0.2832085519239344, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1782579200, "throughput/token_count": 1782579200, "throughput/batch_count": 850, "throughput/flop_count": 0, "throughput/total_time": 6159.759159684996, "throughput/update_time": 3001.234070145467, "throughput/token_count_per_second_total_recent": 287733.0290517822, "throughput/token_count_per_second_total_cum": 289391.05471311306, "throughput/token_count_per_second_update_recent": 595308.2237220665, "throughput/token_count_per_second_update_cum": 593948.741863243, "throughput/batch_count_per_second_total_recent": 0.13720179989422904, "throughput/batch_count_per_second_total_cum": 0.1379924081388059, "throughput/batch_count_per_second_update_recent": 0.28386508165458035, "throughput/batch_count_per_second_update_cum": 0.2832168301883903, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, 
"throughput/mfu_update_cum": 0.0} +{"step": 1803550720, "throughput/token_count": 1803550720, "throughput/batch_count": 860, "throughput/flop_count": 0, "throughput/total_time": 6195.132035595016, "throughput/update_time": 3036.459684428468, "throughput/token_count_per_second_total_recent": 320910.5734840143, "throughput/token_count_per_second_total_cum": 291123.8549295547, "throughput/token_count_per_second_update_recent": 595310.4776530535, "throughput/token_count_per_second_update_cum": 593964.9814054653, "throughput/batch_count_per_second_total_recent": 0.1530220858974525, "throughput/batch_count_per_second_total_cum": 0.13881867166974768, "throughput/batch_count_per_second_update_recent": 0.28386615641262697, "throughput/batch_count_per_second_update_cum": 0.28322457380555405, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1824522240, "throughput/token_count": 1824522240, "throughput/batch_count": 870, "throughput/flop_count": 0, "throughput/total_time": 6304.553084079002, "throughput/update_time": 3071.6873910723953, "throughput/token_count_per_second_total_recent": 287953.38225368614, "throughput/token_count_per_second_total_cum": 289397.55374691, "throughput/token_count_per_second_update_recent": 595320.2410656475, "throughput/token_count_per_second_update_cum": 593980.443876816, "throughput/batch_count_per_second_total_recent": 0.13730687248882587, "throughput/batch_count_per_second_total_cum": 0.1379955071196127, "throughput/batch_count_per_second_update_recent": 0.28387081197054265, "throughput/batch_count_per_second_update_cum": 0.2832319468864517, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, 
"throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1845493760, "throughput/token_count": 1845493760, "throughput/batch_count": 880, "throughput/flop_count": 0, "throughput/total_time": 6339.925039035967, "throughput/update_time": 3106.918331800378, "throughput/token_count_per_second_total_recent": 320956.4177530707, "throughput/token_count_per_second_total_cum": 291090.7855592913, "throughput/token_count_per_second_update_recent": 595317.369288677, "throughput/token_count_per_second_update_cum": 593994.9373984943, "throughput/batch_count_per_second_total_recent": 0.15304394614842926, "throughput/batch_count_per_second_total_cum": 0.13880290296520773, "throughput/batch_count_per_second_update_recent": 0.28386944260057306, "throughput/batch_count_per_second_update_cum": 0.2832388579361412, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1866465280, "throughput/token_count": 1866465280, "throughput/batch_count": 890, "throughput/flop_count": 0, "throughput/total_time": 6449.384100833966, "throughput/update_time": 3142.1480970325065, "throughput/token_count_per_second_total_recent": 287977.5860376513, "throughput/token_count_per_second_total_cum": 289402.0965131614, "throughput/token_count_per_second_update_recent": 595314.085203707, "throughput/token_count_per_second_update_cum": 594009.3281289698, "throughput/batch_count_per_second_total_recent": 0.13731841375238957, "throughput/batch_count_per_second_total_cum": 0.13799767327936238, 
"throughput/batch_count_per_second_update_recent": 0.28386787662682866, "throughput/batch_count_per_second_update_cum": 0.28324571997116554, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1887436800, "throughput/token_count": 1887436800, "throughput/batch_count": 900, "throughput/flop_count": 0, "throughput/total_time": 6484.7479819079745, "throughput/update_time": 3177.374425999529, "throughput/token_count_per_second_total_recent": 321311.38188549445, "throughput/token_count_per_second_total_cum": 291057.84916635556, "throughput/token_count_per_second_update_recent": 595313.4168316369, "throughput/token_count_per_second_update_cum": 594024.042163761, "throughput/batch_count_per_second_total_recent": 0.15321320623659823, "throughput/batch_count_per_second_total_cum": 0.13878719766919878, "throughput/batch_count_per_second_update_recent": 0.28386755792219015, "throughput/batch_count_per_second_update_cum": 0.2832527361697011, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1908408320, "throughput/token_count": 1908408320, "throughput/batch_count": 910, "throughput/flop_count": 0, "throughput/total_time": 6595.026013074967, "throughput/update_time": 3212.6112683996907, "throughput/token_count_per_second_total_recent": 287928.6686586852, "throughput/token_count_per_second_total_cum": 289370.8555836604, 
"throughput/token_count_per_second_update_recent": 595287.4700670408, "throughput/token_count_per_second_update_cum": 594036.4894974182, "throughput/batch_count_per_second_total_recent": 0.13729508812841662, "throughput/batch_count_per_second_total_cum": 0.13798277644331952, "throughput/batch_count_per_second_update_recent": 0.28385518554069555, "throughput/batch_count_per_second_update_cum": 0.2832586715209094, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1929379840, "throughput/token_count": 1929379840, "throughput/batch_count": 920, "throughput/flop_count": 0, "throughput/total_time": 6630.405630423978, "throughput/update_time": 3247.8434043628513, "throughput/token_count_per_second_total_recent": 320793.1174572437, "throughput/token_count_per_second_total_cum": 290989.71428639826, "throughput/token_count_per_second_update_recent": 595283.1746500942, "throughput/token_count_per_second_update_cum": 594049.5275752058, "throughput/batch_count_per_second_total_recent": 0.15296607849943336, "throughput/batch_count_per_second_total_cum": 0.13875470842666543, "throughput/batch_count_per_second_update_recent": 0.2838531373262855, "throughput/batch_count_per_second_update_cum": 0.28326488856087007, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1950351360, "throughput/token_count": 1950351360, "throughput/batch_count": 930, "throughput/flop_count": 0, 
"throughput/total_time": 6740.169189045962, "throughput/update_time": 3283.0864938918385, "throughput/token_count_per_second_total_recent": 287726.2435832469, "throughput/token_count_per_second_total_cum": 289362.3743406451, "throughput/token_count_per_second_update_recent": 595262.8185089469, "throughput/token_count_per_second_update_cum": 594060.3038112508, "throughput/batch_count_per_second_total_recent": 0.13719856433069558, "throughput/batch_count_per_second_total_cum": 0.13797873227150206, "throughput/batch_count_per_second_update_recent": 0.2838434307617888, "throughput/batch_count_per_second_update_cum": 0.2832700270706419, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1971322880, "throughput/token_count": 1971322880, "throughput/batch_count": 940, "throughput/flop_count": 0, "throughput/total_time": 6775.5438796009985, "throughput/update_time": 3318.3166800447507, "throughput/token_count_per_second_total_recent": 320591.5338415864, "throughput/token_count_per_second_total_cum": 290946.8103269207, "throughput/token_count_per_second_update_recent": 595262.2172577728, "throughput/token_count_per_second_update_cum": 594073.1612069692, "throughput/batch_count_per_second_total_recent": 0.15286995594100303, "throughput/batch_count_per_second_total_cum": 0.1387342502245525, "throughput/batch_count_per_second_update_recent": 0.2838431440628876, "throughput/batch_count_per_second_update_cum": 0.28327615795467814, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, 
"throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1992294400, "throughput/token_count": 1992294400, "throughput/batch_count": 950, "throughput/flop_count": 0, "throughput/total_time": 6886.654366842995, "throughput/update_time": 3353.5464223485906, "throughput/token_count_per_second_total_recent": 287022.2985556804, "throughput/token_count_per_second_total_cum": 289297.8642274035, "throughput/token_count_per_second_update_recent": 595250.448677634, "throughput/token_count_per_second_update_cum": 594085.8270883084, "throughput/batch_count_per_second_total_recent": 0.13686289718421954, "throughput/batch_count_per_second_total_cum": 0.13794797145242857, "throughput/batch_count_per_second_update_recent": 0.2838375323665781, "throughput/batch_count_per_second_update_cum": 0.2832821975175421, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 2013265920, "throughput/token_count": 2013265920, "throughput/batch_count": 960, "throughput/flop_count": 0, "throughput/total_time": 6922.033780722995, "throughput/update_time": 3388.7756955446093, "throughput/token_count_per_second_total_recent": 319745.9419055339, "throughput/token_count_per_second_total_cum": 290848.90131664707, "throughput/token_count_per_second_update_recent": 595244.8028650397, "throughput/token_count_per_second_update_cum": 594098.3118614018, "throughput/batch_count_per_second_total_recent": 0.15246674628521628, "throughput/batch_count_per_second_total_cum": 0.13868756357033113, "throughput/batch_count_per_second_update_recent": 0.2838348402333449, "throughput/batch_count_per_second_update_cum": 0.28328815072126473, 
"throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 2034237440, "throughput/token_count": 2034237440, "throughput/batch_count": 970, "throughput/flop_count": 0, "throughput/total_time": 7032.169445707987, "throughput/update_time": 3424.0070785805583, "throughput/token_count_per_second_total_recent": 286734.88748149923, "throughput/token_count_per_second_total_cum": 289275.94189892226, "throughput/token_count_per_second_update_recent": 595238.3065569528, "throughput/token_count_per_second_update_cum": 594110.1736399753, "throughput/batch_count_per_second_total_recent": 0.1367258489043709, "throughput/batch_count_per_second_total_cum": 0.1379375180716144, "throughput/batch_count_per_second_update_recent": 0.2838317425522579, "throughput/batch_count_per_second_update_cum": 0.28329380685805094, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 2055208960, "throughput/token_count": 2055208960, "throughput/batch_count": 980, "throughput/flop_count": 0, "throughput/total_time": 7067.550158863014, "throughput/update_time": 3459.239731548645, "throughput/token_count_per_second_total_recent": 319413.4155637033, "throughput/token_count_per_second_total_cum": 290795.10067893594, "throughput/token_count_per_second_update_recent": 595236.3510792654, "throughput/token_count_per_second_update_cum": 594121.5756908286, "throughput/batch_count_per_second_total_recent": 
0.1523081853693501, "throughput/batch_count_per_second_total_cum": 0.13866190942713544, "throughput/batch_count_per_second_update_recent": 0.2838308101078345, "throughput/batch_count_per_second_update_cum": 0.28329924377957755, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} diff --git a/metrics/jsonlines/train.jsonl b/metrics/jsonlines/train.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..29b03655a93b7d2f958f4d98e640b2ac9d0d12ac --- /dev/null +++ b/metrics/jsonlines/train.jsonl @@ -0,0 +1,98 @@ +{"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 41.97735304199159, "train/update_time": 41.75535514205694, "train/lr": 0.0009000000000000001, "train/loss": 9.76972484588623, "train/global_grad_norm": 1.1996515989303589} +{"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 77.36710998200579, "train/update_time": 76.98762350907782, "train/lr": 0.0009997960964140947, "train/loss": 8.079014778137207, "train/global_grad_norm": 0.9576020836830139} +{"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 188.23102539300453, "train/update_time": 112.21908622107003, "train/lr": 0.0009990914580222257, "train/loss": 7.444537162780762, "train/global_grad_norm": 0.42209190130233765} +{"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 223.6157359869685, "train/update_time": 147.44638093106914, "train/lr": 0.0009978842768382998, "train/loss": 7.105719089508057, "train/global_grad_norm": 
0.3969765603542328} +{"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 333.5220514299581, "train/update_time": 182.68076597800246, "train/lr": 0.0009961757683914405, "train/loss": 6.857090473175049, "train/global_grad_norm": 0.5435984134674072} +{"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 368.9031989739742, "train/update_time": 217.90960857702885, "train/lr": 0.00099396765300483, "train/loss": 6.596991539001465, "train/global_grad_norm": 0.43834808468818665} +{"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 478.86176590196555, "train/update_time": 253.13642803102266, "train/lr": 0.0009912621540634887, "train/loss": 6.422722339630127, "train/global_grad_norm": 0.6763378381729126} +{"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 514.2439901359612, "train/update_time": 288.3617576470133, "train/lr": 0.000988061995775515, "train/loss": 6.240057945251465, "train/global_grad_norm": 0.45933476090431213} +{"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 624.0890151349595, "train/update_time": 323.589894320874, "train/lr": 0.0009843704004290394, "train/loss": 6.051237106323242, "train/global_grad_norm": 0.360037237405777} +{"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 659.4609651200008, "train/update_time": 358.814332414011, "train/lr": 0.0009801910851476522, "train/loss": 5.918942928314209, "train/global_grad_norm": 0.62600177526474} +{"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 770.1425092020072, "train/update_time": 394.0443641850143, "train/lr": 0.0009755282581475768, 
"train/loss": 5.784972667694092, "train/global_grad_norm": 0.44626691937446594} +{"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 805.5059107720153, "train/update_time": 429.2700012290734, "train/lr": 0.0009703866145003512, "train/loss": 5.645214557647705, "train/global_grad_norm": 0.4792501628398895} +{"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 916.1022540619597, "train/update_time": 464.49939665506827, "train/lr": 0.0009647713314052896, "train/loss": 5.578414440155029, "train/global_grad_norm": 0.679843544960022} +{"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 951.4780202080146, "train/update_time": 499.73381894716294, "train/lr": 0.0009586880629764817, "train/loss": 5.486634254455566, "train/global_grad_norm": 0.6061164736747742} +{"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 1061.4905991089763, "train/update_time": 534.9619731742423, "train/lr": 0.0009521429345495787, "train/loss": 5.374250411987305, "train/global_grad_norm": 0.7468693256378174} +{"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 1096.8764924500138, "train/update_time": 570.1910730601521, "train/lr": 0.0009451425365140996, "train/loss": 5.32179069519043, "train/global_grad_norm": 0.4829612672328949} +{"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 1206.983663285966, "train/update_time": 605.4215191720868, "train/lr": 0.000937693917677468, "train/loss": 5.216552734375, "train/global_grad_norm": 0.40984582901000977} +{"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 1242.364284639014, 
"train/update_time": 640.6497024221462, "train/lr": 0.0009298045781674596, "train/loss": 5.194259166717529, "train/global_grad_norm": 0.6644951105117798} +{"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 1352.8268752049771, "train/update_time": 675.885561926174, "train/lr": 0.0009214824618802108, "train/loss": 5.163832187652588, "train/global_grad_norm": 0.6656083464622498} +{"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 1388.2071847780026, "train/update_time": 711.1109357241658, "train/lr": 0.000912735948481387, "train/loss": 5.068937301635742, "train/global_grad_norm": 0.4672752618789673} +{"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 1498.7065829180065, "train/update_time": 746.3382132861298, "train/lr": 0.0009035738449685707, "train/loss": 5.02679443359375, "train/global_grad_norm": 0.6419569253921509} +{"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 1534.1124866969767, "train/update_time": 781.5830888972268, "train/lr": 0.0008940053768033609, "train/loss": 4.984641075134277, "train/global_grad_norm": 0.486380398273468} +{"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 1645.136862017971, "train/update_time": 816.84872851416, "train/lr": 0.0008840401786221159, "train/loss": 4.932381629943848, "train/global_grad_norm": 0.7352603077888489} +{"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 1680.500316324993, "train/update_time": 852.0965187721886, "train/lr": 0.0008736882845346905, "train/loss": 4.874920845031738, "train/global_grad_norm": 0.5009574890136719} +{"step": 524288000, "train/token_count": 524288000, "train/batch_count": 
250, "train/flop_count": 0, "train/total_time": 1792.9284177150112, "train/update_time": 887.3467938142712, "train/lr": 0.0008629601180209381, "train/loss": 4.870193958282471, "train/global_grad_norm": 0.6007606387138367} +{"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 1828.322631730989, "train/update_time": 922.6015984143014, "train/lr": 0.0008518664814351503, "train/loss": 4.817152976989746, "train/global_grad_norm": 0.4920603632926941} +{"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 1940.5543955019675, "train/update_time": 957.8616747342749, "train/lr": 0.0008404185451290017, "train/loss": 4.805738925933838, "train/global_grad_norm": 0.7058248519897461} +{"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 1975.955018882989, "train/update_time": 993.1225579883321, "train/lr": 0.0008286278362039527, "train/loss": 4.739347457885742, "train/global_grad_norm": 0.5552049279212952} +{"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 2087.7305572130135, "train/update_time": 1028.3721907203435, "train/lr": 0.0008165062269044352, "train/loss": 4.710965633392334, "train/global_grad_norm": 0.6555073261260986} +{"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 2123.1104451339925, "train/update_time": 1063.6117942532874, "train/lr": 0.0008040659226635089, "train/loss": 4.675158500671387, "train/global_grad_norm": 0.54646235704422} +{"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 2233.8338464839617, "train/update_time": 1098.8516714693396, "train/lr": 0.0007913194498130252, "train/loss": 4.6881422996521, "train/global_grad_norm": 0.4385336637496948} 
+{"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 2269.22137821099, "train/update_time": 1134.096047840314, "train/lr": 0.000778279642970672, "train/loss": 4.613880157470703, "train/global_grad_norm": 0.6974025368690491} +{"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 2379.2475059829885, "train/update_time": 1169.336060541391, "train/lr": 0.0007649596321166025, "train/loss": 4.6237874031066895, "train/global_grad_norm": 0.623008131980896} +{"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 2414.640490865975, "train/update_time": 1204.5815564935328, "train/lr": 0.0007513728293726579, "train/loss": 4.571319580078125, "train/global_grad_norm": 0.8349814414978027} +{"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 2525.280265790003, "train/update_time": 1239.831252818578, "train/lr": 0.0007375329154974975, "train/loss": 4.533912181854248, "train/global_grad_norm": 0.5782769322395325} +{"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 2560.696846612962, "train/update_time": 1275.0923811426037, "train/lr": 0.0007234538261112341, "train/loss": 4.453745365142822, "train/global_grad_norm": 0.5227785706520081} +{"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 2670.3778264659923, "train/update_time": 1310.3158643786446, "train/lr": 0.0007091497376634464, "train/loss": 4.4719929695129395, "train/global_grad_norm": 1.180794596672058} +{"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 2705.7711879389826, "train/update_time": 1345.540644572582, "train/lr": 0.0006946350531586958, 
"train/loss": 4.423107624053955, "train/global_grad_norm": 0.6167490482330322} +{"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 2815.635458876961, "train/update_time": 1380.7573516704724, "train/lr": 0.0006799243876539214, "train/loss": 4.413957595825195, "train/global_grad_norm": 0.8560699820518494} +{"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 2851.024827848014, "train/update_time": 1415.9805577184306, "train/lr": 0.0006650325535423166, "train/loss": 4.290282249450684, "train/global_grad_norm": 0.5771048665046692} +{"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 2961.242350918008, "train/update_time": 1451.2064463463612, "train/lr": 0.0006499745456385053, "train/loss": 4.317328929901123, "train/global_grad_norm": 0.8895323276519775} +{"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 2996.6310340749915, "train/update_time": 1486.4294549234328, "train/lr": 0.0006347655260800339, "train/loss": 4.294151782989502, "train/global_grad_norm": 0.5844022035598755} +{"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 3107.195180976996, "train/update_time": 1521.6597315414692, "train/lr": 0.0006194208090603844, "train/loss": 4.289255142211914, "train/global_grad_norm": 0.7929869890213013} +{"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 3142.596222635999, "train/update_time": 1556.8869810854085, "train/lr": 0.0006039558454088796, "train/loss": 4.304691791534424, "train/global_grad_norm": 0.7291324138641357} +{"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 3252.3157649660134, 
"train/update_time": 1592.1117133093649, "train/lr": 0.0005883862070330078, "train/loss": 4.23949670791626, "train/global_grad_norm": 0.66145920753479} +{"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 3287.7011176730157, "train/update_time": 1627.3387761343038, "train/lr": 0.0005727275712388317, "train/loss": 4.2027587890625, "train/global_grad_norm": 0.7958371043205261} +{"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 3397.5104132720153, "train/update_time": 1662.5627456933726, "train/lr": 0.0005569957049452703, "train/loss": 4.191956043243408, "train/global_grad_norm": 0.7229559421539307} +{"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 3432.890017031983, "train/update_time": 1697.7829640183481, "train/lr": 0.0005412064488081482, "train/loss": 4.181127548217773, "train/global_grad_norm": 0.9045247435569763} +{"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 3542.651402093994, "train/update_time": 1733.0092777723912, "train/lr": 0.0005253757012699972, "train/loss": 4.1732048988342285, "train/global_grad_norm": 0.5767119526863098} +{"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 3578.0405803910107, "train/update_time": 1768.2423270724248, "train/lr": 0.0005095194025516734, "train/loss": 4.142397403717041, "train/global_grad_norm": 0.7807267904281616} +{"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 3688.537352617015, "train/update_time": 1803.4673421693733, "train/lr": 0.0004936535186019053, "train/loss": 4.12105131149292, "train/global_grad_norm": 0.6827822327613831} +{"step": 1090519040, "train/token_count": 1090519040, 
"train/batch_count": 520, "train/flop_count": 0, "train/total_time": 3723.92296086601, "train/update_time": 1838.693349173409, "train/lr": 0.00047779402502093696, "train/loss": 4.119822978973389, "train/global_grad_norm": 1.0019612312316895} +{"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 3833.784007055976, "train/update_time": 1873.921633931459, "train/lr": 0.0004619568909744525, "train/loss": 4.105185508728027, "train/global_grad_norm": 1.0723958015441895} +{"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 3869.1690179569996, "train/update_time": 1909.1470028955955, "train/lr": 0.00044615806311398067, "train/loss": 4.104675769805908, "train/global_grad_norm": 0.6603816151618958} +{"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 3979.0184022249887, "train/update_time": 1944.3775492714485, "train/lr": 0.0004304134495199673, "train/loss": 4.048872947692871, "train/global_grad_norm": 0.6948413252830505} +{"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 4014.40222099697, "train/update_time": 1979.6119488844415, "train/lr": 0.0004147389036836882, "train/loss": 4.09351110458374, "train/global_grad_norm": 0.571071207523346} +{"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 4124.598098332004, "train/update_time": 2014.8426124633406, "train/lr": 0.0003991502085441259, "train/loss": 4.016101360321045, "train/global_grad_norm": 0.6385037302970886} +{"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 4159.974407147965, "train/update_time": 2050.0642815562896, "train/lr": 0.0003836630605958888, "train/loss": 4.078490734100342, 
"train/global_grad_norm": 0.8873701691627502} +{"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 4270.076159979973, "train/update_time": 2085.291462272231, "train/lr": 0.00036829305408417155, "train/loss": 4.081883430480957, "train/global_grad_norm": 0.699393630027771} +{"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 4305.46517852298, "train/update_time": 2120.5240447262186, "train/lr": 0.000353055665302672, "train/loss": 4.077620029449463, "train/global_grad_norm": 0.5510751605033875} +{"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 4416.261696751986, "train/update_time": 2155.7523099503014, "train/lr": 0.0003379662370102746, "train/loss": 4.03539514541626, "train/global_grad_norm": 0.4872359335422516} +{"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 4451.645092452003, "train/update_time": 2190.979344193125, "train/lr": 0.00032303996298219405, "train/loss": 4.0130109786987305, "train/global_grad_norm": 0.7172174453735352} +{"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 4562.018341306015, "train/update_time": 2226.215099543275, "train/lr": 0.00030829187271113034, "train/loss": 4.032833099365234, "train/global_grad_norm": 1.3824615478515625} +{"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 4597.405511869991, "train/update_time": 2261.444451206189, "train/lr": 0.0002937368162738445, "train/loss": 4.016963958740234, "train/global_grad_norm": 0.6392601132392883} +{"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 4707.963436296966, "train/update_time": 
2296.670486221323, "train/lr": 0.0002793894493783894, "train/loss": 3.9857394695281982, "train/global_grad_norm": 0.5544307231903076} +{"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 4743.360288269003, "train/update_time": 2331.894208611455, "train/lr": 0.00026526421860705474, "train/loss": 4.021228313446045, "train/global_grad_norm": 0.5490173101425171} +{"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 4853.4431308630155, "train/update_time": 2367.121892035415, "train/lr": 0.0002513753468698824, "train/loss": 3.9647693634033203, "train/global_grad_norm": 1.3666198253631592} +{"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 4888.828857862973, "train/update_time": 2402.3454629034386, "train/lr": 0.00023773681908340283, "train/loss": 3.980109691619873, "train/global_grad_norm": 0.8777939081192017} +{"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 4998.851999056002, "train/update_time": 2437.5750640144106, "train/lr": 0.00022436236808900823, "train/loss": 3.9890389442443848, "train/global_grad_norm": 0.5416154265403748} +{"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 5034.233835026971, "train/update_time": 2472.8065605463926, "train/lr": 0.00021126546082514682, "train/loss": 3.9841649532318115, "train/global_grad_norm": 0.45246919989585876} +{"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 5144.364876898995, "train/update_time": 2508.037551375397, "train/lr": 0.00019845928476725522, "train/loss": 3.9905643463134766, "train/global_grad_norm": 0.6189930438995361} +{"step": 1509949440, "train/token_count": 1509949440, 
"train/batch_count": 720, "train/flop_count": 0, "train/total_time": 5179.741817264992, "train/update_time": 2543.2725221014116, "train/lr": 0.0001859567346490913, "train/loss": 3.959702491760254, "train/global_grad_norm": 0.4403487741947174} +{"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 5289.151538159, "train/update_time": 2578.4985305793816, "train/lr": 0.00017377039947882782, "train/loss": 3.9835543632507324, "train/global_grad_norm": 0.47642815113067627} +{"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 5324.524823835993, "train/update_time": 2613.7285769192385, "train/lr": 0.00016191254986299043, "train/loss": 3.9557220935821533, "train/global_grad_norm": 1.0352903604507446} +{"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 5434.649355244997, "train/update_time": 2648.9550557791954, "train/lr": 0.00015039512565099468, "train/loss": 3.951629877090454, "train/global_grad_norm": 0.3987804055213928} +{"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 5470.030363440979, "train/update_time": 2684.182179984171, "train/lr": 0.00013922972391273224, "train/loss": 3.9082868099212646, "train/global_grad_norm": 0.39810803532600403} +{"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 5579.999253949965, "train/update_time": 2719.416021748213, "train/lr": 0.00012842758726130281, "train/loss": 3.979860305786133, "train/global_grad_norm": 0.43757563829421997} +{"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 5615.37508792599, "train/update_time": 2754.6453251581406, "train/lr": 0.00011799959253265679, "train/loss": 3.917025566101074, 
"train/global_grad_norm": 0.4092726707458496} +{"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 5724.889530220011, "train/update_time": 2789.872535758186, "train/lr": 0.00010795623983354214, "train/loss": 3.9336912631988525, "train/global_grad_norm": 0.445840060710907} +{"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 5760.269799852977, "train/update_time": 2825.100109060295, "train/lr": 9.830764196878872e-05, "train/loss": 3.918724298477173, "train/global_grad_norm": 0.3501419723033905} +{"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 5870.409012118005, "train/update_time": 2860.3205174013856, "train/lr": 8.906351425856951e-05, "train/loss": 3.8968920707702637, "train/global_grad_norm": 0.39583295583724976} +{"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 5905.784063692961, "train/update_time": 2895.5508844144642, "train/lr": 8.02331647558977e-05, "train/loss": 3.9078526496887207, "train/global_grad_norm": 0.32103216648101807} +{"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 6015.045884961961, "train/update_time": 2930.7809853444924, "train/lr": 7.182548487420554e-05, "train/loss": 3.9445602893829346, "train/global_grad_norm": 0.2946698069572449} +{"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 6050.427405752998, "train/update_time": 2966.012128848466, "train/lr": 6.384894043444556e-05, "train/loss": 3.87947940826416, "train/global_grad_norm": 0.35237979888916016} +{"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 6159.759159684996, "train/update_time": 
3001.234070145467, "train/lr": 5.6311563140726166e-05, "train/loss": 3.9575512409210205, "train/global_grad_norm": 0.3177047371864319} +{"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 6195.132035595016, "train/update_time": 3036.459684428468, "train/lr": 4.922094249306547e-05, "train/loss": 3.9432151317596436, "train/global_grad_norm": 0.3355129659175873} +{"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 6304.553084079002, "train/update_time": 3071.6873910723953, "train/lr": 4.2584218145409916e-05, "train/loss": 3.884796380996704, "train/global_grad_norm": 0.2935344874858856} +{"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 6339.925039035967, "train/update_time": 3106.918331800378, "train/lr": 3.6408072716606236e-05, "train/loss": 3.8955740928649902, "train/global_grad_norm": 0.27834922075271606} +{"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 6449.384100833966, "train/update_time": 3142.1480970325065, "train/lr": 3.069872506157217e-05, "train/loss": 3.967247486114502, "train/global_grad_norm": 0.3791719973087311} +{"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 6484.7479819079745, "train/update_time": 3177.374425999529, "train/lr": 2.5461924009435368e-05, "train/loss": 3.8741910457611084, "train/global_grad_norm": 0.28777578473091125} +{"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 6595.026013074967, "train/update_time": 3212.6112683996907, "train/lr": 2.0702942574950812e-05, "train/loss": 3.9192233085632324, "train/global_grad_norm": 0.22005437314510345} +{"step": 1929379840, "train/token_count": 1929379840, 
"train/batch_count": 920, "train/flop_count": 0, "train/total_time": 6630.405630423978, "train/update_time": 3247.8434043628513, "train/lr": 1.642657264902142e-05, "train/loss": 3.934617280960083, "train/global_grad_norm": 0.25288376212120056} +{"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 6740.169189045962, "train/update_time": 3283.0864938918385, "train/lr": 1.2637120173670358e-05, "train/loss": 3.9254136085510254, "train/global_grad_norm": 0.22438837587833405} +{"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 6775.5438796009985, "train/update_time": 3318.3166800447507, "train/lr": 9.338400806321978e-06, "train/loss": 3.887643575668335, "train/global_grad_norm": 0.22849982976913452} +{"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 6886.654366842995, "train/update_time": 3353.5464223485906, "train/lr": 6.533736077758867e-06, "train/loss": 3.896052598953247, "train/global_grad_norm": 0.22606733441352844} +{"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 6922.033780722995, "train/update_time": 3388.7756955446093, "train/lr": 4.2259500476214406e-06, "train/loss": 3.8972818851470947, "train/global_grad_norm": 0.21224528551101685} +{"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 7032.169445707987, "train/update_time": 3424.0070785805583, "train/lr": 2.417366460819359e-06, "train/loss": 3.9289143085479736, "train/global_grad_norm": 0.2368578314781189} +{"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 7067.550158863014, "train/update_time": 3459.239731548645, "train/lr": 1.1098064077174619e-06, "train/loss": 3.8993091583251953, 
"train/global_grad_norm": 0.20862537622451782} diff --git a/metrics/jsonlines/train_data_info.jsonl b/metrics/jsonlines/train_data_info.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8f2d82aa96124ce343d2b13b9c030bd8eadda7c6 --- /dev/null +++ b/metrics/jsonlines/train_data_info.jsonl @@ -0,0 +1 @@ +{"step": 0, "train_data_info/vocab_size": 50277, "train_data_info/global_tokens_per_batch": 2097152, "train_data_info/local_tokens_per_batch": 2097152, "train_data_info/batch_len": 2048, "train_data_info/seq_len": 2048, "train_data_info/total_tokens": 2055208960, "train_data_info/global_batch_size": 1024, "train_data_info/local_batch_size": 1024} diff --git a/metrics/jsonlines/train_eval.jsonl b/metrics/jsonlines/train_eval.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9c5e0d2932fdff37a05bf4192714e2b0b928a5df --- /dev/null +++ b/metrics/jsonlines/train_eval.jsonl @@ -0,0 +1,19 @@ +{"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 333.5220514299581, "train_eval/train_update_time": 182.68076597800246, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.218650683912129, "train_eval/perplexity_len_2048": 3709.4937245794213, "train_eval/loss_avg_len_1024": 8.219004572855848, "train_eval/perplexity_len_1024": 3710.8067057063945, "train_eval/loss_avg_len_512": 8.21855547720159, "train_eval/perplexity_len_512": 3709.1405726956223} +{"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 659.4609651200008, "train_eval/train_update_time": 358.814332414011, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.337175440577739, "train_eval/perplexity_len_2048": 565.197620381525, 
"train_eval/loss_avg_len_1024": 6.339329850406502, "train_eval/perplexity_len_1024": 566.4166003097482, "train_eval/loss_avg_len_512": 6.341749459894636, "train_eval/perplexity_len_512": 567.7887666740248} +{"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1061.4905991089763, "train_eval/train_update_time": 534.9619731742423, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.623096892892063, "train_eval/perplexity_len_2048": 276.7451074639398, "train_eval/loss_avg_len_1024": 5.627105757865829, "train_eval/perplexity_len_1024": 277.85676799159836, "train_eval/loss_avg_len_512": 5.634965539031764, "train_eval/perplexity_len_512": 280.0492663752382} +{"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1388.2071847780026, "train_eval/train_update_time": 711.1109357241658, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.218137195683321, "train_eval/perplexity_len_2048": 184.59000853965267, "train_eval/loss_avg_len_1024": 5.225316365856779, "train_eval/perplexity_len_1024": 185.91997995656686, "train_eval/loss_avg_len_512": 5.237964040004735, "train_eval/perplexity_len_512": 188.28636839096927} +{"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1792.9284177150112, "train_eval/train_update_time": 887.3467938142712, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.9624374085418275, "train_eval/perplexity_len_2048": 142.94177914951652, "train_eval/loss_avg_len_1024": 4.970113305729392, "train_eval/perplexity_len_1024": 144.04320736700222, 
"train_eval/loss_avg_len_512": 4.987048855513057, "train_eval/perplexity_len_512": 146.50343210389644} +{"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2123.1104451339925, "train_eval/train_update_time": 1063.6117942532874, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.778798732913674, "train_eval/perplexity_len_2048": 118.96135981119248, "train_eval/loss_avg_len_1024": 4.788676094550828, "train_eval/perplexity_len_1024": 120.14220640683598, "train_eval/loss_avg_len_512": 4.8105151809382365, "train_eval/perplexity_len_512": 122.79486279711149} +{"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2525.280265790003, "train_eval/train_update_time": 1239.831252818578, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.612123140184613, "train_eval/perplexity_len_2048": 100.69771821018394, "train_eval/loss_avg_len_1024": 4.629803386812637, "train_eval/perplexity_len_1024": 102.49391047489935, "train_eval/loss_avg_len_512": 4.662501077103371, "train_eval/perplexity_len_512": 105.90061681592375} +{"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2851.024827848014, "train_eval/train_update_time": 1415.9805577184306, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.448032398842888, "train_eval/perplexity_len_2048": 85.45862996984596, "train_eval/loss_avg_len_1024": 4.474290047360373, "train_eval/perplexity_len_1024": 87.73229252189076, "train_eval/loss_avg_len_512": 4.523296486837353, "train_eval/perplexity_len_512": 92.13883234754633} 
+{"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3252.3157649660134, "train_eval/train_update_time": 1592.1117133093649, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.298628560911212, "train_eval/perplexity_len_2048": 73.59878819895772, "train_eval/loss_avg_len_1024": 4.338705009182432, "train_eval/perplexity_len_1024": 76.60826807077736, "train_eval/loss_avg_len_512": 4.404042768209729, "train_eval/perplexity_len_512": 81.78082216081769} +{"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3578.0405803910107, "train_eval/train_update_time": 1768.2423270724248, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.191409433552872, "train_eval/perplexity_len_2048": 66.11591130529773, "train_eval/loss_avg_len_1024": 4.236702718509605, "train_eval/perplexity_len_1024": 69.17937150259523, "train_eval/loss_avg_len_512": 4.31055497860878, "train_eval/perplexity_len_512": 74.48181328533752} +{"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3979.0184022249887, "train_eval/train_update_time": 1944.3775492714485, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.112142655201096, "train_eval/perplexity_len_2048": 61.077445373302744, "train_eval/loss_avg_len_1024": 4.161546632732825, "train_eval/perplexity_len_1024": 64.17069438542758, "train_eval/loss_avg_len_512": 4.239235591167089, "train_eval/perplexity_len_512": 69.35481613679829} +{"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, 
"train_eval/train_flop_count": 0, "train_eval/train_total_time": 4305.46517852298, "train_eval/train_update_time": 2120.5240447262186, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.053491315687242, "train_eval/perplexity_len_2048": 57.59819991208716, "train_eval/loss_avg_len_1024": 4.104046444938321, "train_eval/perplexity_len_1024": 60.58494591305834, "train_eval/loss_avg_len_512": 4.186936804189099, "train_eval/perplexity_len_512": 65.82085965917027} +{"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4707.963436296966, "train_eval/train_update_time": 2296.670486221323, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.014058288484093, "train_eval/perplexity_len_2048": 55.371127214510835, "train_eval/loss_avg_len_1024": 4.071433403507817, "train_eval/perplexity_len_1024": 58.6409585331353, "train_eval/loss_avg_len_512": 4.15653280348517, "train_eval/perplexity_len_512": 63.849758711728725} +{"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5034.233835026971, "train_eval/train_update_time": 2472.8065605463926, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.9816232878798563, "train_eval/perplexity_len_2048": 53.60397832907974, "train_eval/loss_avg_len_1024": 4.040070988443676, "train_eval/perplexity_len_1024": 56.83037696229216, "train_eval/loss_avg_len_512": 4.127506753519919, "train_eval/perplexity_len_512": 62.023091144379585} +{"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5434.649355244997, 
"train_eval/train_update_time": 2648.9550557791954, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.957951218417593, "train_eval/perplexity_len_2048": 52.34996236565334, "train_eval/loss_avg_len_1024": 4.017085574939046, "train_eval/perplexity_len_1024": 55.53900545747886, "train_eval/loss_avg_len_512": 4.105732457925805, "train_eval/perplexity_len_512": 60.6871790777178} +{"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5760.269799852977, "train_eval/train_update_time": 2825.100109060295, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.9413603520746525, "train_eval/perplexity_len_2048": 51.48859629960878, "train_eval/loss_avg_len_1024": 4.002746569575137, "train_eval/perplexity_len_1024": 54.74831377406664, "train_eval/loss_avg_len_512": 4.091508979080463, "train_eval/perplexity_len_512": 59.83010599961991} +{"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6159.759159684996, "train_eval/train_update_time": 3001.234070145467, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.922204242174694, "train_eval/perplexity_len_2048": 50.51166209398698, "train_eval/loss_avg_len_1024": 3.977446037822138, "train_eval/perplexity_len_1024": 53.3805281360993, "train_eval/loss_avg_len_512": 4.066942860077361, "train_eval/perplexity_len_512": 58.37821912491155} +{"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6484.7479819079745, "train_eval/train_update_time": 3177.374425999529, "train_eval/window_seq_count": 51200, 
"train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.919837444722207, "train_eval/perplexity_len_2048": 50.39225258562379, "train_eval/loss_avg_len_1024": 3.9784435200289954, "train_eval/perplexity_len_1024": 53.43380082797178, "train_eval/loss_avg_len_512": 4.068381745212573, "train_eval/perplexity_len_512": 58.462279138479424} +{"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6886.654366842995, "train_eval/train_update_time": 3353.5464223485906, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.9090868059578225, "train_eval/perplexity_len_2048": 49.853405347373815, "train_eval/loss_avg_len_1024": 3.970852813320653, "train_eval/perplexity_len_1024": 53.02973602641771, "train_eval/loss_avg_len_512": 4.06019487478894, "train_eval/perplexity_len_512": 57.98560991148779} diff --git a/metrics/jsonlines/val.jsonl b/metrics/jsonlines/val.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..19f25b85c318ae48f48bb703ffd954d31054b8de --- /dev/null +++ b/metrics/jsonlines/val.jsonl @@ -0,0 +1,49 @@ +{"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 77.36710998200579, "val/train_update_time": 76.98762350907782, "val/loss": 7.967406083543483, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 75.46969969500788, "val/val_tokens_per_second": 542734.3710857432, "val/loss_avg_len_2048": 7.967406083543483, "val/perplexity_len_2048": 2885.3632576421255, "val/loss_avg_len_1024": 7.965887772902428, "val/perplexity_len_1024": 2880.9857039888375, "val/loss_avg_len_512": 7.965943124503736, "val/perplexity_len_512": 2881.1451755743637} +{"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, 
"val/train_total_time": 223.6157359869685, "val/train_update_time": 147.44638093106914, "val/loss": 7.078002173094731, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.52392348198919, "val/val_tokens_per_second": 549622.1627394476, "val/loss_avg_len_2048": 7.078002173094731, "val/perplexity_len_2048": 1185.5975322388967, "val/loss_avg_len_1024": 7.076746983557055, "val/perplexity_len_1024": 1184.1103161848112, "val/loss_avg_len_512": 7.077828987323959, "val/perplexity_len_512": 1185.3922213954218} +{"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 368.9031989739742, "val/train_update_time": 217.90960857702885, "val/loss": 6.593350867911941, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.5729669869761, "val/val_tokens_per_second": 549260.6993517304, "val/loss_avg_len_2048": 6.593350867911941, "val/perplexity_len_2048": 730.2236579069736, "val/loss_avg_len_1024": 6.592371494894009, "val/perplexity_len_1024": 729.5088466499291, "val/loss_avg_len_512": 6.59457370796809, "val/perplexity_len_512": 731.1171508339412} +{"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 514.2439901359612, "val/train_update_time": 288.3617576470133, "val/loss": 6.2121616636668335, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.46951410500333, "val/val_tokens_per_second": 550023.7310833756, "val/loss_avg_len_2048": 6.2121616636668335, "val/perplexity_len_2048": 498.77827766365175, "val/loss_avg_len_1024": 6.212113584863767, "val/perplexity_len_1024": 498.7542975775375, "val/loss_avg_len_512": 6.21575433148481, "val/perplexity_len_512": 500.5734451194017} +{"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 659.4609651200008, 
"val/train_update_time": 358.814332414011, "val/loss": 5.9063440953444, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.66056790604489, "val/val_tokens_per_second": 548616.2394524683, "val/loss_avg_len_2048": 5.9063440953444, "val/perplexity_len_2048": 367.3606618559352, "val/loss_avg_len_1024": 5.907849762643875, "val/perplexity_len_1024": 367.9142014102337, "val/loss_avg_len_512": 5.914086231814698, "val/perplexity_len_512": 370.215856625777} +{"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 805.5059107720153, "val/train_update_time": 429.2700012290734, "val/loss": 5.654363232167089, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 75.21082512801513, "val/val_tokens_per_second": 544602.4549030361, "val/loss_avg_len_2048": 5.654363232167089, "val/perplexity_len_2048": 285.534605573886, "val/loss_avg_len_1024": 5.657451409234992, "val/perplexity_len_1024": 286.41774994614394, "val/loss_avg_len_512": 5.666161814958416, "val/perplexity_len_512": 288.92346179539464} +{"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 951.4780202080146, "val/train_update_time": 499.73381894716294, "val/loss": 5.471082407338126, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.61947528098244, "val/val_tokens_per_second": 548918.3600630208, "val/loss_avg_len_2048": 5.471082407338126, "val/perplexity_len_2048": 237.71736057137844, "val/loss_avg_len_1024": 5.47550174622531, "val/perplexity_len_1024": 238.7702389466763, "val/loss_avg_len_512": 5.486109551545978, "val/perplexity_len_512": 241.31654865962557} +{"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 1096.8764924500138, "val/train_update_time": 570.1910730601521, "val/loss": 
5.306349960088963, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.72595568199176, "val/val_tokens_per_second": 548136.1814134812, "val/loss_avg_len_2048": 5.306349960088963, "val/perplexity_len_2048": 201.6129882749498, "val/loss_avg_len_1024": 5.312019983144896, "val/perplexity_len_1024": 202.75938554501306, "val/loss_avg_len_512": 5.324348847681005, "val/perplexity_len_512": 205.27465187158018} +{"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 1242.364284639014, "val/train_update_time": 640.6497024221462, "val/loss": 5.184559677005536, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 75.06923581298906, "val/val_tokens_per_second": 545629.6385118228, "val/loss_avg_len_2048": 5.184559677005536, "val/perplexity_len_2048": 178.49483710243962, "val/loss_avg_len_1024": 5.191356708838465, "val/perplexity_len_1024": 179.712204748735, "val/loss_avg_len_512": 5.2056767437635925, "val/perplexity_len_512": 182.30420426547653} +{"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 1388.2071847780026, "val/train_update_time": 711.1109357241658, "val/loss": 5.06955635641932, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.47714675101452, "val/val_tokens_per_second": 549967.362967514, "val/loss_avg_len_2048": 5.06955635641932, "val/perplexity_len_2048": 159.10372633676187, "val/loss_avg_len_1024": 5.077621888072462, "val/perplexity_len_1024": 160.39217149050972, "val/loss_avg_len_512": 5.09403516663406, "val/perplexity_len_512": 163.04645604872834} +{"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 1534.1124866969767, "val/train_update_time": 781.5830888972268, "val/loss": 4.975544349937421, "val/val_token_count": 40960000, 
"val/val_seq_count": 20000, "val/val_time": 75.62435562204337, "val/val_tokens_per_second": 541624.4497303297, "val/loss_avg_len_2048": 4.975544349937421, "val/perplexity_len_2048": 144.82764061174217, "val/loss_avg_len_1024": 4.984868058896298, "val/perplexity_len_1024": 146.18428603763627, "val/loss_avg_len_512": 5.0030605386967775, "val/perplexity_len_512": 148.8680791141717} +{"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 1680.500316324993, "val/train_update_time": 852.0965187721886, "val/loss": 4.895440164133744, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 77.0330984640168, "val/val_tokens_per_second": 531719.4922275257, "val/loss_avg_len_2048": 4.895440164133744, "val/perplexity_len_2048": 133.67883429497678, "val/loss_avg_len_1024": 4.9060379521952475, "val/perplexity_len_1024": 135.10306778148092, "val/loss_avg_len_512": 4.926302171780263, "val/perplexity_len_512": 137.86875360523445} +{"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 1828.322631730989, "val/train_update_time": 922.6015984143014, "val/loss": 4.819560142311337, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 76.82710023201071, "val/val_tokens_per_second": 533145.2036625696, "val/loss_avg_len_2048": 4.819560142311337, "val/perplexity_len_2048": 123.91057577182025, "val/loss_avg_len_1024": 4.8313868719966155, "val/perplexity_len_1024": 125.38473270170651, "val/loss_avg_len_512": 4.853890417815373, "val/perplexity_len_512": 128.23832128707267} +{"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 1975.955018882989, "val/train_update_time": 993.1225579883321, "val/loss": 4.752174282312161, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 
76.37709301302675, "val/val_tokens_per_second": 536286.4490406035, "val/loss_avg_len_2048": 4.752174282312161, "val/perplexity_len_2048": 115.8358708032794, "val/loss_avg_len_1024": 4.765576768405783, "val/perplexity_len_1024": 117.39880968881805, "val/loss_avg_len_512": 4.790581661829166, "val/perplexity_len_512": 120.37136373211975} +{"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 2123.1104451339925, "val/train_update_time": 1063.6117942532874, "val/loss": 4.689406985081849, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.71957862901036, "val/val_tokens_per_second": 548182.9629068199, "val/loss_avg_len_2048": 4.689406985081849, "val/perplexity_len_2048": 108.78864738413802, "val/loss_avg_len_1024": 4.704826454977226, "val/perplexity_len_1024": 110.47911018398477, "val/loss_avg_len_512": 4.733361966823415, "val/perplexity_len_512": 113.67709926837894} +{"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 2269.22137821099, "val/train_update_time": 1134.096047840314, "val/loss": 4.626646096304082, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.62473024002975, "val/val_tokens_per_second": 548879.7060740124, "val/loss_avg_len_2048": 4.626646096304082, "val/perplexity_len_2048": 102.17081774137807, "val/loss_avg_len_1024": 4.644601592122112, "val/perplexity_len_1024": 104.02191437851869, "val/loss_avg_len_512": 4.677183249155526, "val/perplexity_len_512": 107.4669382558201} +{"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 2414.640490865975, "val/train_update_time": 1204.5815564935328, "val/loss": 4.56134519904966, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 75.26538718299707, "val/val_tokens_per_second": 
544207.6568398112, "val/loss_avg_len_2048": 4.56134519904966, "val/perplexity_len_2048": 95.71214515713238, "val/loss_avg_len_1024": 4.582940916776005, "val/perplexity_len_1024": 97.80159803916156, "val/loss_avg_len_512": 4.621313081837446, "val/perplexity_len_512": 101.62738963524231} +{"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 2560.696846612962, "val/train_update_time": 1275.0923811426037, "val/loss": 4.491432455242611, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.28613866801606, "val/val_tokens_per_second": 551381.4654312534, "val/loss_avg_len_2048": 4.491432455242611, "val/perplexity_len_2048": 89.2491998401074, "val/loss_avg_len_1024": 4.517822659327136, "val/perplexity_len_1024": 91.63585812522578, "val/loss_avg_len_512": 4.562794736436661, "val/perplexity_len_512": 95.8509840917308} +{"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 2705.7711879389826, "val/train_update_time": 1345.540644572582, "val/loss": 4.430973563183867, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.47222723404411, "val/val_tokens_per_second": 550003.692937434, "val/loss_avg_len_2048": 4.430973563183867, "val/perplexity_len_2048": 84.01316923681291, "val/loss_avg_len_1024": 4.462146958403895, "val/perplexity_len_1024": 86.67339365261898, "val/loss_avg_len_512": 4.513480652339757, "val/perplexity_len_512": 91.23883714663683} +{"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 2851.024827848014, "val/train_update_time": 1415.9805577184306, "val/loss": 4.366317664692108, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.15898946800735, "val/val_tokens_per_second": 552326.8358136191, "val/loss_avg_len_2048": 4.366317664692108, 
"val/perplexity_len_2048": 78.7531017862609, "val/loss_avg_len_1024": 4.40270962580326, "val/perplexity_len_1024": 81.67186931972208, "val/loss_avg_len_512": 4.461183923999499, "val/perplexity_len_512": 86.58996437166769} +{"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 2996.6310340749915, "val/train_update_time": 1486.4294549234328, "val/loss": 4.30756352697448, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 75.16067498200573, "val/val_tokens_per_second": 544965.834990256, "val/loss_avg_len_2048": 4.30756352697448, "val/perplexity_len_2048": 74.25933747198452, "val/loss_avg_len_1024": 4.348377340535215, "val/perplexity_len_1024": 77.35284371024551, "val/loss_avg_len_512": 4.4122849026547755, "val/perplexity_len_512": 82.45765613881858} +{"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 3142.596222635999, "val/train_update_time": 1556.8869810854085, "val/loss": 4.260864559826906, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.31724469002802, "val/val_tokens_per_second": 551150.6807180657, "val/loss_avg_len_2048": 4.260864559826906, "val/perplexity_len_2048": 70.87122939288916, "val/loss_avg_len_1024": 4.305188920119405, "val/perplexity_len_1024": 74.0832099397546, "val/loss_avg_len_512": 4.373845319927856, "val/perplexity_len_512": 79.34816489023441} +{"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 3287.7011176730157, "val/train_update_time": 1627.3387761343038, "val/loss": 4.223266492512053, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.42460210697027, "val/val_tokens_per_second": 550355.6463913412, "val/loss_avg_len_2048": 4.223266492512053, "val/perplexity_len_2048": 68.25607850967396, 
"val/loss_avg_len_1024": 4.2713793615476225, "val/perplexity_len_1024": 71.62035788441096, "val/loss_avg_len_512": 4.34352318198774, "val/perplexity_len_512": 76.97827059691846} +{"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 3432.890017031983, "val/train_update_time": 1697.7829640183481, "val/loss": 4.18504595848294, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.3746191120008, "val/val_tokens_per_second": 550725.5094418473, "val/loss_avg_len_2048": 4.18504595848294, "val/perplexity_len_2048": 65.69652015975821, "val/loss_avg_len_1024": 4.235277974668006, "val/perplexity_len_1024": 69.08087879917835, "val/loss_avg_len_512": 4.3104049167047265, "val/perplexity_len_512": 74.4706372411888} +{"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 3578.0405803910107, "val/train_update_time": 1768.2423270724248, "val/loss": 4.149188061488955, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.34305373596726, "val/val_tokens_per_second": 550959.3424218395, "val/loss_avg_len_2048": 4.149188061488955, "val/perplexity_len_2048": 63.38251669396044, "val/loss_avg_len_1024": 4.201489808351174, "val/perplexity_len_1024": 66.78575493670466, "val/loss_avg_len_512": 4.279306911751535, "val/perplexity_len_512": 72.19038835440217} +{"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 3723.92296086601, "val/train_update_time": 1838.693349173409, "val/loss": 4.128686446135375, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.46465942100622, "val/val_tokens_per_second": 550059.5895889552, "val/loss_avg_len_2048": 4.128686446135375, "val/perplexity_len_2048": 62.096302501948784, "val/loss_avg_len_1024": 4.183405997985183, 
"val/perplexity_len_1024": 65.58886875800643, "val/loss_avg_len_512": 4.264364499987476, "val/perplexity_len_512": 71.11970903315873} +{"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 3869.1690179569996, "val/train_update_time": 1909.1470028955955, "val/loss": 4.098170458009047, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.46465041401098, "val/val_tokens_per_second": 550059.6561223246, "val/loss_avg_len_2048": 4.098170458009047, "val/perplexity_len_2048": 60.229993432124694, "val/loss_avg_len_1024": 4.1536520937833465, "val/perplexity_len_1024": 63.66609076637863, "val/loss_avg_len_512": 4.235404257816635, "val/perplexity_len_512": 69.08960310091771} +{"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 4014.40222099697, "val/train_update_time": 1979.6119488844415, "val/loss": 4.072737031718065, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.7946821290534, "val/val_tokens_per_second": 547632.5165648296, "val/loss_avg_len_2048": 4.072737031718065, "val/perplexity_len_2048": 58.717454391200924, "val/loss_avg_len_1024": 4.1293711922524965, "val/perplexity_len_1024": 62.138837265069576, "val/loss_avg_len_512": 4.2120003426606765, "val/perplexity_len_512": 67.4914108193186} +{"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 4159.974407147965, "val/train_update_time": 2050.0642815562896, "val/loss": 4.053922182414727, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.71466798300389, "val/val_tokens_per_second": 548218.9924114712, "val/loss_avg_len_2048": 4.053922182414727, "val/perplexity_len_2048": 57.623022407201915, "val/loss_avg_len_1024": 4.111342828695057, "val/perplexity_len_1024": 61.02861354467073, 
"val/loss_avg_len_512": 4.195094090398866, "val/perplexity_len_512": 66.35997512008933} +{"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 4305.46517852298, "val/train_update_time": 2120.5240447262186, "val/loss": 4.036319210487535, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.65241524501471, "val/val_tokens_per_second": 548676.1528822111, "val/loss_avg_len_2048": 4.036319210487535, "val/perplexity_len_2048": 56.61756147462313, "val/loss_avg_len_1024": 4.0947161903618845, "val/perplexity_len_1024": 60.022301832124555, "val/loss_avg_len_512": 4.179663519956824, "val/perplexity_len_512": 65.34386260880561} +{"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 4451.645092452003, "val/train_update_time": 2190.979344193125, "val/loss": 4.018702121205884, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.97793793701567, "val/val_tokens_per_second": 546294.0316444547, "val/loss_avg_len_2048": 4.018702121205884, "val/perplexity_len_2048": 55.62885943640474, "val/loss_avg_len_1024": 4.077695212568948, "val/perplexity_len_1024": 59.00930908545214, "val/loss_avg_len_512": 4.163278880037181, "val/perplexity_len_512": 64.28195023129467} +{"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 4597.405511869991, "val/train_update_time": 2261.444451206189, "val/loss": 4.006759247975774, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 75.16958679695381, "val/val_tokens_per_second": 544901.2259524602, "val/loss_avg_len_2048": 4.006759247975774, "val/perplexity_len_2048": 54.96844250775819, "val/loss_avg_len_1024": 4.067678513650177, "val/perplexity_len_1024": 58.421181071002465, "val/loss_avg_len_512": 4.1546929989802655, 
"val/perplexity_len_512": 63.73239563367866} +{"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 4743.360288269003, "val/train_update_time": 2331.894208611455, "val/loss": 3.992493032265781, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.69443514704471, "val/val_tokens_per_second": 548367.4910904067, "val/loss_avg_len_2048": 3.992493032265781, "val/perplexity_len_2048": 54.189818067555, "val/loss_avg_len_1024": 4.053030727527477, "val/perplexity_len_1024": 57.57167697172892, "val/loss_avg_len_512": 4.140326388037205, "val/perplexity_len_512": 62.823322884355} +{"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 4888.828857862973, "val/train_update_time": 2402.3454629034386, "val/loss": 3.9821310220196846, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.63417414203286, "val/val_tokens_per_second": 548810.2530893007, "val/loss_avg_len_2048": 3.9821310220196846, "val/perplexity_len_2048": 53.631201809468386, "val/loss_avg_len_1024": 4.042898476167862, "val/perplexity_len_1024": 56.99129154030574, "val/loss_avg_len_512": 4.13046144102756, "val/perplexity_len_512": 62.20662100013452} +{"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 5034.233835026971, "val/train_update_time": 2472.8065605463926, "val/loss": 3.9720667837841903, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.0003811760107, "val/val_tokens_per_second": 553510.6623650518, "val/loss_avg_len_2048": 3.9720667837841903, "val/perplexity_len_2048": 53.09415165106492, "val/loss_avg_len_1024": 4.034272228109325, "val/perplexity_len_1024": 56.501784860758214, "val/loss_avg_len_512": 4.122892943188642, "val/perplexity_len_512": 61.73758750197274} +{"step": 
1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 5179.741817264992, "val/train_update_time": 2543.2725221014116, "val/loss": 3.9630798968119314, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.03690093499608, "val/val_tokens_per_second": 553237.6353240745, "val/loss_avg_len_2048": 3.9630798968119314, "val/perplexity_len_2048": 52.619138154590686, "val/loss_avg_len_1024": 4.025157931512734, "val/perplexity_len_1024": 55.98915053467066, "val/loss_avg_len_512": 4.1138680682414215, "val/perplexity_len_512": 61.18292016164027} +{"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 5324.524823835993, "val/train_update_time": 2613.7285769192385, "val/loss": 3.9559251724027797, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.74004035303369, "val/val_tokens_per_second": 548032.8858069374, "val/loss_avg_len_2048": 3.9559251724027797, "val/perplexity_len_2048": 52.24400630519839, "val/loss_avg_len_1024": 4.017918658130476, "val/perplexity_len_1024": 55.58529334754871, "val/loss_avg_len_512": 4.106723456612043, "val/perplexity_len_512": 60.74734980208532} +{"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 5470.030363440979, "val/train_update_time": 2684.182179984171, "val/loss": 3.94825802866444, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.59196932800114, "val/val_tokens_per_second": 549120.7749173072, "val/loss_avg_len_2048": 3.94825802866444, "val/perplexity_len_2048": 51.84497566677264, "val/loss_avg_len_1024": 4.0108845364942685, "val/perplexity_len_1024": 55.195671562972564, "val/loss_avg_len_512": 4.100306030388736, "val/perplexity_len_512": 60.35875638489279} +{"step": 1635778560, "val/train_token_count": 1635778560, 
"val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 5615.37508792599, "val/train_update_time": 2754.6453251581406, "val/loss": 3.9421704971529783, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.13637772900984, "val/val_tokens_per_second": 552495.2965697999, "val/loss_avg_len_2048": 3.9421704971529783, "val/perplexity_len_2048": 51.53032643393987, "val/loss_avg_len_1024": 4.005375609245478, "val/perplexity_len_1024": 54.89243863483703, "val/loss_avg_len_512": 4.095269862245862, "val/perplexity_len_512": 60.05554369475379} +{"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 5760.269799852977, "val/train_update_time": 2825.100109060295, "val/loss": 3.9371402649512284, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.03845945402281, "val/val_tokens_per_second": 553225.9896011987, "val/loss_avg_len_2048": 3.9371402649512284, "val/perplexity_len_2048": 51.271767776784145, "val/loss_avg_len_1024": 4.000664330457105, "val/perplexity_len_1024": 54.63443329781732, "val/loss_avg_len_512": 4.090735746535101, "val/perplexity_len_512": 59.78386129572768} +{"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 5905.784063692961, "val/train_update_time": 2895.5508844144642, "val/loss": 3.932787389914203, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 73.87577835295815, "val/val_tokens_per_second": 554444.2429331084, "val/loss_avg_len_2048": 3.932787389914203, "val/perplexity_len_2048": 51.04907321115793, "val/loss_avg_len_1024": 3.9963908482754142, "val/perplexity_len_1024": 54.40145219547801, "val/loss_avg_len_512": 4.086696723492723, "val/perplexity_len_512": 59.54287989471592} +{"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 
0, "val/train_total_time": 6050.427405752998, "val/train_update_time": 2966.012128848466, "val/loss": 3.9291096620217902, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 73.96407199202804, "val/val_tokens_per_second": 553782.3824033746, "val/loss_avg_len_2048": 3.9291096620217902, "val/perplexity_len_2048": 50.86167342466414, "val/loss_avg_len_1024": 3.992870327184024, "val/perplexity_len_1024": 54.210267468017264, "val/loss_avg_len_512": 4.083310494290107, "val/perplexity_len_512": 59.34159504666062} +{"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 6195.132035595016, "val/train_update_time": 3036.459684428468, "val/loss": 3.9261094075139615, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.03585776797263, "val/val_tokens_per_second": 553245.4304557135, "val/loss_avg_len_2048": 3.9261094075139615, "val/perplexity_len_2048": 50.70930414729804, "val/loss_avg_len_1024": 3.990283828539308, "val/perplexity_len_1024": 54.070233861117224, "val/loss_avg_len_512": 4.080950855814386, "val/perplexity_len_512": 59.20173540976174} +{"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 6339.925039035967, "val/train_update_time": 3106.918331800378, "val/loss": 3.923576743226429, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.07839193200925, "val/val_tokens_per_second": 552927.769241994, "val/loss_avg_len_2048": 3.923576743226429, "val/perplexity_len_2048": 50.58103700101182, "val/loss_avg_len_1024": 3.9875462732635443, "val/perplexity_len_1024": 53.92241602920654, "val/loss_avg_len_512": 4.07817962241387, "val/perplexity_len_512": 59.03790070018595} +{"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 6484.7479819079745, 
"val/train_update_time": 3177.374425999529, "val/loss": 3.9217875044056916, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.16108567302581, "val/val_tokens_per_second": 552311.22398331, "val/loss_avg_len_2048": 3.9217875044056916, "val/perplexity_len_2048": 50.49061636219757, "val/loss_avg_len_1024": 3.9858806951731913, "val/perplexity_len_1024": 53.83267878742668, "val/loss_avg_len_512": 4.076671612372063, "val/perplexity_len_512": 58.94893804822824} +{"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 6630.405630423978, "val/train_update_time": 3247.8434043628513, "val/loss": 3.9204105864164656, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.36959133000346, "val/val_tokens_per_second": 550762.7414307871, "val/loss_avg_len_2048": 3.9204105864164656, "val/perplexity_len_2048": 50.42114276494054, "val/loss_avg_len_1024": 3.9845006187881813, "val/perplexity_len_1024": 53.75843682026665, "val/loss_avg_len_512": 4.075279883826617, "val/perplexity_len_512": 58.866954191292706} +{"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 6775.5438796009985, "val/train_update_time": 3318.3166800447507, "val/loss": 3.9195347058722283, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 75.7266165559995, "val/val_tokens_per_second": 540893.0421407414, "val/loss_avg_len_2048": 3.9195347058722283, "val/perplexity_len_2048": 50.37699920204059, "val/loss_avg_len_1024": 3.983747386692418, "val/perplexity_len_1024": 53.71795948656276, "val/loss_avg_len_512": 4.074615421833098, "val/perplexity_len_512": 58.8278523298474} +{"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 6922.033780722995, "val/train_update_time": 3388.7756955446093, 
"val/loss": 3.919046544265724, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 74.74806076998357, "val/val_tokens_per_second": 547974.0822446625, "val/loss_avg_len_2048": 3.919046544265724, "val/perplexity_len_2048": 50.35241308666628, "val/loss_avg_len_1024": 3.9832005327720195, "val/perplexity_len_1024": 53.68859164051448, "val/loss_avg_len_512": 4.074043546199799, "val/perplexity_len_512": 58.79421973228877} +{"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 7067.550158863014, "val/train_update_time": 3459.239731548645, "val/loss": 3.9187872609187617, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 73.89566232298966, "val/val_tokens_per_second": 554295.0521367333, "val/loss_avg_len_2048": 3.9187872609187617, "val/perplexity_len_2048": 50.33935923686961, "val/loss_avg_len_1024": 3.9829653370987623, "val/perplexity_len_1024": 53.67596580088699, "val/loss_avg_len_512": 4.073823164884653, "val/perplexity_len_512": 58.78126401247289} diff --git a/metrics/jsonlines/val_data_info.jsonl b/metrics/jsonlines/val_data_info.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8d2a41d2d94f5b2005b74e9163cd291dacf51e5d --- /dev/null +++ b/metrics/jsonlines/val_data_info.jsonl @@ -0,0 +1 @@ +{"step": 0, "val_data_info/vocab_size": 50277, "val_data_info/global_tokens_per_batch": 2048, "val_data_info/local_tokens_per_batch": 2048, "val_data_info/batch_len": 2048, "val_data_info/seq_len": 2048, "val_data_info/total_tokens": 2147483648, "val_data_info/global_batch_size": 1, "val_data_info/local_batch_size": 1} diff --git a/metrics/npz/train_eval/step-000000104857600.npz b/metrics/npz/train_eval/step-000000104857600.npz new file mode 100644 index 0000000000000000000000000000000000000000..10ab39949f5d2a399d0c9b39025921acbdf8a495 --- /dev/null +++ b/metrics/npz/train_eval/step-000000104857600.npz @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:911e3d478c0409f505fdef81e34a1f42902fcecf876b4035196c3060b9cb850e +size 20540 diff --git a/metrics/npz/train_eval/step-000000209715200.npz b/metrics/npz/train_eval/step-000000209715200.npz new file mode 100644 index 0000000000000000000000000000000000000000..0dc3f083d0e675b0dcfb55cc300e3d093c75d074 --- /dev/null +++ b/metrics/npz/train_eval/step-000000209715200.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3bb71425e9ac7519e9e304cf86d7cc6309f3793475584fa52f3fac2acecb8bc +size 20540 diff --git a/metrics/npz/train_eval/step-000000314572800.npz b/metrics/npz/train_eval/step-000000314572800.npz new file mode 100644 index 0000000000000000000000000000000000000000..729c5595afc915ed7940ea3dc7522f85b15be89e --- /dev/null +++ b/metrics/npz/train_eval/step-000000314572800.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62210377d439ae775ff2281b43d9a287fb8da111226f2f9f0231f9109cc2695e +size 20540 diff --git a/metrics/npz/train_eval/step-000000419430400.npz b/metrics/npz/train_eval/step-000000419430400.npz new file mode 100644 index 0000000000000000000000000000000000000000..79fb9b2b8eba90397959d9b7cbbd5df565548c68 --- /dev/null +++ b/metrics/npz/train_eval/step-000000419430400.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2e40c47b291edbb5b33ef9bd9cbc73c8634893730a13e2b5b4f2af1b81ec22d +size 20540 diff --git a/metrics/npz/train_eval/step-000000524288000.npz b/metrics/npz/train_eval/step-000000524288000.npz new file mode 100644 index 0000000000000000000000000000000000000000..b6ec7b88d955a6ec2b16581cb4305e14750d7d8e --- /dev/null +++ b/metrics/npz/train_eval/step-000000524288000.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f2140711c755ac69013321e174878cf16ef347e63a4b5dd79cc49acbd70c816 +size 20540 diff --git a/metrics/npz/train_eval/step-000000629145600.npz b/metrics/npz/train_eval/step-000000629145600.npz new 
file mode 100644 index 0000000000000000000000000000000000000000..e4351c87e03b5833b2a574502002dee90fdd63fa --- /dev/null +++ b/metrics/npz/train_eval/step-000000629145600.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65c5ae758470a4e36181de891dee2a7a65ae7be119700bd3f047cb30e3ca9b71 +size 20540 diff --git a/metrics/npz/train_eval/step-000000734003200.npz b/metrics/npz/train_eval/step-000000734003200.npz new file mode 100644 index 0000000000000000000000000000000000000000..b374f7583913081ca481b66bbf4efa45489dfa8b --- /dev/null +++ b/metrics/npz/train_eval/step-000000734003200.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3e76699c131d8e6d0f7cd719cbe230c56e47b4bca234192719ad4649902fae4 +size 20540 diff --git a/metrics/npz/train_eval/step-000000838860800.npz b/metrics/npz/train_eval/step-000000838860800.npz new file mode 100644 index 0000000000000000000000000000000000000000..b74b11424e13d73536a6889d66b884ba86ef38af --- /dev/null +++ b/metrics/npz/train_eval/step-000000838860800.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f47c19fa159caf1295d0d460af85964dcdd4c37670b59d41eb79c3fe0c8b895 +size 20540 diff --git a/metrics/npz/train_eval/step-000000943718400.npz b/metrics/npz/train_eval/step-000000943718400.npz new file mode 100644 index 0000000000000000000000000000000000000000..d42433969612efaf48bac25d1b7379e31a9f2183 --- /dev/null +++ b/metrics/npz/train_eval/step-000000943718400.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df19e0ff0a033edd88053db29234a4b02b591cbdcc2fdf222828241ecd02b8bc +size 20540 diff --git a/metrics/npz/train_eval/step-000001048576000.npz b/metrics/npz/train_eval/step-000001048576000.npz new file mode 100644 index 0000000000000000000000000000000000000000..2b99c2e865065ea73b0dbe50eadfd7282bc72490 --- /dev/null +++ b/metrics/npz/train_eval/step-000001048576000.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a2b7f9ed35e5f9886999d642ee9d6ffa9a0386daa64d1b57e474812aca476c28 +size 20540 diff --git a/metrics/npz/train_eval/step-000001153433600.npz b/metrics/npz/train_eval/step-000001153433600.npz new file mode 100644 index 0000000000000000000000000000000000000000..dd6c2f53dab3a49288be242211622f8b4348155a --- /dev/null +++ b/metrics/npz/train_eval/step-000001153433600.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f75b9ebe37dddd1c3b2912483e32d469564bd5a7c5ebece04ee431d5ff904ef +size 20540 diff --git a/metrics/npz/train_eval/step-000001258291200.npz b/metrics/npz/train_eval/step-000001258291200.npz new file mode 100644 index 0000000000000000000000000000000000000000..62a5370d4f3202797a7b5635a91b9b0fa852ed10 --- /dev/null +++ b/metrics/npz/train_eval/step-000001258291200.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79766f3ac87cfd4ef7f8e74f514833b119d04a1dde9fd8e4c089a71acad171c0 +size 20540 diff --git a/metrics/npz/train_eval/step-000001363148800.npz b/metrics/npz/train_eval/step-000001363148800.npz new file mode 100644 index 0000000000000000000000000000000000000000..f546aa86eec4cbb4b07f7b75a7aa540dec8a300f --- /dev/null +++ b/metrics/npz/train_eval/step-000001363148800.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d09b949f3bfd44c20eaec8a46b355579efcbad3595fa38291ed27150d74477f +size 20540 diff --git a/metrics/npz/train_eval/step-000001468006400.npz b/metrics/npz/train_eval/step-000001468006400.npz new file mode 100644 index 0000000000000000000000000000000000000000..189b3680286f218fe54c98ef990a38ce474f1d6f --- /dev/null +++ b/metrics/npz/train_eval/step-000001468006400.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6f61212376ce5ecdf18ef9493a6c0223bab3aeb36af2e1e5be425b92fde665d +size 20540 diff --git a/metrics/npz/train_eval/step-000001572864000.npz b/metrics/npz/train_eval/step-000001572864000.npz new file mode 100644 index 
0000000000000000000000000000000000000000..b8df3a9e43fe1dc31fef2b0339b1ab535a717f60 --- /dev/null +++ b/metrics/npz/train_eval/step-000001572864000.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbd4ea47c5d63c2ec3c694bcfde5a91d29a384420c814f22586d733d88efbb88 +size 20540 diff --git a/metrics/npz/train_eval/step-000001677721600.npz b/metrics/npz/train_eval/step-000001677721600.npz new file mode 100644 index 0000000000000000000000000000000000000000..27356f140e58fbfef8d782bdab407b7c221c0ddf --- /dev/null +++ b/metrics/npz/train_eval/step-000001677721600.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c15b6d80f4a397b7fed86299a2c141fe55b9330d46e1544307a450d24d70c64 +size 20540 diff --git a/metrics/npz/train_eval/step-000001782579200.npz b/metrics/npz/train_eval/step-000001782579200.npz new file mode 100644 index 0000000000000000000000000000000000000000..8726489310319a7a6ef9f130274b40e2cdd0253e --- /dev/null +++ b/metrics/npz/train_eval/step-000001782579200.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f28f30882304a98842ed8cfd5cf70cbb0b7484169d626eac1e83bc5a2aabbd21 +size 20540 diff --git a/metrics/npz/train_eval/step-000001887436800.npz b/metrics/npz/train_eval/step-000001887436800.npz new file mode 100644 index 0000000000000000000000000000000000000000..b48bcc8e84daee1271e1af9291e2f3a68b2c373b --- /dev/null +++ b/metrics/npz/train_eval/step-000001887436800.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f491f5060e7c7f7db5980014ab0ba7298919c941d3857d68b18e110d513cf50b +size 20540 diff --git a/metrics/npz/train_eval/step-000001992294400.npz b/metrics/npz/train_eval/step-000001992294400.npz new file mode 100644 index 0000000000000000000000000000000000000000..3b904e474b277c30c070edf0d486f510cfe41a21 --- /dev/null +++ b/metrics/npz/train_eval/step-000001992294400.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:90d312ff4607ef14f48c9ef6193f4d24a52e1159a3457ff427bdb1db4d074ded +size 20540 diff --git a/metrics/npz/val/step-000000041943040.npz b/metrics/npz/val/step-000000041943040.npz new file mode 100644 index 0000000000000000000000000000000000000000..b6c34a81e644b711fdcfd8bbfe3c9d285d49fd12 --- /dev/null +++ b/metrics/npz/val/step-000000041943040.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc1cd46907807a9d180ed63a2a0b1be153ae7541d05b2addfa488754b3429715 +size 21142 diff --git a/metrics/npz/val/step-000000083886080.npz b/metrics/npz/val/step-000000083886080.npz new file mode 100644 index 0000000000000000000000000000000000000000..c78b69f7198fd853acd2b99cb4f230da3a1d9534 --- /dev/null +++ b/metrics/npz/val/step-000000083886080.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34bdecf282d585c995b14f247ca216cc51525356b32de99aefab8cd3cbdefada +size 21142 diff --git a/metrics/npz/val/step-000000125829120.npz b/metrics/npz/val/step-000000125829120.npz new file mode 100644 index 0000000000000000000000000000000000000000..411d4e6cfc31af9e10d52226326466049fc3d449 --- /dev/null +++ b/metrics/npz/val/step-000000125829120.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdcc31165de1dd9095081d67df90788c10446e9733e99500adc2f038e75afd0c +size 21142 diff --git a/metrics/npz/val/step-000000167772160.npz b/metrics/npz/val/step-000000167772160.npz new file mode 100644 index 0000000000000000000000000000000000000000..17234ef0ddd8bc76fe89acc43d7f1319bd95be2e --- /dev/null +++ b/metrics/npz/val/step-000000167772160.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7f9a1a668b813972194741279f49eba283735c7f7114746a0fbf305e7b3f665 +size 21142 diff --git a/metrics/npz/val/step-000000209715200.npz b/metrics/npz/val/step-000000209715200.npz new file mode 100644 index 0000000000000000000000000000000000000000..317d97ed5445c7d2aabe576d04b2228450bd1cd8 --- /dev/null +++ 
b/metrics/npz/val/step-000000209715200.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4221bfe9514a238e7ab25befb00741f9ab1f3b7c7728786b23ae0c887cc6fdcd +size 21142 diff --git a/metrics/npz/val/step-000000251658240.npz b/metrics/npz/val/step-000000251658240.npz new file mode 100644 index 0000000000000000000000000000000000000000..7efbb6048bfd1df93383deee67312b9e33620c0f --- /dev/null +++ b/metrics/npz/val/step-000000251658240.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b16a204bf8f72ed80290c00bc18b7309c928005afd4b0939e2e8add9a769478 +size 21142 diff --git a/metrics/npz/val/step-000000293601280.npz b/metrics/npz/val/step-000000293601280.npz new file mode 100644 index 0000000000000000000000000000000000000000..18a7d5707866899e2013146489dcd1b5d178c2e7 --- /dev/null +++ b/metrics/npz/val/step-000000293601280.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dca15431ecd121de534d1c54814818b25b9d5bbd36069275cb4fda297d63845 +size 21142 diff --git a/metrics/npz/val/step-000000335544320.npz b/metrics/npz/val/step-000000335544320.npz new file mode 100644 index 0000000000000000000000000000000000000000..0d547e5168510c8a4067c4ecb9462bf284010464 --- /dev/null +++ b/metrics/npz/val/step-000000335544320.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fedcf5206f7aad59fb15788233efb3b939f34e8689409c2dea23405034b40db +size 21142 diff --git a/metrics/npz/val/step-000000377487360.npz b/metrics/npz/val/step-000000377487360.npz new file mode 100644 index 0000000000000000000000000000000000000000..e14ec6953df9eb767c9eece97f82d5850b42a77d --- /dev/null +++ b/metrics/npz/val/step-000000377487360.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e61c8f7fd678b2236fcc80566e5a0b6753d1c6c4ef82d39e1ee4cb50a074c00 +size 21142 diff --git a/metrics/npz/val/step-000000419430400.npz b/metrics/npz/val/step-000000419430400.npz new file mode 100644 index 
0000000000000000000000000000000000000000..4c8d1b47c5e29ce89a5328f9d8ed54b94758894a --- /dev/null +++ b/metrics/npz/val/step-000000419430400.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fed46bdc8228e8fa17adac6e50ddb7a40184e161982cd9070652268e552af019 +size 21142 diff --git a/metrics/npz/val/step-000000461373440.npz b/metrics/npz/val/step-000000461373440.npz new file mode 100644 index 0000000000000000000000000000000000000000..c6b21c835292048e629d014d9c4b81f8f1d92765 --- /dev/null +++ b/metrics/npz/val/step-000000461373440.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0c8a43f58cb0c6d71675c10a23b859d5efd87d8d0c0f31994ad862f159ca443 +size 21142 diff --git a/metrics/npz/val/step-000000503316480.npz b/metrics/npz/val/step-000000503316480.npz new file mode 100644 index 0000000000000000000000000000000000000000..2665c60528f6841ddcab3fb592ff6531b3c0cba8 --- /dev/null +++ b/metrics/npz/val/step-000000503316480.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f61d6951f9a52fefdecc5642612894b98c63bc8ace01c7ff127dd563dda431cf +size 21142 diff --git a/metrics/npz/val/step-000000545259520.npz b/metrics/npz/val/step-000000545259520.npz new file mode 100644 index 0000000000000000000000000000000000000000..e9f71570b5ab59d138f81ebe539cd4734c883be2 --- /dev/null +++ b/metrics/npz/val/step-000000545259520.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:572b4382fb75355cf678c044b9a4e1ba4cbfda180b6a07fc318523e076716ff0 +size 21142 diff --git a/metrics/npz/val/step-000000587202560.npz b/metrics/npz/val/step-000000587202560.npz new file mode 100644 index 0000000000000000000000000000000000000000..c9388ef3ddf2ad3d49c8345bbc6d07fa4917402c --- /dev/null +++ b/metrics/npz/val/step-000000587202560.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d14f3309882ccf59cbb71a39b4f0d96401f816be34a3e5cefc25571f3864df9 +size 21142 diff --git 
a/metrics/npz/val/step-000000629145600.npz b/metrics/npz/val/step-000000629145600.npz new file mode 100644 index 0000000000000000000000000000000000000000..e378a48d26c04b64f31ea6d808a62a9dd1c6ca6f --- /dev/null +++ b/metrics/npz/val/step-000000629145600.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0393a8d1094fd030b97fc2f4959ad87a46ec686c61a2d8730c60c4993cad7ae +size 21142 diff --git a/metrics/npz/val/step-000000671088640.npz b/metrics/npz/val/step-000000671088640.npz new file mode 100644 index 0000000000000000000000000000000000000000..13bb392309866130532ccb7111349072b084f421 --- /dev/null +++ b/metrics/npz/val/step-000000671088640.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23b80a6bf7cc750d4cfd50af8f32a6a1cde895468bc2ef4aa5f2484aec22602f +size 21142 diff --git a/metrics/npz/val/step-000000713031680.npz b/metrics/npz/val/step-000000713031680.npz new file mode 100644 index 0000000000000000000000000000000000000000..067f51f3c67c0dc87d98c9020ede259030d75052 --- /dev/null +++ b/metrics/npz/val/step-000000713031680.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e019d300d05bb2e872ff2867d6846538e7ae834a32e6a81f708b56014323a63 +size 21142 diff --git a/metrics/npz/val/step-000000754974720.npz b/metrics/npz/val/step-000000754974720.npz new file mode 100644 index 0000000000000000000000000000000000000000..ad8d9c8d5846a40d80678cedb87eb6f846b05d83 --- /dev/null +++ b/metrics/npz/val/step-000000754974720.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7eaaaa565029746ed582ffecc69639aaf6903c02283597284890cdb90bfdfb7 +size 21142 diff --git a/metrics/npz/val/step-000000796917760.npz b/metrics/npz/val/step-000000796917760.npz new file mode 100644 index 0000000000000000000000000000000000000000..1618b789861c61786d9f44d56e69585cbe15bfac --- /dev/null +++ b/metrics/npz/val/step-000000796917760.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:dcb814b3861e19e754977fc40634df63b0b83daf52de6d7511917612609ee7a8 +size 21142 diff --git a/metrics/npz/val/step-000000838860800.npz b/metrics/npz/val/step-000000838860800.npz new file mode 100644 index 0000000000000000000000000000000000000000..028ea6dbe5f498de70b4909bf0569f9e9382daeb --- /dev/null +++ b/metrics/npz/val/step-000000838860800.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77cb747ca74cfed1be758285eb5f6e0f4398b11168efacc4e8188193c77fe21e +size 21142 diff --git a/metrics/npz/val/step-000000880803840.npz b/metrics/npz/val/step-000000880803840.npz new file mode 100644 index 0000000000000000000000000000000000000000..ee5f073a0003f0e9c7984cae1c0cb42510e9991a --- /dev/null +++ b/metrics/npz/val/step-000000880803840.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1725dba34cf781a71b155b65ed08661821d83de246b96a3433a68b9b41803dc1 +size 21142 diff --git a/metrics/npz/val/step-000000922746880.npz b/metrics/npz/val/step-000000922746880.npz new file mode 100644 index 0000000000000000000000000000000000000000..e2e8a443c89fecb5fffeafdb84a0ba21b7099d5f --- /dev/null +++ b/metrics/npz/val/step-000000922746880.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f33e4ff6fc9958218775d5135d2b7ceaaf793bcfcc520db28ab3c96e0bfe6c4 +size 21142 diff --git a/metrics/npz/val/step-000000964689920.npz b/metrics/npz/val/step-000000964689920.npz new file mode 100644 index 0000000000000000000000000000000000000000..c0663fff1d30b76f663b8b45f3a249e354f42f6c --- /dev/null +++ b/metrics/npz/val/step-000000964689920.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90026c6af79a8ed41445056e004e221513b9bd91d608e4c3024251bdb5be6ca7 +size 21142 diff --git a/metrics/npz/val/step-000001006632960.npz b/metrics/npz/val/step-000001006632960.npz new file mode 100644 index 0000000000000000000000000000000000000000..0c64b772ebf1e7224b2c9bf49bfc6b67408c236d --- /dev/null +++ 
b/metrics/npz/val/step-000001006632960.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dc6d09bec5b2f8eabed886211b11d8d9acbbf9c2fda881b2bc1c58cbdd29a28 +size 21142 diff --git a/metrics/npz/val/step-000001048576000.npz b/metrics/npz/val/step-000001048576000.npz new file mode 100644 index 0000000000000000000000000000000000000000..f22e6016e12136de6463a124df60c0912edf8a6a --- /dev/null +++ b/metrics/npz/val/step-000001048576000.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74447d25a0bf722c4bbad704a5a7a7cdaf9232cd92a87b9d8c59f759798c4227 +size 21142 diff --git a/metrics/npz/val/step-000001090519040.npz b/metrics/npz/val/step-000001090519040.npz new file mode 100644 index 0000000000000000000000000000000000000000..d990ff4b558d0f07ab818d70c42f311bfec938fd --- /dev/null +++ b/metrics/npz/val/step-000001090519040.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5480779ae8563253da3adb81737f4dbe78ab9a375f05a163e516a7ca7ed8527c +size 21142 diff --git a/metrics/npz/val/step-000001132462080.npz b/metrics/npz/val/step-000001132462080.npz new file mode 100644 index 0000000000000000000000000000000000000000..50b1811ca4568cb71512426216cdc50228dfd5af --- /dev/null +++ b/metrics/npz/val/step-000001132462080.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21c3d6b94916e3c5eb7dfc11bf1eb837cfebc635f4b784360b758215ad25ee98 +size 21142 diff --git a/metrics/npz/val/step-000001174405120.npz b/metrics/npz/val/step-000001174405120.npz new file mode 100644 index 0000000000000000000000000000000000000000..35f317d0723e0b4f48da3b3fd31913a44914be63 --- /dev/null +++ b/metrics/npz/val/step-000001174405120.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f41f3e06229c66bb93ef5e5993b6f793c6e32bb600bec9b1d9f90f1e4b95a3e0 +size 21142 diff --git a/metrics/npz/val/step-000001216348160.npz b/metrics/npz/val/step-000001216348160.npz new file mode 100644 index 
0000000000000000000000000000000000000000..8a06faa74cb70ec1694eaaee5c8b6c198471e486 --- /dev/null +++ b/metrics/npz/val/step-000001216348160.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4f330a2295fb780847f18599a4f6013811999c92cec728f3ec1cc952a85e0ea +size 21142 diff --git a/metrics/npz/val/step-000001258291200.npz b/metrics/npz/val/step-000001258291200.npz new file mode 100644 index 0000000000000000000000000000000000000000..3e7f2059362de3db96cddb710787c8970684f133 --- /dev/null +++ b/metrics/npz/val/step-000001258291200.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:869687c5d554a83f6fe7c2d9debf02f385da94b59a41d788e87898bbc7c11a79 +size 21142 diff --git a/metrics/npz/val/step-000001300234240.npz b/metrics/npz/val/step-000001300234240.npz new file mode 100644 index 0000000000000000000000000000000000000000..065637ec0b5aa7793b723f5244c439b658fac320 --- /dev/null +++ b/metrics/npz/val/step-000001300234240.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:292bcdca9eaf54bea4a6312ac0bd39fe777c1f4379b75ab98630871673660d21 +size 21142 diff --git a/metrics/npz/val/step-000001342177280.npz b/metrics/npz/val/step-000001342177280.npz new file mode 100644 index 0000000000000000000000000000000000000000..07df729b8e5b954129f1d9ac1e45d0a09b524235 --- /dev/null +++ b/metrics/npz/val/step-000001342177280.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fb36c208a17eece7b338629bb94ff321617301782dd3d7e4b3a0091fcaa80cb +size 21142 diff --git a/metrics/npz/val/step-000001384120320.npz b/metrics/npz/val/step-000001384120320.npz new file mode 100644 index 0000000000000000000000000000000000000000..d5b40e81d1ca5d72dc69f7f6bdd055d4978f7fed --- /dev/null +++ b/metrics/npz/val/step-000001384120320.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:865ff5346b8e091b322a37c9b55ed0d066ae53cff65a73ea6bb75a5a515a967d +size 21142 diff --git 
a/metrics/npz/val/step-000001426063360.npz b/metrics/npz/val/step-000001426063360.npz new file mode 100644 index 0000000000000000000000000000000000000000..217f51647e6d6531c2d18236fbfad8b527eca5c0 --- /dev/null +++ b/metrics/npz/val/step-000001426063360.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a13c918f03ab4bcd7ec382033a26bb50f3f1dd5b447e249e3a29714c2916820 +size 21142 diff --git a/metrics/npz/val/step-000001468006400.npz b/metrics/npz/val/step-000001468006400.npz new file mode 100644 index 0000000000000000000000000000000000000000..ce262279ea172b491efd89a74d3b9f96e2c5a568 --- /dev/null +++ b/metrics/npz/val/step-000001468006400.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00fa29bdbd76952d7b1ff1acb8a907eab0c8686b71a43bfe692df0aa29cc5acf +size 21142 diff --git a/metrics/npz/val/step-000001509949440.npz b/metrics/npz/val/step-000001509949440.npz new file mode 100644 index 0000000000000000000000000000000000000000..3bf7653556359260d10c33a7239ac091bb86a36a --- /dev/null +++ b/metrics/npz/val/step-000001509949440.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:288e4321df8ebf5ff18e444f55bab8483b4c8a938f0be4c339efc23abe94da98 +size 21142 diff --git a/metrics/npz/val/step-000001551892480.npz b/metrics/npz/val/step-000001551892480.npz new file mode 100644 index 0000000000000000000000000000000000000000..e3e3efbc0ffaeead8b183154a546fa2e33bf4be5 --- /dev/null +++ b/metrics/npz/val/step-000001551892480.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8d485ec54064c64d3fabce9b19d8773560fee2574984c44bcb9332e7b8b7c5d +size 21142 diff --git a/metrics/npz/val/step-000001593835520.npz b/metrics/npz/val/step-000001593835520.npz new file mode 100644 index 0000000000000000000000000000000000000000..7bb0d0b8875ec1b36fbf7bcb20ebea6f8727cb92 --- /dev/null +++ b/metrics/npz/val/step-000001593835520.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:bae52ab8067e3854af13648aaeefcff88b3cbf537e7ea9f15a9d8433ac7e4d91 +size 21142 diff --git a/metrics/npz/val/step-000001635778560.npz b/metrics/npz/val/step-000001635778560.npz new file mode 100644 index 0000000000000000000000000000000000000000..71727b28d2a1c706c921104d92edbb40ce5740ea --- /dev/null +++ b/metrics/npz/val/step-000001635778560.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:825a935801102b6f7a99bbacd77f4f3be0fd51f33a8c62003fc06ce128f4f720 +size 21142 diff --git a/metrics/npz/val/step-000001677721600.npz b/metrics/npz/val/step-000001677721600.npz new file mode 100644 index 0000000000000000000000000000000000000000..bf5b2d2a568f3c93d3d1846a9b028021f5d1da9a --- /dev/null +++ b/metrics/npz/val/step-000001677721600.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1049e5a5d0b2720037bd8215578c4b77b55b00ff27465c4f4148828712a41a8d +size 21142 diff --git a/metrics/npz/val/step-000001719664640.npz b/metrics/npz/val/step-000001719664640.npz new file mode 100644 index 0000000000000000000000000000000000000000..d0ec74612011f984fb588bff74b47942730db41f --- /dev/null +++ b/metrics/npz/val/step-000001719664640.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13f641a59dedace403bf1781b2014b9b32242249d57231422ddfdf3b9414e928 +size 21142 diff --git a/metrics/npz/val/step-000001761607680.npz b/metrics/npz/val/step-000001761607680.npz new file mode 100644 index 0000000000000000000000000000000000000000..326207f92b6799eacfbee13eb54d918668ce85cd --- /dev/null +++ b/metrics/npz/val/step-000001761607680.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1046ea35784152e48384d0004b1d9e0df9dcba66822d2ea1eb042e6555508e7c +size 21142 diff --git a/metrics/npz/val/step-000001803550720.npz b/metrics/npz/val/step-000001803550720.npz new file mode 100644 index 0000000000000000000000000000000000000000..c787c5f363403cb516dbab38b74fa914c2defac1 --- /dev/null +++ 
b/metrics/npz/val/step-000001803550720.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c29ba1e2ac01fcff24cb594ba0d98aa93ce504063163214a78a4e6442cb3b8f +size 21142 diff --git a/metrics/npz/val/step-000001845493760.npz b/metrics/npz/val/step-000001845493760.npz new file mode 100644 index 0000000000000000000000000000000000000000..977bc2b186b94681b8922159d8cb4481a5bc978c --- /dev/null +++ b/metrics/npz/val/step-000001845493760.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d389ed97f4ab00283718c8be41e75d1e6f77585d4459a1ac4f3b6b93bb5595d4 +size 21142 diff --git a/metrics/npz/val/step-000001887436800.npz b/metrics/npz/val/step-000001887436800.npz new file mode 100644 index 0000000000000000000000000000000000000000..bda093498c073e7eecfe7cff28e8494917083c7c --- /dev/null +++ b/metrics/npz/val/step-000001887436800.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2cb7a79e67872cdea941a39afcb57764428bc119fdc75edd5b58b17bb7de39 +size 21142 diff --git a/metrics/npz/val/step-000001929379840.npz b/metrics/npz/val/step-000001929379840.npz new file mode 100644 index 0000000000000000000000000000000000000000..0c673d582a5674651131e37a29c2a05d68466c4f --- /dev/null +++ b/metrics/npz/val/step-000001929379840.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:609e3f73984c19fb9945888c2b59ed50e43be2d7666e852ceeffbfea28b4e7fc +size 21142 diff --git a/metrics/npz/val/step-000001971322880.npz b/metrics/npz/val/step-000001971322880.npz new file mode 100644 index 0000000000000000000000000000000000000000..f07e00ec78dda71df638d9abdf78def8d1a1ea7a --- /dev/null +++ b/metrics/npz/val/step-000001971322880.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39e35c7bb0dee8ef7ca770cfe1aa3e0a2060e0ca39414f36188db07fb1cfdb82 +size 21142 diff --git a/metrics/npz/val/step-000002013265920.npz b/metrics/npz/val/step-000002013265920.npz new file mode 100644 index 
0000000000000000000000000000000000000000..7f3e51fb7126406bc7db296b46e27b042f8990ad --- /dev/null +++ b/metrics/npz/val/step-000002013265920.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8a426b9951add5db8360445b7bd97903bfaef045ef5d16d815430ad78f38376 +size 21142 diff --git a/metrics/npz/val/step-000002055208960.npz b/metrics/npz/val/step-000002055208960.npz new file mode 100644 index 0000000000000000000000000000000000000000..73d74fc9dbadc1bedcdd6531945d7a00cb86c9dc --- /dev/null +++ b/metrics/npz/val/step-000002055208960.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc095347e294df62882d613a14026315f7302d9cf4b5629e5852ce21e4bb3601 +size 21142 diff --git a/metrics/wandb/wandb_run_id.txt b/metrics/wandb/wandb_run_id.txt new file mode 100644 index 0000000000000000000000000000000000000000..507538c7386229e2fe3ff620053e6d085a03a0bb --- /dev/null +++ b/metrics/wandb/wandb_run_id.txt @@ -0,0 +1 @@ +0m6wmz8p \ No newline at end of file diff --git a/model.txt b/model.txt new file mode 100644 index 0000000000000000000000000000000000000000..819abad834c1ccee276173b5e865ba0eac740ec0 --- /dev/null +++ b/model.txt @@ -0,0 +1,25 @@ +ForgettingTransformerForCausalLM( + (model): ForgettingTransformerModel( + (embeddings): Embedding(50277, 256) + (layers): ModuleList( + (0-1): 2 x ForgettingTransformerBlock( + (attn_norm): RMSNorm(256, eps=1e-06) + (attn): ForgettingAttentionLayer( + (q_proj): Linear(in_features=256, out_features=256, bias=False) + (k_proj): Linear(in_features=256, out_features=256, bias=False) + (v_proj): Linear(in_features=256, out_features=256, bias=False) + (o_proj): Linear(in_features=256, out_features=256, bias=False) + (fgate_proj): Linear(in_features=256, out_features=4, bias=True) + ) + (mlp_norm): RMSNorm(256, eps=1e-06) + (mlp): ForgettingTransformerMLP( + (gate_proj): Linear(in_features=256, out_features=1536, bias=False) + (down_proj): Linear(in_features=768, out_features=256, bias=False) + 
(act_fn): SiLU() + ) + ) + ) + (norm): RMSNorm(256, eps=1e-06) + ) + (lm_head): Linear(in_features=256, out_features=50277, bias=False) +) diff --git a/modeling_transformer.py b/modeling_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..b70f526e9824d2cf247dcac4ce13cf288351bddb --- /dev/null +++ b/modeling_transformer.py @@ -0,0 +1,573 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import functional as F +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_outputs import (BaseModelOutputWithPast, + CausalLMOutputWithPast) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import logging + +# from fla.layers.attn import Attention +from fla.modules import FusedCrossEntropyLoss, RMSNorm +from fla.modules.activations import swiglu_linear + +from fla.modules import RotaryEmbedding +try: + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import (index_first_axis, pad_input, + unpad_input) +except ImportError: + warnings.warn("Flash Attention is not installed. 
Please install it via `pip install flash-attn --no-build-isolation`") + flash_attn_func = None +from einops import rearrange + +from forgetting_transformer.model.transformer.configuration_transformer import TransformerConfig + +from functools import partial + +logger = logging.get_logger(__name__) + +class Attention(nn.Module): + + def __init__( + self, + hidden_size: int = 2048, + num_heads: int = 32, + num_kv_heads: Optional[int] = None, + window_size: Optional[int] = None, + max_position_embeddings: Optional[int] = None, + rope_base: float = 500000.0, + use_rope: bool = True, + layer_idx: int = None, + ): + super().__init__() + + self.num_heads = num_heads + if num_kv_heads is None: + self.num_kv_heads = self.num_heads + else: + self.num_kv_heads = num_kv_heads + self.num_kv_groups = num_heads // self.num_kv_heads + self.hidden_size = hidden_size + self.head_dim = self.hidden_size // self.num_heads + self.kv_dim = self.num_kv_heads * self.head_dim + self.kv_dim = self.num_kv_heads * self.head_dim + self.window_size = window_size + self.max_position_embeddings = max_position_embeddings + self.layer_idx = layer_idx + + self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False) + self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False) + + if use_rope: + self.rotary = RotaryEmbedding(self.head_dim, base=rope_base) + else: + self.rotary = None + + + self.apply(self._initialize_weights) + + def _initialize_weights(self, module: nn.Module): + pass + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + batch_size, q_len, _ = hidden_states.size() + q = 
rearrange(self.q_proj(hidden_states), '... (h d) -> ... h d', h=self.num_heads) + k = rearrange(self.k_proj(hidden_states), '... (h d) -> ... h d', h=self.num_kv_heads) + v = rearrange(self.v_proj(hidden_states), 'b t (h d) -> b h t d', h=self.num_kv_heads) + + seqlen_offset, max_seqlen = 0, q.shape[1] + if past_key_values is not None: + seqlen_offset = past_key_values.get_seq_length(self.layer_idx) + max_seqlen = q.shape[1] + seqlen_offset + + if attention_mask is not None: + # to deliminate the offsets of padding tokens + seqlen_offset = (seqlen_offset + attention_mask.sum(-1) - attention_mask.shape[-1]) + max_seqlen = q.shape[1] + max(seqlen_offset) + + if self.max_position_embeddings is not None: + max_seqlen = max(max_seqlen, self.max_position_embeddings) + if self.rotary is not None: + q, k = self.rotary(q, k, seqlen_offset, max_seqlen) + + k = rearrange(k, 'b t h d -> b h t d') + if past_key_values is not None: + k, v = past_key_values.update(k, v, self.layer_idx) + k, v = rearrange(k, 'b h t d -> b t h d'), rearrange(v, 'b h t d -> b t h d') + if self.num_kv_groups > 1: + k = rearrange(k.unsqueeze(-2).repeat(1, 1, 1, self.num_kv_groups, 1), 'b t h g d -> b t (h g) d') + v = rearrange(v.unsqueeze(-2).repeat(1, 1, 1, self.num_kv_groups, 1), 'b t h g d -> b t (h g) d') + + if flash_attn_func is None: + raise ImportError("Please install Flash Attention via `pip install flash-attn --no-build-isolation` first") + + # Contains at least one padding token in the sequence + if attention_mask is not None: + q, k, v, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(q, k, v, attention_mask, q_len) + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_q, max_seqlen_k = max_seq_lens + o = flash_attn_varlen_func( + q, k, v, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_q, + max_seqlen_k=max_seqlen_k, + causal=True, + window_size=(-1, -1) if self.window_size is None else (self.window_size-1, 0) + ) + o = pad_input(o, indices_q, 
batch_size, q_len) + else: + o = flash_attn_func( + q, k, v, + causal=True, + window_size=(-1, -1) if self.window_size is None else (self.window_size-1, 0) + ) + o = o.reshape(batch_size, q_len, self.hidden_size) + o = self.o_proj(o) + + if not output_attentions: + attentions = None + + return o, attentions, past_key_values + + def _upad_input(self, q, k, v, attention_mask, q_len): + seqlens = attention_mask.sum(-1, dtype=torch.int32) + indices_k = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_k = seqlens.max().item() + cu_seqlens_k = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0)) + batch_size, seq_len, num_key_value_heads, head_dim = k.shape + + k = index_first_axis(k.reshape(batch_size * seq_len, num_key_value_heads, head_dim), indices_k) + v = index_first_axis(v.reshape(batch_size * seq_len, num_key_value_heads, head_dim), indices_k) + if q_len == seq_len: + q = index_first_axis(q.reshape(batch_size * seq_len, self.num_heads, head_dim), indices_k) + cu_seqlens_q = cu_seqlens_k + max_seqlen_q = max_seqlen_k + indices_q = indices_k + elif q_len == 1: + max_seqlen_q = 1 + # There is a memcpy here, that is very bad. + cu_seqlens_q = torch.arange(batch_size + 1, dtype=torch.int32, device=q.device) + indices_q = cu_seqlens_q[:-1] + q = q.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
+ attention_mask = attention_mask[:, -q_len:] + q, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, attention_mask) + + return q, k, v, indices_q, (cu_seqlens_q, cu_seqlens_k), (max_seqlen_q, max_seqlen_k) + + +class TransformerMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + hidden_ratio: Optional[int] = None, + intermediate_size: Optional[int] = None, + hidden_act: str = 'swish' + ) -> TransformerMLP: + super().__init__() + + self.hidden_size = hidden_size + # the final number of params is `hidden_ratio * hidden_size^2` + # `intermediate_size` is chosen to be a multiple of 256 closest to `2/3 * hidden_size * hidden_ratio` + if hidden_ratio is None: + hidden_ratio = 4 + if intermediate_size is None: + intermediate_size = int(hidden_size * hidden_ratio * 2 / 3) + intermediate_size = 256 * ((intermediate_size + 256 - 1) // 256) + self.hidden_ratio = hidden_ratio + self.intermediate_size = intermediate_size + + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[hidden_act] + + def forward(self, x): + y = self.gate_proj(x) + gate, y = y.chunk(2, -1) + # TODO: maybe wrap swiglu_linear in custom_fwd/custom_bwd + return swiglu_linear( + gate, y, + self.down_proj.weight.to(y.dtype), + self.down_proj.bias.to(y.dtype) if self.down_proj.bias is not None else self.down_proj.bias + ) + + +class TransformerBlock(nn.Module): + def __init__(self, config: TransformerConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.attn_norm = RMSNorm(hidden_size=config.hidden_size, eps=config.norm_eps) + self.attn = Attention( + hidden_size=config.hidden_size, + num_heads=config.num_heads, + num_kv_heads=config.num_kv_heads, + window_size=config.window_size, + max_position_embeddings=config.max_position_embeddings, + rope_base=config.rope_base, + use_rope=config.use_rope, + layer_idx=layer_idx 
+        )
+        self.mlp_norm = RMSNorm(hidden_size=config.hidden_size, eps=config.norm_eps)
+        self.mlp = TransformerMLP(
+            hidden_size=config.hidden_size,
+            hidden_ratio=config.hidden_ratio,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act
+        )
+
+    def forward_attn(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        **kwargs,
+    ):
+        """Run the attention half of the block: pre-norm followed by attention.
+
+        The residual connection is NOT applied here; `forward` keeps the
+        residual and hands it to `forward_mlp`.
+
+        Returns:
+            The `(hidden_states, attentions, past_key_values)` triple produced
+            by `self.attn`. Extra `**kwargs` are accepted and ignored.
+        """
+        # The residual is handled by the caller (see `forward`).
+        hidden_states = self.attn_norm(hidden_states)
+        hidden_states, attentions, past_key_values = self.attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions
+        )
+        return hidden_states, attentions, past_key_values
+
+    def forward_mlp(
+        self,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+    ):
+        """Run the MLP half of the block and add the residual back.
+
+        Args:
+            hidden_states: Output of the attention sub-layer.
+            residual: The block input saved by `forward`.
+
+        Returns:
+            `residual + mlp(normed_hidden_states)`, where both operands come
+            from the `(hidden_states, residual)` pair returned by `mlp_norm`.
+        """
+        # `mlp_norm` receives the residual plus a positional `True` and returns an
+        # updated (hidden_states, residual) pair.
+        # NOTE(review): presumably this is the fused "add residual, then normalize"
+        # path of RMSNorm -- confirm against the RMSNorm implementation.
+        hidden_states, residual = self.mlp_norm(hidden_states, residual, True)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        gradient_checkpointing: bool = False
+        # **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """Apply the full block: attention sub-layer, then MLP sub-layer.
+
+        Args:
+            hidden_states: Block input; also kept as the residual.
+            attention_mask: Forwarded unchanged to the attention layer.
+            past_key_values: Cache object forwarded to the attention layer.
+            output_attentions: If truthy, the attention output is appended to
+                the returned tuple.
+            use_cache: If truthy, the cache is appended to the returned tuple.
+            gradient_checkpointing: If True, both sub-layers are wrapped in
+                `torch.utils.checkpoint.checkpoint(..., use_reentrant=False)`.
+
+        Returns:
+            A tuple `(hidden_states[, attentions][, past_key_values])`; the
+            optional entries appear in that order when requested.
+        """
+        residual = hidden_states
+
+        if gradient_checkpointing:
+            forward_attn = partial(torch.utils.checkpoint.checkpoint, self.forward_attn, use_reentrant=False)
+            forward_mlp = partial(torch.utils.checkpoint.checkpoint, self.forward_mlp, use_reentrant=False)
+        else:
+            forward_attn = self.forward_attn
+            forward_mlp = self.forward_mlp
+
+        hidden_states, attentions, past_key_values = forward_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions
+        )
+
+        hidden_states = forward_mlp(
+            hidden_states,
+            residual,
+        )
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attentions,)
+
+        if use_cache:
+            outputs += (past_key_values,)
+
+        return outputs
+
+
+class TransformerPreTrainedModel(PreTrainedModel):
+    """Shared `PreTrainedModel` base: config class and weight initialization."""
+
+    config_class = TransformerConfig
+    supports_gradient_checkpointing = True
+    _no_split_modules = ['TransformerBlock']
+
+    def __init__(self, *inputs, **kwargs):
+        super().__init__(*inputs, **kwargs)
+
+    def _init_weights(
+        self,
+        module: nn.Module,
+    ):
+        """Initialize one sub-module.
+
+        `nn.Linear` / `nn.Conv1d` weights and `nn.Embedding` weights are drawn
+        from N(0, `config.initializer_range`); biases are zeroed, and the
+        embedding row at `padding_idx` (if any) is zeroed.
+        """
+        if isinstance(module, (nn.Linear, nn.Conv1d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+
+class TransformerModel(TransformerPreTrainedModel):
+    """Token embedding, a stack of `TransformerBlock`s, and a final RMSNorm."""
+
+    def __init__(self, config: TransformerConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList([TransformerBlock(config, layer_idx) for layer_idx in range(config.num_hidden_layers)])
+        self.norm = RMSNorm(config.hidden_size, eps=config.norm_eps)
+
+        self.gradient_checkpointing = False
+
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings = value
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        """Embed the inputs and run them through every layer and the final norm.
+
+        Exactly one of `input_ids` / `inputs_embeds` must be given. Flags left
+        as `None` fall back to the config; `use_cache` defaults to False while
+        training. Attention weights are not supported, so `output_attentions`
+        is always forced to False (with a warning when it was requested).
+
+        Returns:
+            A `BaseModelOutputWithPast` when `return_dict` is true, otherwise a
+            tuple of the non-None values among
+            `(hidden_states, next_cache, all_hidden_states, all_attns)`.
+
+        Raises:
+            ValueError: if both or neither of `input_ids` and `inputs_embeds`
+                are provided.
+        """
+        # Resolve the config default BEFORE the "unsupported" guard, so that a
+        # config-level `output_attentions=True` is also warned about and disabled.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        if output_attentions:
+            warnings.warn(
+                "`TransformerModel` does not support output attention weights now, so `output_attentions` is set to `False`."
+            )
+            output_attentions = False
+        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False)
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is None and inputs_embeds is None:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        # Always bound, so the read at the end never depends on the branch taken here.
+        use_legacy_cache = False
+        if use_cache:
+            use_legacy_cache = not isinstance(past_key_values, Cache)
+            if use_legacy_cache:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embeddings(input_ids)
+
+        # No positional embedding is added here; the embeddings are used as-is.
+        hidden_states = inputs_embeds
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        all_hidden_states = () if output_hidden_states else None
+        all_attns = () if output_attentions else None
+        next_decoder_cache = None
+
+        for layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                gradient_checkpointing=self.gradient_checkpointing and self.training
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                # The cache sits after the optional attentions entry in the layer tuple.
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+
+            if output_attentions:
+                all_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = None
+        if use_cache:
+            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_attns] if v is not None)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_attns
+        )
+
+
+class TransformerForCausalLM(TransformerPreTrainedModel):
+    """`TransformerModel` with a bias-free linear language-modeling head."""
+
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = TransformerModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.embeddings
+
+    def set_input_embeddings(self, value):
+        self.model.embeddings = value
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    def set_decoder(self, decoder):
+        self.model = decoder
+
+    def get_decoder(self):
+        return self.model
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs
+    ):
+        """Build the keyword arguments for one generation step.
+
+        When a cache is passed, only the last token of `input_ids` is kept.
+        `inputs_embeds` is used only when no cache is passed (i.e. on the first
+        step); otherwise `input_ids` is used.
+
+        Returns:
+            A dict with either `inputs_embeds` or `input_ids`, plus
+            `past_key_values`, `use_cache` (read from `kwargs`) and
+            `attention_mask`.
+        """
+        # only last token for `inputs_ids` if the `past_key_values` is passed along.
+        if past_key_values is not None:
+            input_ids = input_ids[:, -1:]
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {'inputs_embeds': inputs_embeds}
+        else:
+            # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise
+            # recompiles graphs as the stride of the inputs is a guard.
+            # Ref: https://github.com/huggingface/transformers/pull/29114
+            # TODO: use `next_tokens` directly instead.
+            model_inputs = {'input_ids': input_ids.contiguous()}
+
+        model_inputs.update({
+            'past_key_values': past_key_values,
+            'use_cache': kwargs.get('use_cache'),
+            'attention_mask': attention_mask,
+        })
+        return model_inputs
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        """Run the backbone and the LM head; optionally compute a per-token loss.
+
+        When `labels` is given, the loss is computed with `reduction='none'`
+        and reshaped to `labels.size()`. Labels are used exactly as passed (no
+        shifting is applied here), and `logits` is returned as `None` so the
+        logits tensor can be freed. Without `labels`, `loss` is `None` and the
+        logits are returned.
+
+        Returns:
+            A `CausalLMOutputWithPast` when `return_dict` is true, otherwise
+            the tuple `([loss,] logits, *backbone_outputs[1:])`.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict
+        )
+
+        hidden_states = outputs[0]
+        logits = self.lm_head(hidden_states)
+
+        loss = None
+        if labels is not None:
+            if self.config.fuse_cross_entropy:
+                loss_fct = FusedCrossEntropyLoss(inplace_backward=True, reduction='none')
+            else:
+                loss_fct = nn.CrossEntropyLoss(reduction='none')
+            # Enable model parallelism
+            labels = labels.to(logits.device)
+            # labels = torch.cat((labels[..., 1:], torch.full_like(labels[:, :1], loss_fct.ignore_index)), 1)
+            loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
+            loss = loss.view(*labels.size())
+            # Drop the reference to the logits once the loss exists; callers get `None`.
+            del logits
+            logits = None
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
diff --git a/no_decay_params.txt b/no_decay_params.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9c4b12696cb99eec005ff99e84da909eed60e0c5
--- /dev/null
+++ b/no_decay_params.txt
@@ -0,0 +1,7 @@
+_forward_module.model.layers.0.attn_norm.weight
+_forward_module.model.layers.0.attn.fgate_proj.bias
+_forward_module.model.layers.0.mlp_norm.weight
+_forward_module.model.layers.1.attn_norm.weight
+_forward_module.model.layers.1.attn.fgate_proj.bias
+_forward_module.model.layers.1.mlp_norm.weight
+_forward_module.model.norm.weight