diff --git a/.hydra/config.yaml b/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..14684e98ad2ab28a7eea3c8d9bf791d96c2d2fe5 --- /dev/null +++ b/.hydra/config.yaml @@ -0,0 +1,102 @@ +model: + _target_: forgetting_transformer.model.forgetting_transformer.modeling_forgetting_transformer.ForgettingTransformerForCausalLM + config: + _target_: forgetting_transformer.model.forgetting_transformer.configuration_forgetting_transformer.ForgettingTransformerConfig + vocab_size: ??? + hidden_size: 512 + hidden_ratio: 4.0 + intermediate_size: null + num_hidden_layers: 6 + num_heads: 8 + num_kv_heads: null + hidden_act: swish + window_size: null + max_position_embeddings: null + initializer_range: 0.02 + elementwise_affine: true + norm_eps: 1.0e-06 + use_cache: true + pad_token_id: null + bos_token_id: null + eos_token_id: null + tie_word_embeddings: false + attention_bias: false + fuse_norm: true + fuse_cross_entropy: true + rope_base: 500000.0 + use_rope: false + use_output_gate: false + ogate_act: sigmoid + fgate_type: full + fgate_bias_init: false + decay_time_min: null + decay_time_max: null + use_output_norm: false + qk_norm: false + qk_norm_share_param_across_head: false + use_k_shift: false + use_v_shift: false +optimizer: + _target_: torch.optim.AdamW + lr: 0.001 + betas: + - 0.9 + - 0.95 + weight_decay: 0.1 +schedule: + _target_: forgetting_transformer.schedule.warmup_cosine_decay_schedule + init_value: 0.0 + peak_value: ${optimizer.lr} + warmup_steps: 20971520 + decay_steps: ${train.max_tokens} + end_value: 0.0 +datamodule: + _target_: forgetting_transformer.datamodule.npy.NpyDataModule + data_path: ${data_dir} + rank: ??? + world_size: ??? 
+ train_batch_len: 2048 + train_batch_size: 1024 + train_num_workers: 0 + eval_tokens: 2147483648 + eval_batch_len: 2048 + eval_local_batch_size: 1 + eval_num_workers: 0 +strategy: + _target_: lightning.fabric.strategies.SingleDeviceStrategy + device: cuda:0 +exp: forgetting_gate_6_8_512 +tag: forgetting_gate_6_8_512 +seed: 42 +hf_load_dir: null +hf_save_dir: null +hf_load_step: null +output_dir: ./forgetting_gate_6_8_512/ +data_dir: /workspace/forgetting-transformer/data +resume: false +fork_dir: null +fork_step: null +log_interval: 20971520 +eval_interval: 41943040 +final_eval: true +skip_eval: false +checkpoint_interval: 209715200 +train_eval_interval: 104857600 +checkpoint_keep_interval: 209715200 +fabric: + devices: 1 + precision: 16-mixed +train: + max_tokens: 2097152000 + grad_acc_tokens: 32768 + max_grad_norm: 1.0 + gradient_checkpointing: true + bias_weight_decay: false + normalization_weight_decay: false + conv_weight_decay: true +eval: + min_val_length: 512 +wandb: + project: forgetting-transformer + mode: online + log_dir: ./output/wandb diff --git a/.hydra/hydra.yaml b/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f9553d8fdc2fdd75d680d382675d47bf5780e5ab --- /dev/null +++ b/.hydra/hydra.yaml @@ -0,0 +1,146 @@ +hydra: + run: + dir: ${output_dir} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + root: null + disable_existing_loggers: false + job_logging: + version: 1 + root: null + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - +experiment/pile/forgetting_transformer=forgetting_gate_6_8_512 + - strategy=single_device + - datamodule=npy + - schedule=warmup_cosine + - optimizer=adamw + - model=forgetting_transformer + - data_dir=/workspace/forgetting-transformer/data + - fabric.devices=1 + - fabric.precision=16-mixed + - seed=42 + - exp=forgetting_gate_6_8_512 + - tag=forgetting_gate_6_8_512 + - output_dir=./forgetting_gate_6_8_512/ + - wandb.log_dir=./output/wandb + - wandb.mode=online + - resume=false + job: + name: train + chdir: null + override_dirname: 
+experiment/pile/forgetting_transformer=forgetting_gate_6_8_512,data_dir=/workspace/forgetting-transformer/data,datamodule=npy,exp=forgetting_gate_6_8_512,fabric.devices=1,fabric.precision=16-mixed,model=forgetting_transformer,optimizer=adamw,output_dir=./forgetting_gate_6_8_512/,resume=false,schedule=warmup_cosine,seed=42,strategy=single_device,tag=forgetting_gate_6_8_512,wandb.log_dir=./output/wandb,wandb.mode=online + id: ??? + num: ??? + config_name: config + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /workspace/forgetting-transformer + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /workspace/forgetting-transformer/configs + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /workspace/forgetting-transformer/forgetting_gate_6_8_512 + choices: + experiment/pile/forgetting_transformer: forgetting_gate_6_8_512 + strategy: single_device + datamodule: npy + schedule: warmup_cosine + optimizer: adamw + model: forgetting_transformer + hydra/env: default + hydra/callbacks: null + hydra/job_logging: none + hydra/hydra_logging: none + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/.hydra/overrides.yaml b/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4b7b19708e053cdc5a1ff4184bdb6c80229afaaf --- /dev/null +++ b/.hydra/overrides.yaml @@ -0,0 +1,16 @@ +- +experiment/pile/forgetting_transformer=forgetting_gate_6_8_512 +- strategy=single_device +- datamodule=npy +- schedule=warmup_cosine +- optimizer=adamw +- model=forgetting_transformer +- data_dir=/workspace/forgetting-transformer/data +- fabric.devices=1 +- fabric.precision=16-mixed +- seed=42 +- exp=forgetting_gate_6_8_512 +- tag=forgetting_gate_6_8_512 +- 
output_dir=./forgetting_gate_6_8_512/ +- wandb.log_dir=./output/wandb +- wandb.mode=online +- resume=false diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7f434981dc7bf9b220ed13f2cf53f70c18da7df0 --- /dev/null +++ b/__init__.py @@ -0,0 +1 @@ +# for HF remote code diff --git a/__pycache__/__init__.cpython-310.pyc b/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fa6fc1efccec774fae6b6dc03279f15a36467852 Binary files /dev/null and b/__pycache__/__init__.cpython-310.pyc differ diff --git a/__pycache__/configuration_transformer.cpython-310.pyc b/__pycache__/configuration_transformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb235c4360610154373c138bbdaa933131675b9e Binary files /dev/null and b/__pycache__/configuration_transformer.cpython-310.pyc differ diff --git a/__pycache__/modeling_transformer.cpython-310.pyc b/__pycache__/modeling_transformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f86d01b5bf6302ce831f5b6f7d092301d79b7fe Binary files /dev/null and b/__pycache__/modeling_transformer.cpython-310.pyc differ diff --git a/checkpoints/step-000000209715200.pt b/checkpoints/step-000000209715200.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4a5b139d814b5741231dadcbd8013383d513fb9 --- /dev/null +++ b/checkpoints/step-000000209715200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:697d70565b110ccd3ada207ed7b21e56e216b17e2fffb9928d903bedfdb564c6 +size 863640826 diff --git a/checkpoints/step-000000209715200.pt.done b/checkpoints/step-000000209715200.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000000209715200.pt.keep b/checkpoints/step-000000209715200.pt.keep new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000000419430400.pt b/checkpoints/step-000000419430400.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfd10a5ab19ef9fd3fd0e03eeeaad16ce2f70121 --- /dev/null +++ b/checkpoints/step-000000419430400.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0340f67aa43135d0ab0d57b9d019c367b6967eb07129fedc32d254a565e59c02 +size 863640826 diff --git a/checkpoints/step-000000419430400.pt.done b/checkpoints/step-000000419430400.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000000419430400.pt.keep b/checkpoints/step-000000419430400.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000000629145600.pt b/checkpoints/step-000000629145600.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2fbb86085641db3db1eb938cd9e6b664e11cfd4 --- /dev/null +++ b/checkpoints/step-000000629145600.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:261e4977576ea9d2abd1543a336220ddba150493734daacaddc8b6552f9d42fc +size 863640826 diff --git a/checkpoints/step-000000629145600.pt.done b/checkpoints/step-000000629145600.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000000629145600.pt.keep b/checkpoints/step-000000629145600.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000000838860800.pt b/checkpoints/step-000000838860800.pt new file mode 100644 index 0000000000000000000000000000000000000000..df2795fab79c8d63d516de01e8d0a1ed282e6e91 --- /dev/null +++ b/checkpoints/step-000000838860800.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:15d72a3b7ca85ef8d4110165a0a139a62c9be846d383bc3ecf9a8f13ee9dcbde +size 863640826 diff --git a/checkpoints/step-000000838860800.pt.done b/checkpoints/step-000000838860800.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000000838860800.pt.keep b/checkpoints/step-000000838860800.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001048576000.pt b/checkpoints/step-000001048576000.pt new file mode 100644 index 0000000000000000000000000000000000000000..709215b4e538819a6fbb5d37e5b7ae6bad0ae2fa --- /dev/null +++ b/checkpoints/step-000001048576000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22830c44c675725d69e055da50e077fc4ee100810317b13d8be6c31716258ef1 +size 863640826 diff --git a/checkpoints/step-000001048576000.pt.done b/checkpoints/step-000001048576000.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001048576000.pt.keep b/checkpoints/step-000001048576000.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001258291200.pt b/checkpoints/step-000001258291200.pt new file mode 100644 index 0000000000000000000000000000000000000000..1abfc940235fceb75990f141abecaa9a9dd4019b --- /dev/null +++ b/checkpoints/step-000001258291200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:182e39f4302da45a9b8a6cd54bebdbcf1e186faf1e0060b4f76a496a5d1f5ed4 +size 863640826 diff --git a/checkpoints/step-000001258291200.pt.done b/checkpoints/step-000001258291200.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/checkpoints/step-000001258291200.pt.keep b/checkpoints/step-000001258291200.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001468006400.pt b/checkpoints/step-000001468006400.pt new file mode 100644 index 0000000000000000000000000000000000000000..d21cf337458df2d8b18b7f6b6e86005c59528936 --- /dev/null +++ b/checkpoints/step-000001468006400.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25f47ea6d55c9d7f7161e5b704fdc34772ffa72fbe5aafbf232a77486f120bd6 +size 863640826 diff --git a/checkpoints/step-000001468006400.pt.done b/checkpoints/step-000001468006400.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001468006400.pt.keep b/checkpoints/step-000001468006400.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001677721600.pt b/checkpoints/step-000001677721600.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6d4e279605e7cd742647b9c25bf6f67ed765a11 --- /dev/null +++ b/checkpoints/step-000001677721600.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c659e61f20c730f11df6bbb3fde2a910c06fb9cc4d539ad608f410e39f60e8e +size 863640826 diff --git a/checkpoints/step-000001677721600.pt.done b/checkpoints/step-000001677721600.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001677721600.pt.keep b/checkpoints/step-000001677721600.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001887436800.pt b/checkpoints/step-000001887436800.pt new file mode 100644 index 
0000000000000000000000000000000000000000..9d6a38f71595a7f65a3cdc6d65728b396e651943 --- /dev/null +++ b/checkpoints/step-000001887436800.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72801b56885f8ac587582bc288ab8ad35f5268c1b0ec2224bdff1b679f2e63a6 +size 863640826 diff --git a/checkpoints/step-000001887436800.pt.done b/checkpoints/step-000001887436800.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001887436800.pt.keep b/checkpoints/step-000001887436800.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..65df34565b3d5a845bf43617c0a5cd9ef9368016 --- /dev/null +++ b/config.yaml @@ -0,0 +1,102 @@ +model: + _target_: forgetting_transformer.model.forgetting_transformer.modeling_forgetting_transformer.ForgettingTransformerForCausalLM + config: + _target_: forgetting_transformer.model.forgetting_transformer.configuration_forgetting_transformer.ForgettingTransformerConfig + vocab_size: ??? 
+ hidden_size: 512 + hidden_ratio: 4.0 + intermediate_size: null + num_hidden_layers: 6 + num_heads: 8 + num_kv_heads: null + hidden_act: swish + window_size: null + max_position_embeddings: null + initializer_range: 0.02 + elementwise_affine: true + norm_eps: 1.0e-06 + use_cache: true + pad_token_id: null + bos_token_id: null + eos_token_id: null + tie_word_embeddings: false + attention_bias: false + fuse_norm: true + fuse_cross_entropy: true + rope_base: 500000.0 + use_rope: false + use_output_gate: false + ogate_act: sigmoid + fgate_type: full + fgate_bias_init: false + decay_time_min: null + decay_time_max: null + use_output_norm: false + qk_norm: false + qk_norm_share_param_across_head: false + use_k_shift: false + use_v_shift: false +optimizer: + _target_: torch.optim.AdamW + lr: 0.001 + betas: + - 0.9 + - 0.95 + weight_decay: 0.1 +schedule: + _target_: forgetting_transformer.schedule.warmup_cosine_decay_schedule + init_value: 0.0 + peak_value: 0.001 + warmup_steps: 20971520 + decay_steps: 2097152000 + end_value: 0.0 +datamodule: + _target_: forgetting_transformer.datamodule.npy.NpyDataModule + data_path: /workspace/forgetting-transformer/data + rank: ??? + world_size: ??? 
+ train_batch_len: 2048 + train_batch_size: 1024 + train_num_workers: 0 + eval_tokens: 2147483648 + eval_batch_len: 2048 + eval_local_batch_size: 1 + eval_num_workers: 0 +strategy: + _target_: lightning.fabric.strategies.SingleDeviceStrategy + device: cuda:0 +exp: forgetting_gate_6_8_512 +tag: forgetting_gate_6_8_512 +seed: 42 +hf_load_dir: null +hf_save_dir: null +hf_load_step: null +output_dir: /workspace/forgetting-transformer/forgetting_gate_6_8_512 +data_dir: /workspace/forgetting-transformer/data +resume: false +fork_dir: null +fork_step: null +log_interval: 20971520 +eval_interval: 41943040 +final_eval: true +skip_eval: false +checkpoint_interval: 209715200 +train_eval_interval: 104857600 +checkpoint_keep_interval: 209715200 +fabric: + devices: 1 + precision: 16-mixed +train: + max_tokens: 2097152000 + grad_acc_tokens: 32768 + max_grad_norm: 1.0 + gradient_checkpointing: true + bias_weight_decay: false + normalization_weight_decay: false + conv_weight_decay: true +eval: + min_val_length: 512 +wandb: + project: forgetting-transformer + mode: online + log_dir: ./output/wandb diff --git a/configuration_transformer.py b/configuration_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..4b6767a5dcc859f307966491b13c2e44b35d8176 --- /dev/null +++ b/configuration_transformer.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- + +from typing import Optional + +from transformers.configuration_utils import PretrainedConfig + + +class TransformerConfig(PretrainedConfig): + + model_type = 'transformer-project_fox' + keys_to_ignore_at_inference = ['past_key_values'] + + def __init__( + self, + vocab_size: int = 32000, + hidden_size: int = 2048, + hidden_ratio: Optional[int] = 4, + intermediate_size: Optional[int] = None, + num_hidden_layers: int = 24, + num_heads: int = 32, + num_kv_heads: int = None, + hidden_act: str = "swish", + window_size: Optional[int] = None, + max_position_embeddings: int = 2048, + initializer_range: float = 0.02, + 
elementwise_affine: Optional[bool] = True, + norm_eps: float = 1e-6, + use_cache: bool = True, + pad_token_id: int = None, + bos_token_id: int = 1, + eos_token_id: int = 2, + tie_word_embeddings: bool = False, + attention_bias: bool = False, + fuse_norm: bool = True, + fuse_cross_entropy: bool = True, + rope_base: float = 500000.0, + use_rope: bool = True, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.hidden_ratio = hidden_ratio + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.window_size = window_size + self.max_position_embeddings = max_position_embeddings + + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.elementwise_affine = elementwise_affine + self.norm_eps = norm_eps + self.use_cache = use_cache + self.attention_bias = attention_bias + self.fuse_cross_entropy = fuse_cross_entropy + self.fuse_norm = fuse_norm + self.rope_base = rope_base + self.use_rope = use_rope + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/decay_params.txt b/decay_params.txt new file mode 100644 index 0000000000000000000000000000000000000000..47c75c417ee07f215d4d4c8b1273995a5e9a6f5c --- /dev/null +++ b/decay_params.txt @@ -0,0 +1,44 @@ +_forward_module.model.embeddings.weight +_forward_module.model.layers.0.attn.q_proj.weight +_forward_module.model.layers.0.attn.k_proj.weight +_forward_module.model.layers.0.attn.v_proj.weight +_forward_module.model.layers.0.attn.o_proj.weight +_forward_module.model.layers.0.attn.fgate_proj.weight +_forward_module.model.layers.0.mlp.gate_proj.weight +_forward_module.model.layers.0.mlp.down_proj.weight +_forward_module.model.layers.1.attn.q_proj.weight +_forward_module.model.layers.1.attn.k_proj.weight 
+_forward_module.model.layers.1.attn.v_proj.weight +_forward_module.model.layers.1.attn.o_proj.weight +_forward_module.model.layers.1.attn.fgate_proj.weight +_forward_module.model.layers.1.mlp.gate_proj.weight +_forward_module.model.layers.1.mlp.down_proj.weight +_forward_module.model.layers.2.attn.q_proj.weight +_forward_module.model.layers.2.attn.k_proj.weight +_forward_module.model.layers.2.attn.v_proj.weight +_forward_module.model.layers.2.attn.o_proj.weight +_forward_module.model.layers.2.attn.fgate_proj.weight +_forward_module.model.layers.2.mlp.gate_proj.weight +_forward_module.model.layers.2.mlp.down_proj.weight +_forward_module.model.layers.3.attn.q_proj.weight +_forward_module.model.layers.3.attn.k_proj.weight +_forward_module.model.layers.3.attn.v_proj.weight +_forward_module.model.layers.3.attn.o_proj.weight +_forward_module.model.layers.3.attn.fgate_proj.weight +_forward_module.model.layers.3.mlp.gate_proj.weight +_forward_module.model.layers.3.mlp.down_proj.weight +_forward_module.model.layers.4.attn.q_proj.weight +_forward_module.model.layers.4.attn.k_proj.weight +_forward_module.model.layers.4.attn.v_proj.weight +_forward_module.model.layers.4.attn.o_proj.weight +_forward_module.model.layers.4.attn.fgate_proj.weight +_forward_module.model.layers.4.mlp.gate_proj.weight +_forward_module.model.layers.4.mlp.down_proj.weight +_forward_module.model.layers.5.attn.q_proj.weight +_forward_module.model.layers.5.attn.k_proj.weight +_forward_module.model.layers.5.attn.v_proj.weight +_forward_module.model.layers.5.attn.o_proj.weight +_forward_module.model.layers.5.attn.fgate_proj.weight +_forward_module.model.layers.5.mlp.gate_proj.weight +_forward_module.model.layers.5.mlp.down_proj.weight +_forward_module.lm_head.weight diff --git a/logs/2025-10-17_13-25-22.log b/logs/2025-10-17_13-25-22.log new file mode 100644 index 0000000000000000000000000000000000000000..343daeb0e48aad865172aa0b10afc1cf2ea49206 --- /dev/null +++ b/logs/2025-10-17_13-25-22.log @@ -0,0 
+1,258 @@ +[2025-10-17 13:25:22][train:372][INFO] All outputs will be saved to `/workspace/forgetting-transformer/forgetting_gate_6_8_512` +[2025-10-17 13:25:22][train:375][INFO] Configuration: +[2025-10-17 13:25:22][train:380][INFO] Configuration saved to /workspace/forgetting-transformer/forgetting_gate_6_8_512/config.yaml. +[2025-10-17 13:25:22][train:387][INFO] creating datamodule +[2025-10-17 13:25:22][train:419][INFO] creating model +[2025-10-17 13:25:22][train:440][INFO] creating optimizer +[2025-10-17 13:25:22][checkpoint:39][INFO] Not resuming. Deleting existing checkpoints... +[2025-10-17 13:25:22][logger:256][INFO] Setting up wandb logger... +[2025-10-17 13:25:22][logger:272][INFO] Not resuming. Creating a new wandb run. +[2025-10-17 13:25:23][logger:288][INFO] wandb initialized. Run id: y8zione3 +[2025-10-17 13:25:23][logger:186][INFO] Setting up jsonlines logger... +[2025-10-17 13:25:23][logger:113][INFO] Setting up npz logger... +[2025-10-17 13:25:23][logger:171][INFO] [step: 0] [train_data_info/vocab_size: 50277] [train_data_info/global_tokens_per_batch: 2097152] [train_data_info/local_tokens_per_batch: 2097152] [train_data_info/batch_len: 2048] [train_data_info/seq_len: 2048] [train_data_info/total_tokens: 2055208960] [train_data_info/global_batch_size: 1024] [train_data_info/local_batch_size: 1024] +[2025-10-17 13:25:23][logger:171][INFO] [step: 0] [val_data_info/vocab_size: 50277] [val_data_info/global_tokens_per_batch: 2048] [val_data_info/local_tokens_per_batch: 2048] [val_data_info/batch_len: 2048] [val_data_info/seq_len: 2048] [val_data_info/total_tokens: 2147483648] [val_data_info/global_batch_size: 1] [val_data_info/local_batch_size: 1] +[2025-10-17 13:25:23][logger:171][INFO] [step: 0] [model_info/total_params: 71962160] [model_info/trainable_params: 71962160] [model_info/embedding_params: 25741824] [model_info/flops_per_token: 0] [model_info/non_embedding_params: 46220336] +[2025-10-17 13:27:00][utils:57][INFO] [P: 1.00%] [S: 
20971520/2097152000] [T: 0:01:37] [ETA: 2:40:22] [loss: 8.766] [tokens/s: 224397.037] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 13:28:34][utils:57][INFO] [P: 2.00%] [S: 41943040/2097152000] [T: 0:03:10] [ETA: 2:35:38] [loss: 7.474] [tokens/s: 224495.115] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 13:28:34][train:194][INFO] Running validation... +[2025-10-17 13:31:24][logger:171][INFO] [step: 41943040] [val/train_token_count: 41943040] [val/train_batch_count: 20] [val/train_flop_count: 0] [val/train_total_time: 190.580] [val/train_update_time: 190.214] [val/loss: 7.451] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 170.258] [val/val_tokens_per_second: 240575.655] [val/loss_avg_len_2048: 7.451] [val/perplexity_len_2048: 1721.369] [val/loss_avg_len_1024: 7.449] [val/perplexity_len_1024: 1718.729] [val/loss_avg_len_512: 7.450] [val/perplexity_len_512: 1719.848] +[2025-10-17 13:32:57][utils:57][INFO] [P: 3.00%] [S: 62914560/2097152000] [T: 0:07:34] [ETA: 4:04:46] [loss: 7.090] [tokens/s: 137862.779] [batches/s: 0.066] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 13:34:31][utils:57][INFO] [P: 4.00%] [S: 83886080/2097152000] [T: 0:09:07] [ETA: 3:39:02] [loss: 6.745] [tokens/s: 153012.985] [batches/s: 0.073] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 13:34:31][train:194][INFO] Running validation... 
+[2025-10-17 13:37:21][logger:171][INFO] [step: 83886080] [val/train_token_count: 83886080] [val/train_batch_count: 40] [val/train_flop_count: 0] [val/train_total_time: 547.612] [val/train_update_time: 376.676] [val/loss: 6.732] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 170.363] [val/val_tokens_per_second: 240427.489] [val/loss_avg_len_2048: 6.732] [val/perplexity_len_2048: 839.103] [val/loss_avg_len_1024: 6.732] [val/perplexity_len_1024: 838.523] [val/loss_avg_len_512: 6.734] [val/perplexity_len_512: 840.603] +[2025-10-17 13:38:54][utils:57][INFO] [P: 5.00%] [S: 104857600/2097152000] [T: 0:13:31] [ETA: 4:16:54] [loss: 6.485] [tokens/s: 128739.931] [batches/s: 0.061] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 13:38:54][logger:171][INFO] [step: 104857600] [train_eval/train_token_count: 104857600] [train_eval/train_batch_count: 50] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 811.291] [train_eval/train_update_time: 469.829] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 7.719] [train_eval/perplexity_len_2048: 2250.922] [train_eval/loss_avg_len_1024: 7.720] [train_eval/perplexity_len_1024: 2251.843] [train_eval/loss_avg_len_512: 7.719] [train_eval/perplexity_len_512: 2251.557] +[2025-10-17 13:40:28][utils:57][INFO] [P: 6.00%] [S: 125829120/2097152000] [T: 0:15:04] [ETA: 3:56:12] [loss: 6.212] [tokens/s: 138789.400] [batches/s: 0.066] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 13:40:28][train:194][INFO] Running validation... 
+[2025-10-17 13:43:17][logger:171][INFO] [step: 125829120] [val/train_token_count: 125829120] [val/train_batch_count: 60] [val/train_flop_count: 0] [val/train_total_time: 904.598] [val/train_update_time: 562.983] [val/loss: 6.203] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 168.996] [val/val_tokens_per_second: 242372.209] [val/loss_avg_len_2048: 6.203] [val/perplexity_len_2048: 494.013] [val/loss_avg_len_1024: 6.204] [val/perplexity_len_1024: 494.586] [val/loss_avg_len_512: 6.209] [val/perplexity_len_512: 497.257] +[2025-10-17 13:44:50][utils:57][INFO] [P: 7.00%] [S: 146800640/2097152000] [T: 0:19:27] [ETA: 4:18:24] [loss: 6.016] [tokens/s: 125399.720] [batches/s: 0.060] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 13:46:24][utils:57][INFO] [P: 8.00%] [S: 167772160/2097152000] [T: 0:21:00] [ETA: 4:01:34] [loss: 5.953] [tokens/s: 132828.780] [batches/s: 0.063] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 13:46:24][train:194][INFO] Running validation... +[2025-10-17 13:49:12][logger:171][INFO] [step: 167772160] [val/train_token_count: 167772160] [val/train_batch_count: 80] [val/train_flop_count: 0] [val/train_total_time: 1260.372] [val/train_update_time: 749.470] [val/loss: 5.855] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 168.650] [val/val_tokens_per_second: 242870.421] [val/loss_avg_len_2048: 5.855] [val/perplexity_len_2048: 348.901] [val/loss_avg_len_1024: 5.857] [val/perplexity_len_1024: 349.847] [val/loss_avg_len_512: 5.866] [val/perplexity_len_512: 352.803] +[2025-10-17 13:50:46][utils:57][INFO] [P: 9.00%] [S: 188743680/2097152000] [T: 0:25:22] [ETA: 4:16:33] [loss: 5.676] [tokens/s: 123657.351] [batches/s: 0.059] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 13:52:19][utils:57][INFO] [P: 10.00%] [S: 209715200/2097152000] [T: 0:26:55] [ETA: 4:02:22] [loss: 5.544] [tokens/s: 129541.235] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 13:52:19][logger:171][INFO] [step: 209715200] 
[train_eval/train_token_count: 209715200] [train_eval/train_batch_count: 100] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1615.807] [train_eval/train_update_time: 935.960] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.958] [train_eval/perplexity_len_2048: 386.812] [train_eval/loss_avg_len_1024: 5.962] [train_eval/perplexity_len_1024: 388.314] [train_eval/loss_avg_len_512: 5.967] [train_eval/perplexity_len_512: 390.486] +[2025-10-17 13:52:19][train:194][INFO] Running validation... +[2025-10-17 13:55:09][logger:171][INFO] [step: 209715200] [val/train_token_count: 209715200] [val/train_batch_count: 100] [val/train_flop_count: 0] [val/train_total_time: 1615.807] [val/train_update_time: 935.960] [val/loss: 5.550] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 169.848] [val/val_tokens_per_second: 241156.333] [val/loss_avg_len_2048: 5.550] [val/perplexity_len_2048: 257.167] [val/loss_avg_len_1024: 5.555] [val/perplexity_len_1024: 258.430] [val/loss_avg_len_512: 5.566] [val/perplexity_len_512: 261.382] +[2025-10-17 13:55:09][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000000209715200.pt... +[2025-10-17 13:55:10][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000000209715200.pt. +[2025-10-17 13:55:10][logger:171][INFO] [step: 209715200] [checkpoint/checkpoint_time: 1.537] +[2025-10-17 13:56:44][utils:57][INFO] [P: 11.00%] [S: 230686720/2097152000] [T: 0:31:20] [ETA: 4:13:34] [loss: 5.430] [tokens/s: 117039.835] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 13:58:17][utils:57][INFO] [P: 12.00%] [S: 251658240/2097152000] [T: 0:32:53] [ETA: 4:01:15] [loss: 5.281] [tokens/s: 129451.197] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 13:58:17][train:194][INFO] Running validation... 
+[2025-10-17 14:01:07][logger:171][INFO] [step: 251658240] [val/train_token_count: 251658240] [val/train_batch_count: 120] [val/train_flop_count: 0] [val/train_total_time: 1973.999] [val/train_update_time: 1122.469] [val/loss: 5.291] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 169.914] [val/val_tokens_per_second: 241063.779] [val/loss_avg_len_2048: 5.291] [val/perplexity_len_2048: 198.547] [val/loss_avg_len_1024: 5.298] [val/perplexity_len_1024: 199.923] [val/loss_avg_len_512: 5.312] [val/perplexity_len_512: 202.815] +[2025-10-17 14:02:40][utils:57][INFO] [P: 13.00%] [S: 272629760/2097152000] [T: 0:37:17] [ETA: 4:09:32] [loss: 5.194] [tokens/s: 117053.423] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 14:04:14][utils:57][INFO] [P: 14.00%] [S: 293601280/2097152000] [T: 0:38:50] [ETA: 3:58:38] [loss: 5.120] [tokens/s: 129470.825] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 14:04:14][train:194][INFO] Running validation... +[2025-10-17 14:07:03][logger:171][INFO] [step: 293601280] [val/train_token_count: 293601280] [val/train_batch_count: 140] [val/train_flop_count: 0] [val/train_total_time: 2330.883] [val/train_update_time: 1309.140] [val/loss: 5.083] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 168.715] [val/val_tokens_per_second: 242776.387] [val/loss_avg_len_2048: 5.083] [val/perplexity_len_2048: 161.221] [val/loss_avg_len_1024: 5.092] [val/perplexity_len_1024: 162.644] [val/loss_avg_len_512: 5.109] [val/perplexity_len_512: 165.552] +[2025-10-17 14:08:36][utils:57][INFO] [P: 15.00%] [S: 314572800/2097152000] [T: 0:43:13] [ETA: 4:04:54] [loss: 4.975] [tokens/s: 117131.159] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 14:08:36][logger:171][INFO] [step: 314572800] [train_eval/train_token_count: 314572800] [train_eval/train_batch_count: 150] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2593.153] [train_eval/train_update_time: 1402.553] 
[train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.251] [train_eval/perplexity_len_2048: 190.849] [train_eval/loss_avg_len_1024: 5.258] [train_eval/perplexity_len_1024: 192.163] [train_eval/loss_avg_len_512: 5.272] [train_eval/perplexity_len_512: 194.825] +[2025-10-17 14:10:10][utils:57][INFO] [P: 16.00%] [S: 335544320/2097152000] [T: 0:44:46] [ETA: 3:55:06] [loss: 4.917] [tokens/s: 129440.330] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 14:10:10][train:194][INFO] Running validation... +[2025-10-17 14:12:59][logger:171][INFO] [step: 335544320] [val/train_token_count: 335544320] [val/train_batch_count: 160] [val/train_flop_count: 0] [val/train_total_time: 2686.886] [val/train_update_time: 1496.139] [val/loss: 4.900] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 168.974] [val/val_tokens_per_second: 242404.252] [val/loss_avg_len_2048: 4.900] [val/perplexity_len_2048: 134.324] [val/loss_avg_len_1024: 4.911] [val/perplexity_len_1024: 135.834] [val/loss_avg_len_512: 4.933] [val/perplexity_len_512: 138.750] +[2025-10-17 14:14:33][utils:57][INFO] [P: 17.00%] [S: 356515840/2097152000] [T: 0:49:09] [ETA: 3:59:59] [loss: 4.807] [tokens/s: 117100.911] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 14:16:06][utils:57][INFO] [P: 18.00%] [S: 377487360/2097152000] [T: 0:50:42] [ETA: 3:51:02] [loss: 4.759] [tokens/s: 129387.893] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 14:16:06][train:194][INFO] Running validation... 
+[2025-10-17 14:18:55][logger:171][INFO] [step: 377487360] [val/train_token_count: 377487360] [val/train_batch_count: 180] [val/train_flop_count: 0] [val/train_total_time: 3042.959] [val/train_update_time: 1682.947] [val/loss: 4.740] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 168.961] [val/val_tokens_per_second: 242422.916] [val/loss_avg_len_2048: 4.740] [val/perplexity_len_2048: 114.410] [val/loss_avg_len_1024: 4.754] [val/perplexity_len_1024: 116.060] [val/loss_avg_len_512: 4.781] [val/perplexity_len_512: 119.175] +[2025-10-17 14:20:29][utils:57][INFO] [P: 19.00%] [S: 398458880/2097152000] [T: 0:55:05] [ETA: 3:54:51] [loss: 4.722] [tokens/s: 117061.568] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 14:22:02][utils:57][INFO] [P: 20.00%] [S: 419430400/2097152000] [T: 0:56:39] [ETA: 3:46:36] [loss: 4.612] [tokens/s: 129558.046] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 14:22:02][logger:171][INFO] [step: 419430400] [train_eval/train_token_count: 419430400] [train_eval/train_batch_count: 200] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3399.030] [train_eval/train_update_time: 1869.779] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.790] [train_eval/perplexity_len_2048: 120.269] [train_eval/loss_avg_len_1024: 4.804] [train_eval/perplexity_len_1024: 121.974] [train_eval/loss_avg_len_512: 4.828] [train_eval/perplexity_len_512: 125.018] +[2025-10-17 14:22:02][train:194][INFO] Running validation... 
+[2025-10-17 14:24:52][logger:171][INFO] [step: 419430400] [val/train_token_count: 419430400] [val/train_batch_count: 200] [val/train_flop_count: 0] [val/train_total_time: 3399.030] [val/train_update_time: 1869.779] [val/loss: 4.606] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 169.336] [val/val_tokens_per_second: 241886.560] [val/loss_avg_len_2048: 4.606] [val/perplexity_len_2048: 100.052] [val/loss_avg_len_1024: 4.625] [val/perplexity_len_1024: 102.018] [val/loss_avg_len_512: 4.660] [val/perplexity_len_512: 105.621] +[2025-10-17 14:24:52][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000000419430400.pt... +[2025-10-17 14:24:53][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000000419430400.pt. +[2025-10-17 14:24:53][logger:171][INFO] [step: 419430400] [checkpoint/checkpoint_time: 1.573] +[2025-10-17 14:26:27][utils:57][INFO] [P: 21.00%] [S: 440401920/2097152000] [T: 1:01:03] [ETA: 3:49:41] [loss: 4.519] [tokens/s: 117061.899] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 14:28:00][utils:57][INFO] [P: 22.00%] [S: 461373440/2097152000] [T: 1:02:36] [ETA: 3:41:59] [loss: 4.450] [tokens/s: 129465.568] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 14:28:00][train:194][INFO] Running validation... 
+[2025-10-17 14:30:49][logger:171][INFO] [step: 461373440] [val/train_token_count: 461373440] [val/train_batch_count: 220] [val/train_flop_count: 0] [val/train_total_time: 3756.890] [val/train_update_time: 2056.446] [val/loss: 4.443] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 169.172] [val/val_tokens_per_second: 242121.101] [val/loss_avg_len_2048: 4.443] [val/perplexity_len_2048: 85.004] [val/loss_avg_len_1024: 4.471] [val/perplexity_len_1024: 87.458] [val/loss_avg_len_512: 4.518] [val/perplexity_len_512: 91.669] +[2025-10-17 14:32:23][utils:57][INFO] [P: 23.00%] [S: 482344960/2097152000] [T: 1:06:59] [ETA: 3:44:16] [loss: 4.342] [tokens/s: 117110.986] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 14:33:56][utils:57][INFO] [P: 24.00%] [S: 503316480/2097152000] [T: 1:08:32] [ETA: 3:37:04] [loss: 4.284] [tokens/s: 129434.957] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 14:33:56][train:194][INFO] Running validation... +[2025-10-17 14:36:46][logger:171][INFO] [step: 503316480] [val/train_token_count: 503316480] [val/train_batch_count: 240] [val/train_flop_count: 0] [val/train_total_time: 4112.965] [val/train_update_time: 2243.070] [val/loss: 4.284] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 169.933] [val/val_tokens_per_second: 241036.785] [val/loss_avg_len_2048: 4.284] [val/perplexity_len_2048: 72.514] [val/loss_avg_len_1024: 4.324] [val/perplexity_len_1024: 75.494] [val/loss_avg_len_512: 4.386] [val/perplexity_len_512: 80.310] +[2025-10-17 14:38:20][utils:57][INFO] [P: 25.00%] [S: 524288000/2097152000] [T: 1:12:56] [ETA: 3:38:49] [loss: 4.202] [tokens/s: 117042.739] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 14:38:20][logger:171][INFO] [step: 524288000] [train_eval/train_token_count: 524288000] [train_eval/train_batch_count: 250] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4376.394] [train_eval/train_update_time: 2336.412] 
[train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.402] [train_eval/perplexity_len_2048: 81.624] [train_eval/loss_avg_len_1024: 4.431] [train_eval/perplexity_len_1024: 84.024] [train_eval/loss_avg_len_512: 4.480] [train_eval/perplexity_len_512: 88.247] +[2025-10-17 14:39:53][utils:57][INFO] [P: 26.00%] [S: 545259520/2097152000] [T: 1:14:30] [ETA: 3:32:02] [loss: 4.142] [tokens/s: 129367.533] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 14:39:53][train:194][INFO] Running validation... +[2025-10-17 14:42:44][logger:171][INFO] [step: 545259520] [val/train_token_count: 545259520] [val/train_batch_count: 260] [val/train_flop_count: 0] [val/train_total_time: 4470.064] [val/train_update_time: 2429.937] [val/loss: 4.142] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 170.504] [val/val_tokens_per_second: 240229.366] [val/loss_avg_len_2048: 4.142] [val/perplexity_len_2048: 62.948] [val/loss_avg_len_1024: 4.191] [val/perplexity_len_1024: 66.065] [val/loss_avg_len_512: 4.262] [val/perplexity_len_512: 70.971] +[2025-10-17 14:44:17][utils:57][INFO] [P: 27.00%] [S: 566231040/2097152000] [T: 1:18:54] [ETA: 3:33:19] [loss: 4.100] [tokens/s: 116945.670] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 14:45:51][utils:57][INFO] [P: 28.00%] [S: 587202560/2097152000] [T: 1:20:27] [ETA: 3:26:54] [loss: 4.009] [tokens/s: 129229.881] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 14:45:51][train:194][INFO] Running validation... 
+[2025-10-17 14:48:41][logger:171][INFO] [step: 587202560] [val/train_token_count: 587202560] [val/train_batch_count: 280] [val/train_flop_count: 0] [val/train_total_time: 4827.831] [val/train_update_time: 2616.888] [val/loss: 4.030] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 169.519] [val/val_tokens_per_second: 241625.372] [val/loss_avg_len_2048: 4.030] [val/perplexity_len_2048: 56.278] [val/loss_avg_len_1024: 4.085] [val/perplexity_len_1024: 59.439] [val/loss_avg_len_512: 4.164] [val/perplexity_len_512: 64.319] +[2025-10-17 14:50:14][utils:57][INFO] [P: 29.00%] [S: 608174080/2097152000] [T: 1:24:50] [ETA: 3:27:44] [loss: 3.976] [tokens/s: 116893.693] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 14:51:48][utils:57][INFO] [P: 30.00%] [S: 629145600/2097152000] [T: 1:26:24] [ETA: 3:21:37] [loss: 3.923] [tokens/s: 129324.004] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 14:51:48][logger:171][INFO] [step: 629145600] [train_eval/train_token_count: 629145600] [train_eval/train_batch_count: 300] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5184.687] [train_eval/train_update_time: 2803.937] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.067] [train_eval/perplexity_len_2048: 58.370] [train_eval/loss_avg_len_1024: 4.115] [train_eval/perplexity_len_1024: 61.243] [train_eval/loss_avg_len_512: 4.189] [train_eval/perplexity_len_512: 65.979] +[2025-10-17 14:51:48][train:194][INFO] Running validation... 
+[2025-10-17 14:54:39][logger:171][INFO] [step: 629145600] [val/train_token_count: 629145600] [val/train_batch_count: 300] [val/train_flop_count: 0] [val/train_total_time: 5184.687] [val/train_update_time: 2803.937] [val/loss: 3.937] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 170.900] [val/val_tokens_per_second: 239671.834] [val/loss_avg_len_2048: 3.937] [val/perplexity_len_2048: 51.259] [val/loss_avg_len_1024: 3.996] [val/perplexity_len_1024: 54.360] [val/loss_avg_len_512: 4.079] [val/perplexity_len_512: 59.094] +[2025-10-17 14:54:39][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000000629145600.pt... +[2025-10-17 14:54:40][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000000629145600.pt. +[2025-10-17 14:54:40][logger:171][INFO] [step: 629145600] [checkpoint/checkpoint_time: 1.536] +[2025-10-17 14:56:14][utils:57][INFO] [P: 31.00%] [S: 650117120/2097152000] [T: 1:30:50] [ETA: 3:22:12] [loss: 3.941] [tokens/s: 116777.554] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 14:57:47][utils:57][INFO] [P: 32.00%] [S: 671088640/2097152000] [T: 1:32:23] [ETA: 3:16:20] [loss: 3.851] [tokens/s: 129069.442] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 14:57:47][train:194][INFO] Running validation... 
+[2025-10-17 15:00:36][logger:171][INFO] [step: 671088640] [val/train_token_count: 671088640] [val/train_batch_count: 320] [val/train_flop_count: 0] [val/train_total_time: 5543.976] [val/train_update_time: 2990.484] [val/loss: 3.873] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 168.915] [val/val_tokens_per_second: 242488.330] [val/loss_avg_len_2048: 3.873] [val/perplexity_len_2048: 48.090] [val/loss_avg_len_1024: 3.936] [val/perplexity_len_1024: 51.188] [val/loss_avg_len_512: 4.023] [val/perplexity_len_512: 55.892] +[2025-10-17 15:02:09][utils:57][INFO] [P: 33.00%] [S: 692060160/2097152000] [T: 1:36:46] [ETA: 3:16:28] [loss: 3.870] [tokens/s: 116818.422] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 15:03:43][utils:57][INFO] [P: 34.00%] [S: 713031680/2097152000] [T: 1:38:19] [ETA: 3:10:52] [loss: 3.811] [tokens/s: 129163.988] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 15:03:43][train:194][INFO] Running validation... +[2025-10-17 15:06:32][logger:171][INFO] [step: 713031680] [val/train_token_count: 713031680] [val/train_batch_count: 340] [val/train_flop_count: 0] [val/train_total_time: 5899.682] [val/train_update_time: 3176.984] [val/loss: 3.807] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 168.984] [val/val_tokens_per_second: 242390.207] [val/loss_avg_len_2048: 3.807] [val/perplexity_len_2048: 44.993] [val/loss_avg_len_1024: 3.871] [val/perplexity_len_1024: 47.998] [val/loss_avg_len_512: 3.962] [val/perplexity_len_512: 52.542] +[2025-10-17 15:08:05][utils:57][INFO] [P: 35.00%] [S: 734003200/2097152000] [T: 1:42:42] [ETA: 3:10:44] [loss: 3.776] [tokens/s: 116870.243] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 15:08:05][logger:171][INFO] [step: 734003200] [train_eval/train_token_count: 734003200] [train_eval/train_batch_count: 350] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6162.256] [train_eval/train_update_time: 3270.439] 
[train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.853] [train_eval/perplexity_len_2048: 47.149] [train_eval/loss_avg_len_1024: 3.912] [train_eval/perplexity_len_1024: 50.006] [train_eval/loss_avg_len_512: 3.999] [train_eval/perplexity_len_512: 54.532] +[2025-10-17 15:09:39][utils:57][INFO] [P: 36.00%] [S: 754974720/2097152000] [T: 1:44:15] [ETA: 3:05:21] [loss: 3.728] [tokens/s: 129273.407] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 15:09:39][train:194][INFO] Running validation... +[2025-10-17 15:12:28][logger:171][INFO] [step: 754974720] [val/train_token_count: 754974720] [val/train_batch_count: 360] [val/train_flop_count: 0] [val/train_total_time: 6255.944] [val/train_update_time: 3363.994] [val/loss: 3.765] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 169.211] [val/val_tokens_per_second: 242064.373] [val/loss_avg_len_2048: 3.765] [val/perplexity_len_2048: 43.158] [val/loss_avg_len_1024: 3.831] [val/perplexity_len_1024: 46.130] [val/loss_avg_len_512: 3.923] [val/perplexity_len_512: 50.564] +[2025-10-17 15:14:02][utils:57][INFO] [P: 37.00%] [S: 775946240/2097152000] [T: 1:48:38] [ETA: 3:04:59] [loss: 3.727] [tokens/s: 116957.665] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 15:15:35][utils:57][INFO] [P: 38.00%] [S: 796917760/2097152000] [T: 1:50:12] [ETA: 2:59:48] [loss: 3.700] [tokens/s: 129322.966] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 15:15:35][train:194][INFO] Running validation... 
+[2025-10-17 15:18:24][logger:171][INFO] [step: 796917760] [val/train_token_count: 796917760] [val/train_batch_count: 380] [val/train_flop_count: 0] [val/train_total_time: 6612.111] [val/train_update_time: 3550.668] [val/loss: 3.712] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 169.164] [val/val_tokens_per_second: 242131.208] [val/loss_avg_len_2048: 3.712] [val/perplexity_len_2048: 40.924] [val/loss_avg_len_1024: 3.780] [val/perplexity_len_1024: 43.797] [val/loss_avg_len_512: 3.874] [val/perplexity_len_512: 48.126] +[2025-10-17 15:19:58][utils:57][INFO] [P: 39.00%] [S: 817889280/2097152000] [T: 1:54:34] [ETA: 2:59:12] [loss: 3.699] [tokens/s: 117012.521] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 15:21:31][utils:57][INFO] [P: 40.00%] [S: 838860800/2097152000] [T: 1:56:08] [ETA: 2:54:12] [loss: 3.581] [tokens/s: 129623.687] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 15:21:31][logger:171][INFO] [step: 838860800] [train_eval/train_token_count: 838860800] [train_eval/train_batch_count: 400] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6968.172] [train_eval/train_update_time: 3737.291] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.723] [train_eval/perplexity_len_2048: 41.377] [train_eval/loss_avg_len_1024: 3.785] [train_eval/perplexity_len_1024: 44.040] [train_eval/loss_avg_len_512: 3.879] [train_eval/perplexity_len_512: 48.372] +[2025-10-17 15:21:31][train:194][INFO] Running validation... 
+[2025-10-17 15:24:21][logger:171][INFO] [step: 838860800] [val/train_token_count: 838860800] [val/train_batch_count: 400] [val/train_flop_count: 0] [val/train_total_time: 6968.172] [val/train_update_time: 3737.291] [val/loss: 3.682] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 170.108] [val/val_tokens_per_second: 240788.507] [val/loss_avg_len_2048: 3.682] [val/perplexity_len_2048: 39.729] [val/loss_avg_len_1024: 3.751] [val/perplexity_len_1024: 42.563] [val/loss_avg_len_512: 3.847] [val/perplexity_len_512: 46.837] +[2025-10-17 15:24:21][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000000838860800.pt... +[2025-10-17 15:24:23][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000000838860800.pt. +[2025-10-17 15:24:23][logger:171][INFO] [step: 838860800] [checkpoint/checkpoint_time: 1.590] +[2025-10-17 15:25:56][utils:57][INFO] [P: 41.00%] [S: 859832320/2097152000] [T: 2:00:33] [ETA: 2:53:28] [loss: 3.623] [tokens/s: 117076.347] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 15:27:30][utils:57][INFO] [P: 42.00%] [S: 880803840/2097152000] [T: 2:02:06] [ETA: 2:48:38] [loss: 3.624] [tokens/s: 129380.861] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 15:27:30][train:194][INFO] Running validation... 
+[2025-10-17 15:30:21][logger:171][INFO] [step: 880803840] [val/train_token_count: 880803840] [val/train_batch_count: 420] [val/train_flop_count: 0] [val/train_total_time: 7326.923] [val/train_update_time: 3924.072] [val/loss: 3.641] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 170.913] [val/val_tokens_per_second: 239654.332] [val/loss_avg_len_2048: 3.641] [val/perplexity_len_2048: 38.130] [val/loss_avg_len_1024: 3.710] [val/perplexity_len_1024: 40.872] [val/loss_avg_len_512: 3.807] [val/perplexity_len_512: 45.009] +[2025-10-17 15:31:54][utils:57][INFO] [P: 43.00%] [S: 901775360/2097152000] [T: 2:06:31] [ETA: 2:47:42] [loss: 3.623] [tokens/s: 116917.429] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 15:33:28][utils:57][INFO] [P: 44.00%] [S: 922746880/2097152000] [T: 2:08:04] [ETA: 2:43:00] [loss: 3.657] [tokens/s: 129213.526] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 15:33:28][train:194][INFO] Running validation... +[2025-10-17 15:36:18][logger:171][INFO] [step: 922746880] [val/train_token_count: 922746880] [val/train_batch_count: 440] [val/train_flop_count: 0] [val/train_total_time: 7684.845] [val/train_update_time: 4110.783] [val/loss: 3.620] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 170.392] [val/val_tokens_per_second: 240386.300] [val/loss_avg_len_2048: 3.620] [val/perplexity_len_2048: 37.324] [val/loss_avg_len_1024: 3.689] [val/perplexity_len_1024: 40.005] [val/loss_avg_len_512: 3.786] [val/perplexity_len_512: 44.090] +[2025-10-17 15:37:52][utils:57][INFO] [P: 45.00%] [S: 943718400/2097152000] [T: 2:12:28] [ETA: 2:41:55] [loss: 3.605] [tokens/s: 116818.084] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 15:37:52][logger:171][INFO] [step: 943718400] [train_eval/train_token_count: 943718400] [train_eval/train_batch_count: 450] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7948.908] [train_eval/train_update_time: 4204.297] 
[train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.631] [train_eval/perplexity_len_2048: 37.762] [train_eval/loss_avg_len_1024: 3.699] [train_eval/perplexity_len_1024: 40.392] [train_eval/loss_avg_len_512: 3.795] [train_eval/perplexity_len_512: 44.500] +[2025-10-17 15:39:26][utils:57][INFO] [P: 46.00%] [S: 964689920/2097152000] [T: 2:14:02] [ETA: 2:37:21] [loss: 3.573] [tokens/s: 129107.341] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 15:39:26][train:194][INFO] Running validation... +[2025-10-17 15:42:15][logger:171][INFO] [step: 964689920] [val/train_token_count: 964689920] [val/train_batch_count: 460] [val/train_flop_count: 0] [val/train_total_time: 8042.599] [val/train_update_time: 4297.832] [val/loss: 3.585] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 169.110] [val/val_tokens_per_second: 242208.947] [val/loss_avg_len_2048: 3.585] [val/perplexity_len_2048: 36.038] [val/loss_avg_len_1024: 3.656] [val/perplexity_len_1024: 38.693] [val/loss_avg_len_512: 3.753] [val/perplexity_len_512: 42.667] +[2025-10-17 15:43:48][utils:57][INFO] [P: 47.00%] [S: 985661440/2097152000] [T: 2:18:25] [ETA: 2:36:05] [loss: 3.553] [tokens/s: 116819.368] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 15:45:22][utils:57][INFO] [P: 48.00%] [S: 1006632960/2097152000] [T: 2:19:58] [ETA: 2:31:38] [loss: 3.564] [tokens/s: 129099.142] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 15:45:22][train:194][INFO] Running validation... 
+[2025-10-17 15:48:12][logger:171][INFO] [step: 1006632960] [val/train_token_count: 1006632960] [val/train_batch_count: 480] [val/train_flop_count: 0] [val/train_total_time: 8398.813] [val/train_update_time: 4484.626] [val/loss: 3.566] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 170.013] [val/val_tokens_per_second: 240922.848] [val/loss_avg_len_2048: 3.566] [val/perplexity_len_2048: 35.358] [val/loss_avg_len_1024: 3.637] [val/perplexity_len_1024: 37.974] [val/loss_avg_len_512: 3.736] [val/perplexity_len_512: 41.931] +[2025-10-17 15:49:45][utils:57][INFO] [P: 49.00%] [S: 1027604480/2097152000] [T: 2:24:22] [ETA: 2:30:15] [loss: 3.555] [tokens/s: 116751.372] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 15:51:19][utils:57][INFO] [P: 50.00%] [S: 1048576000/2097152000] [T: 2:25:55] [ETA: 2:25:55] [loss: 3.536] [tokens/s: 129215.693] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 15:51:19][logger:171][INFO] [step: 1048576000] [train_eval/train_token_count: 1048576000] [train_eval/train_batch_count: 500] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 8755.968] [train_eval/train_update_time: 4671.454] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.567] [train_eval/perplexity_len_2048: 35.405] [train_eval/loss_avg_len_1024: 3.633] [train_eval/perplexity_len_1024: 37.841] [train_eval/loss_avg_len_512: 3.731] [train_eval/perplexity_len_512: 41.730] +[2025-10-17 15:51:19][train:194][INFO] Running validation... 
+[2025-10-17 15:54:09][logger:171][INFO] [step: 1048576000] [val/train_token_count: 1048576000] [val/train_batch_count: 500] [val/train_flop_count: 0] [val/train_total_time: 8755.968] [val/train_update_time: 4671.454] [val/loss: 3.541] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 170.222] [val/val_tokens_per_second: 240627.488] [val/loss_avg_len_2048: 3.541] [val/perplexity_len_2048: 34.498] [val/loss_avg_len_1024: 3.612] [val/perplexity_len_1024: 37.051] [val/loss_avg_len_512: 3.711] [val/perplexity_len_512: 40.912] +[2025-10-17 15:54:09][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000001048576000.pt... +[2025-10-17 15:54:11][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000001048576000.pt. +[2025-10-17 15:54:11][logger:171][INFO] [step: 1048576000] [checkpoint/checkpoint_time: 1.610] +[2025-10-17 15:55:44][utils:57][INFO] [P: 51.00%] [S: 1069547520/2097152000] [T: 2:30:21] [ETA: 2:24:27] [loss: 3.505] [tokens/s: 116730.679] [batches/s: 0.056] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 15:57:18][utils:57][INFO] [P: 52.00%] [S: 1090519040/2097152000] [T: 2:31:54] [ETA: 2:20:13] [loss: 3.520] [tokens/s: 129159.943] [batches/s: 0.062] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 15:57:18][train:194][INFO] Running validation... 
+[2025-10-17 16:00:08][logger:171][INFO] [step: 1090519040] [val/train_token_count: 1090519040] [val/train_batch_count: 520] [val/train_flop_count: 0] [val/train_total_time: 9114.626] [val/train_update_time: 4857.963] [val/loss: 3.524] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 170.630] [val/val_tokens_per_second: 240051.300] [val/loss_avg_len_2048: 3.524] [val/perplexity_len_2048: 33.919] [val/loss_avg_len_1024: 3.595] [val/perplexity_len_1024: 36.430] [val/loss_avg_len_512: 3.695] [val/perplexity_len_512: 40.238] +[2025-10-17 16:02:02][utils:57][INFO] [P: 53.00%] [S: 1111490560/2097152000] [T: 2:36:38] [ETA: 2:18:54] [loss: 3.498] [tokens/s: 115509.315] [batches/s: 0.055] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 16:05:29][utils:57][INFO] [P: 54.00%] [S: 1132462080/2097152000] [T: 2:40:06] [ETA: 2:16:22] [loss: 3.512] [tokens/s: 119216.261] [batches/s: 0.057] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 16:05:29][train:194][INFO] Running validation... +[2025-10-17 16:09:02][logger:171][INFO] [step: 1132462080] [val/train_token_count: 1132462080] [val/train_batch_count: 540] [val/train_flop_count: 0] [val/train_total_time: 9606.105] [val/train_update_time: 5178.119] [val/loss: 3.503] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 213.205] [val/val_tokens_per_second: 192115.546] [val/loss_avg_len_2048: 3.503] [val/perplexity_len_2048: 33.229] [val/loss_avg_len_1024: 3.576] [val/perplexity_len_1024: 35.717] [val/loss_avg_len_512: 3.676] [val/perplexity_len_512: 39.474] +[2025-10-17 16:12:07][utils:57][INFO] [P: 55.00%] [S: 1153433600/2097152000] [T: 2:46:43] [ETA: 2:16:24] [loss: 3.468] [tokens/s: 101518.231] [batches/s: 0.048] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 16:12:07][logger:171][INFO] [step: 1153433600] [train_eval/train_token_count: 1153433600] [train_eval/train_batch_count: 550] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 10003.422] [train_eval/train_update_time: 5362.020] 
[train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.510] [train_eval/perplexity_len_2048: 33.445] [train_eval/loss_avg_len_1024: 3.577] [train_eval/perplexity_len_1024: 35.773] [train_eval/loss_avg_len_512: 3.674] [train_eval/perplexity_len_512: 39.420] +[2025-10-17 16:14:34][utils:57][INFO] [P: 56.00%] [S: 1174405120/2097152000] [T: 2:49:10] [ETA: 2:12:55] [loss: 3.506] [tokens/s: 107603.473] [batches/s: 0.051] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 16:14:34][train:194][INFO] Running validation... +[2025-10-17 16:18:40][logger:171][INFO] [step: 1174405120] [val/train_token_count: 1174405120] [val/train_batch_count: 560] [val/train_flop_count: 0] [val/train_total_time: 10150.525] [val/train_update_time: 5508.942] [val/loss: 3.486] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 246.225] [val/val_tokens_per_second: 166351.655] [val/loss_avg_len_2048: 3.486] [val/perplexity_len_2048: 32.655] [val/loss_avg_len_1024: 3.559] [val/perplexity_len_1024: 35.114] [val/loss_avg_len_512: 3.659] [val/perplexity_len_512: 38.821] +[2025-10-17 16:21:30][utils:57][INFO] [P: 57.00%] [S: 1195376640/2097152000] [T: 2:56:06] [ETA: 2:12:50] [loss: 3.430] [tokens/s: 92210.553] [batches/s: 0.044] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 16:24:44][utils:57][INFO] [P: 58.00%] [S: 1216348160/2097152000] [T: 2:59:20] [ETA: 2:09:52] [loss: 3.490] [tokens/s: 95132.421] [batches/s: 0.045] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 16:24:44][train:194][INFO] Running validation... 
+[2025-10-17 16:28:32][logger:171][INFO] [step: 1216348160] [val/train_token_count: 1216348160] [val/train_batch_count: 580] [val/train_flop_count: 0] [val/train_total_time: 10760.647] [val/train_update_time: 5872.135] [val/loss: 3.472] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 227.817] [val/val_tokens_per_second: 179793.158] [val/loss_avg_len_2048: 3.472] [val/perplexity_len_2048: 32.191] [val/loss_avg_len_1024: 3.544] [val/perplexity_len_1024: 34.622] [val/loss_avg_len_512: 3.645] [val/perplexity_len_512: 38.292] +[2025-10-17 16:31:19][utils:57][INFO] [P: 59.00%] [S: 1237319680/2097152000] [T: 3:05:56] [ETA: 2:09:12] [loss: 3.506] [tokens/s: 83562.533] [batches/s: 0.040] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 16:33:54][utils:57][INFO] [P: 60.00%] [S: 1258291200/2097152000] [T: 3:08:30] [ETA: 2:05:40] [loss: 3.509] [tokens/s: 87483.728] [batches/s: 0.042] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 16:33:54][logger:171][INFO] [step: 1258291200] [train_eval/train_token_count: 1258291200] [train_eval/train_batch_count: 600] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 11310.366] [train_eval/train_update_time: 6193.654] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.469] [train_eval/perplexity_len_2048: 32.111] [train_eval/loss_avg_len_1024: 3.535] [train_eval/perplexity_len_1024: 34.294] [train_eval/loss_avg_len_512: 3.635] [train_eval/perplexity_len_512: 37.889] +[2025-10-17 16:33:54][train:194][INFO] Running validation... 
+[2025-10-17 16:37:52][logger:171][INFO] [step: 1258291200] [val/train_token_count: 1258291200] [val/train_batch_count: 600] [val/train_flop_count: 0] [val/train_total_time: 11310.366] [val/train_update_time: 6193.654] [val/loss: 3.458] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 238.131] [val/val_tokens_per_second: 172006.003] [val/loss_avg_len_2048: 3.458] [val/perplexity_len_2048: 31.754] [val/loss_avg_len_1024: 3.531] [val/perplexity_len_1024: 34.160] [val/loss_avg_len_512: 3.632] [val/perplexity_len_512: 37.792] +[2025-10-17 16:37:52][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000001258291200.pt... +[2025-10-17 16:37:54][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000001258291200.pt. +[2025-10-17 16:37:54][logger:171][INFO] [step: 1258291200] [checkpoint/checkpoint_time: 1.537] +[2025-10-17 16:41:00][utils:57][INFO] [P: 61.00%] [S: 1279262720/2097152000] [T: 3:15:36] [ETA: 2:05:03] [loss: 3.467] [tokens/s: 76720.992] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 16:43:57][utils:57][INFO] [P: 62.00%] [S: 1300234240/2097152000] [T: 3:18:34] [ETA: 2:01:42] [loss: 3.441] [tokens/s: 79259.061] [batches/s: 0.038] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 16:43:57][train:194][INFO] Running validation... 
+[2025-10-17 16:48:04][logger:171][INFO] [step: 1300234240] [val/train_token_count: 1300234240] [val/train_batch_count: 620] [val/train_flop_count: 0] [val/train_total_time: 11914.081] [val/train_update_time: 6556.691] [val/loss: 3.444] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 246.788] [val/val_tokens_per_second: 165972.114] [val/loss_avg_len_2048: 3.444] [val/perplexity_len_2048: 31.324] [val/loss_avg_len_1024: 3.517] [val/perplexity_len_1024: 33.696] [val/loss_avg_len_512: 3.619] [val/perplexity_len_512: 37.285] +[2025-10-17 16:50:31][utils:57][INFO] [P: 63.00%] [S: 1321205760/2097152000] [T: 3:25:08] [ETA: 2:00:28] [loss: 3.455] [tokens/s: 71931.548] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 16:53:27][utils:57][INFO] [P: 64.00%] [S: 1342177280/2097152000] [T: 3:28:03] [ETA: 1:57:02] [loss: 3.447] [tokens/s: 78608.237] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 16:53:27][train:194][INFO] Running validation... +[2025-10-17 16:57:08][logger:171][INFO] [step: 1342177280] [val/train_token_count: 1342177280] [val/train_batch_count: 640] [val/train_flop_count: 0] [val/train_total_time: 12483.778] [val/train_update_time: 6879.229] [val/loss: 3.434] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 220.318] [val/val_tokens_per_second: 185912.978] [val/loss_avg_len_2048: 3.434] [val/perplexity_len_2048: 30.985] [val/loss_avg_len_1024: 3.507] [val/perplexity_len_1024: 33.349] [val/loss_avg_len_512: 3.608] [val/perplexity_len_512: 36.904] +[2025-10-17 17:00:29][utils:57][INFO] [P: 65.00%] [S: 1363148800/2097152000] [T: 3:35:05] [ETA: 1:55:49] [loss: 3.410] [tokens/s: 71902.990] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 17:00:29][logger:171][INFO] [step: 1363148800] [train_eval/train_token_count: 1363148800] [train_eval/train_batch_count: 650] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 12905.641] [train_eval/train_update_time: 7080.244] 
[train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.438] [train_eval/perplexity_len_2048: 31.119] [train_eval/loss_avg_len_1024: 3.509] [train_eval/perplexity_len_1024: 33.403] [train_eval/loss_avg_len_512: 3.608] [train_eval/perplexity_len_512: 36.877] +[2025-10-17 17:03:12][utils:57][INFO] [P: 66.00%] [S: 1384120320/2097152000] [T: 3:37:48] [ETA: 1:52:12] [loss: 3.453] [tokens/s: 78132.660] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 17:03:12][train:194][INFO] Running validation... +[2025-10-17 17:07:17][logger:171][INFO] [step: 1384120320] [val/train_token_count: 1384120320] [val/train_batch_count: 660] [val/train_flop_count: 0] [val/train_total_time: 13068.739] [val/train_update_time: 7242.851] [val/loss: 3.424] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 245.131] [val/val_tokens_per_second: 167094.665] [val/loss_avg_len_2048: 3.424] [val/perplexity_len_2048: 30.689] [val/loss_avg_len_1024: 3.498] [val/perplexity_len_1024: 33.044] [val/loss_avg_len_512: 3.599] [val/perplexity_len_512: 36.570] +[2025-10-17 17:09:44][utils:57][INFO] [P: 67.00%] [S: 1405091840/2097152000] [T: 3:44:21] [ETA: 1:50:30] [loss: 3.382] [tokens/s: 72309.301] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 17:12:55][utils:57][INFO] [P: 68.00%] [S: 1426063360/2097152000] [T: 3:47:31] [ETA: 1:47:04] [loss: 3.413] [tokens/s: 78647.123] [batches/s: 0.038] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 17:12:55][train:194][INFO] Running validation... 
+[2025-10-17 17:16:21][logger:171][INFO] [step: 1426063360] [val/train_token_count: 1426063360] [val/train_batch_count: 680] [val/train_flop_count: 0] [val/train_total_time: 13651.783] [val/train_update_time: 7580.372] [val/loss: 3.414] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 205.477] [val/val_tokens_per_second: 199341.303] [val/loss_avg_len_2048: 3.414] [val/perplexity_len_2048: 30.372] [val/loss_avg_len_1024: 3.487] [val/perplexity_len_1024: 32.694] [val/loss_avg_len_512: 3.589] [val/perplexity_len_512: 36.202] +[2025-10-17 17:19:55][utils:57][INFO] [P: 69.00%] [S: 1447034880/2097152000] [T: 3:54:31] [ETA: 1:45:21] [loss: 3.416] [tokens/s: 71588.559] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 17:22:25][utils:57][INFO] [P: 70.00%] [S: 1468006400/2097152000] [T: 3:57:01] [ETA: 1:41:34] [loss: 3.415] [tokens/s: 78156.638] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 17:22:25][logger:171][INFO] [step: 1468006400] [train_eval/train_token_count: 1468006400] [train_eval/train_batch_count: 700] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 14221.495] [train_eval/train_update_time: 7943.600] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.410] [train_eval/perplexity_len_2048: 30.263] [train_eval/loss_avg_len_1024: 3.481] [train_eval/perplexity_len_1024: 32.496] [train_eval/loss_avg_len_512: 3.582] [train_eval/perplexity_len_512: 35.944] +[2025-10-17 17:22:25][train:194][INFO] Running validation... 
+[2025-10-17 17:26:31][logger:171][INFO] [step: 1468006400] [val/train_token_count: 1468006400] [val/train_batch_count: 700] [val/train_flop_count: 0] [val/train_total_time: 14221.495] [val/train_update_time: 7943.600] [val/loss: 3.405] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 246.159] [val/val_tokens_per_second: 166396.303] [val/loss_avg_len_2048: 3.405] [val/perplexity_len_2048: 30.103] [val/loss_avg_len_1024: 3.478] [val/perplexity_len_1024: 32.408] [val/loss_avg_len_512: 3.581] [val/perplexity_len_512: 35.893] +[2025-10-17 17:26:31][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000001468006400.pt... +[2025-10-17 17:26:32][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000001468006400.pt. +[2025-10-17 17:26:32][logger:171][INFO] [step: 1468006400] [checkpoint/checkpoint_time: 1.549] +[2025-10-17 17:28:59][utils:57][INFO] [P: 71.00%] [S: 1488977920/2097152000] [T: 4:03:36] [ETA: 1:39:29] [loss: 3.425] [tokens/s: 72699.850] [batches/s: 0.035] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 17:32:25][utils:57][INFO] [P: 72.00%] [S: 1509949440/2097152000] [T: 4:07:01] [ETA: 1:36:03] [loss: 3.394] [tokens/s: 78465.256] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 17:32:25][train:194][INFO] Running validation... 
+[2025-10-17 17:35:39][logger:171][INFO] [step: 1509949440] [val/train_token_count: 1509949440] [val/train_batch_count: 720] [val/train_flop_count: 0] [val/train_total_time: 14821.624] [val/train_update_time: 8295.639] [val/loss: 3.396] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 194.166] [val/val_tokens_per_second: 210954.038] [val/loss_avg_len_2048: 3.396] [val/perplexity_len_2048: 29.849] [val/loss_avg_len_1024: 3.470] [val/perplexity_len_1024: 32.135] [val/loss_avg_len_512: 3.572] [val/perplexity_len_512: 35.591] +[2025-10-17 17:39:16][utils:57][INFO] [P: 73.00%] [S: 1530920960/2097152000] [T: 4:13:52] [ETA: 1:33:54] [loss: 3.416] [tokens/s: 71341.749] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 17:41:43][utils:57][INFO] [P: 74.00%] [S: 1551892480/2097152000] [T: 4:16:20] [ETA: 1:30:03] [loss: 3.377] [tokens/s: 78020.639] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 17:41:43][train:194][INFO] Running validation... +[2025-10-17 17:45:50][logger:171][INFO] [step: 1551892480] [val/train_token_count: 1551892480] [val/train_batch_count: 740] [val/train_flop_count: 0] [val/train_total_time: 15380.239] [val/train_update_time: 8659.388] [val/loss: 3.390] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 246.372] [val/val_tokens_per_second: 166252.603] [val/loss_avg_len_2048: 3.390] [val/perplexity_len_2048: 29.654] [val/loss_avg_len_1024: 3.464] [val/perplexity_len_1024: 31.933] [val/loss_avg_len_512: 3.566] [val/perplexity_len_512: 35.362] +[2025-10-17 17:48:17][utils:57][INFO] [P: 75.00%] [S: 1572864000/2097152000] [T: 4:22:53] [ETA: 1:27:37] [loss: 3.388] [tokens/s: 72991.067] [batches/s: 0.035] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 17:48:17][logger:171][INFO] [step: 1572864000] [train_eval/train_token_count: 1572864000] [train_eval/train_batch_count: 750] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 15773.814] [train_eval/train_update_time: 8806.402] 
[train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.390] [train_eval/perplexity_len_2048: 29.661] [train_eval/loss_avg_len_1024: 3.460] [train_eval/perplexity_len_1024: 31.831] [train_eval/loss_avg_len_512: 3.562] [train_eval/perplexity_len_512: 35.234] +[2025-10-17 17:51:52][utils:57][INFO] [P: 76.00%] [S: 1593835520/2097152000] [T: 4:26:28] [ETA: 1:24:08] [loss: 3.337] [tokens/s: 78063.028] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 17:51:52][train:194][INFO] Running validation... +[2025-10-17 17:54:59][logger:171][INFO] [step: 1593835520] [val/train_token_count: 1593835520] [val/train_batch_count: 760] [val/train_flop_count: 0] [val/train_total_time: 15988.253] [val/train_update_time: 9020.619] [val/loss: 3.383] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 186.935] [val/val_tokens_per_second: 219113.538] [val/loss_avg_len_2048: 3.383] [val/perplexity_len_2048: 29.460] [val/loss_avg_len_1024: 3.457] [val/perplexity_len_1024: 31.724] [val/loss_avg_len_512: 3.559] [val/perplexity_len_512: 35.144] +[2025-10-17 17:58:35][utils:57][INFO] [P: 77.00%] [S: 1614807040/2097152000] [T: 4:33:12] [ETA: 1:21:36] [loss: 3.408] [tokens/s: 71192.916] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 18:01:03][utils:57][INFO] [P: 78.00%] [S: 1635778560/2097152000] [T: 4:35:39] [ETA: 1:17:44] [loss: 3.354] [tokens/s: 77841.964] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 18:01:03][train:194][INFO] Running validation... 
+[2025-10-17 18:05:09][logger:171][INFO] [step: 1635778560] [val/train_token_count: 1635778560] [val/train_batch_count: 780] [val/train_flop_count: 0] [val/train_total_time: 16539.340] [val/train_update_time: 9384.053] [val/loss: 3.378] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 246.441] [val/val_tokens_per_second: 166206.356] [val/loss_avg_len_2048: 3.378] [val/perplexity_len_2048: 29.302] [val/loss_avg_len_1024: 3.452] [val/perplexity_len_1024: 31.559] [val/loss_avg_len_512: 3.554] [val/perplexity_len_512: 34.959] +[2025-10-17 18:07:36][utils:57][INFO] [P: 79.00%] [S: 1656750080/2097152000] [T: 4:42:12] [ETA: 1:15:01] [loss: 3.369] [tokens/s: 73013.636] [batches/s: 0.035] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 18:11:13][utils:57][INFO] [P: 80.00%] [S: 1677721600/2097152000] [T: 4:45:49] [ETA: 1:11:27] [loss: 3.356] [tokens/s: 77890.611] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 18:11:13][logger:171][INFO] [step: 1677721600] [train_eval/train_token_count: 1677721600] [train_eval/train_batch_count: 800] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 17149.438] [train_eval/train_update_time: 9747.288] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.374] [train_eval/perplexity_len_2048: 29.198] [train_eval/loss_avg_len_1024: 3.447] [train_eval/perplexity_len_1024: 31.396] [train_eval/loss_avg_len_512: 3.547] [train_eval/perplexity_len_512: 34.724] +[2025-10-17 18:11:13][train:194][INFO] Running validation... 
+[2025-10-17 18:14:24][logger:171][INFO] [step: 1677721600] [val/train_token_count: 1677721600] [val/train_batch_count: 800] [val/train_flop_count: 0] [val/train_total_time: 17149.438] [val/train_update_time: 9747.288] [val/loss: 3.372] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 191.129] [val/val_tokens_per_second: 214305.318] [val/loss_avg_len_2048: 3.372] [val/perplexity_len_2048: 29.146] [val/loss_avg_len_1024: 3.446] [val/perplexity_len_1024: 31.388] [val/loss_avg_len_512: 3.549] [val/perplexity_len_512: 34.773] +[2025-10-17 18:14:24][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000001677721600.pt... +[2025-10-17 18:14:25][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000001677721600.pt. +[2025-10-17 18:14:25][logger:171][INFO] [step: 1677721600] [checkpoint/checkpoint_time: 1.572] +[2025-10-17 18:17:52][utils:57][INFO] [P: 81.00%] [S: 1698693120/2097152000] [T: 4:52:29] [ETA: 1:08:36] [loss: 3.336] [tokens/s: 71141.408] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 18:20:20][utils:57][INFO] [P: 82.00%] [S: 1719664640/2097152000] [T: 4:54:56] [ETA: 1:04:44] [loss: 3.345] [tokens/s: 77883.412] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 18:20:20][train:194][INFO] Running validation... 
+[2025-10-17 18:24:26][logger:171][INFO] [step: 1719664640] [val/train_token_count: 1719664640] [val/train_batch_count: 820] [val/train_flop_count: 0] [val/train_total_time: 17696.359] [val/train_update_time: 10101.101] [val/loss: 3.368] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 246.331] [val/val_tokens_per_second: 166280.514] [val/loss_avg_len_2048: 3.368] [val/perplexity_len_2048: 29.029] [val/loss_avg_len_1024: 3.442] [val/perplexity_len_1024: 31.262] [val/loss_avg_len_512: 3.545] [val/perplexity_len_512: 34.640] +[2025-10-17 18:26:53][utils:57][INFO] [P: 83.00%] [S: 1740636160/2097152000] [T: 5:01:29] [ETA: 1:01:45] [loss: 3.380] [tokens/s: 73049.408] [batches/s: 0.035] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 18:30:30][utils:57][INFO] [P: 84.00%] [S: 1761607680/2097152000] [T: 5:05:06] [ETA: 0:58:07] [loss: 3.310] [tokens/s: 77887.981] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 18:30:30][train:194][INFO] Running validation... +[2025-10-17 18:33:52][logger:171][INFO] [step: 1761607680] [val/train_token_count: 1761607680] [val/train_batch_count: 840] [val/train_flop_count: 0] [val/train_total_time: 18306.999] [val/train_update_time: 10465.017] [val/loss: 3.365] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 201.660] [val/val_tokens_per_second: 203114.557] [val/loss_avg_len_2048: 3.365] [val/perplexity_len_2048: 28.921] [val/loss_avg_len_1024: 3.439] [val/perplexity_len_1024: 31.146] [val/loss_avg_len_512: 3.541] [val/perplexity_len_512: 34.510] +[2025-10-17 18:37:09][utils:57][INFO] [P: 85.00%] [S: 1782579200/2097152000] [T: 5:11:46] [ETA: 0:55:01] [loss: 3.388] [tokens/s: 71159.479] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 18:37:09][logger:171][INFO] [step: 1782579200] [train_eval/train_token_count: 1782579200] [train_eval/train_batch_count: 850] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 18706.207] [train_eval/train_update_time: 10662.364] 
[train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.357] [train_eval/perplexity_len_2048: 28.697] [train_eval/loss_avg_len_1024: 3.422] [train_eval/perplexity_len_1024: 30.643] [train_eval/loss_avg_len_512: 3.524] [train_eval/perplexity_len_512: 33.909] +[2025-10-17 18:39:37][utils:57][INFO] [P: 86.00%] [S: 1803550720/2097152000] [T: 5:14:13] [ETA: 0:51:09] [loss: 3.372] [tokens/s: 78216.845] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 18:39:37][train:194][INFO] Running validation... +[2025-10-17 18:43:41][logger:171][INFO] [step: 1803550720] [val/train_token_count: 1803550720] [val/train_batch_count: 860] [val/train_flop_count: 0] [val/train_total_time: 18853.344] [val/train_update_time: 10809.319] [val/loss: 3.361] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 244.710] [val/val_tokens_per_second: 167382.117] [val/loss_avg_len_2048: 3.361] [val/perplexity_len_2048: 28.831] [val/loss_avg_len_1024: 3.436] [val/perplexity_len_1024: 31.054] [val/loss_avg_len_512: 3.538] [val/perplexity_len_512: 34.410] +[2025-10-17 18:46:18][utils:57][INFO] [P: 87.00%] [S: 1824522240/2097152000] [T: 5:20:54] [ETA: 0:47:57] [loss: 3.314] [tokens/s: 72916.286] [batches/s: 0.035] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 18:49:45][utils:57][INFO] [P: 88.00%] [S: 1845493760/2097152000] [T: 5:24:22] [ETA: 0:44:13] [loss: 3.329] [tokens/s: 78002.026] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 18:49:45][train:194][INFO] Running validation... 
+[2025-10-17 18:53:17][logger:171][INFO] [step: 1845493760] [val/train_token_count: 1845493760] [val/train_batch_count: 880] [val/train_flop_count: 0] [val/train_total_time: 19462.212] [val/train_update_time: 11172.773] [val/loss: 3.359] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 212.012] [val/val_tokens_per_second: 193196.951] [val/loss_avg_len_2048: 3.359] [val/perplexity_len_2048: 28.760] [val/loss_avg_len_1024: 3.433] [val/perplexity_len_1024: 30.976] [val/loss_avg_len_512: 3.536] [val/perplexity_len_512: 34.325] +[2025-10-17 18:56:21][utils:57][INFO] [P: 89.00%] [S: 1866465280/2097152000] [T: 5:30:58] [ETA: 0:40:54] [loss: 3.398] [tokens/s: 71332.528] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 18:58:48][utils:57][INFO] [P: 90.00%] [S: 1887436800/2097152000] [T: 5:33:24] [ETA: 0:37:02] [loss: 3.310] [tokens/s: 78663.114] [batches/s: 0.038] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 18:58:48][logger:171][INFO] [step: 1887436800] [train_eval/train_token_count: 1887436800] [train_eval/train_batch_count: 900] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 20004.831] [train_eval/train_update_time: 11502.972] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.352] [train_eval/perplexity_len_2048: 28.574] [train_eval/loss_avg_len_1024: 3.421] [train_eval/perplexity_len_1024: 30.612] [train_eval/loss_avg_len_512: 3.523] [train_eval/perplexity_len_512: 33.884] +[2025-10-17 18:58:48][train:194][INFO] Running validation... 
+[2025-10-17 19:02:54][logger:171][INFO] [step: 1887436800] [val/train_token_count: 1887436800] [val/train_batch_count: 900] [val/train_flop_count: 0] [val/train_total_time: 20004.831] [val/train_update_time: 11502.972] [val/loss: 3.357] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 246.356] [val/val_tokens_per_second: 166263.403] [val/loss_avg_len_2048: 3.357] [val/perplexity_len_2048: 28.702] [val/loss_avg_len_1024: 3.431] [val/perplexity_len_1024: 30.916] [val/loss_avg_len_512: 3.534] [val/perplexity_len_512: 34.260] +[2025-10-17 19:02:54][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000001887436800.pt... +[2025-10-17 19:02:56][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/forgetting_gate_6_8_512/checkpoints/step-000001887436800.pt. +[2025-10-17 19:02:56][logger:171][INFO] [step: 1887436800] [checkpoint/checkpoint_time: 1.561] +[2025-10-17 19:05:47][utils:57][INFO] [P: 91.00%] [S: 1908408320/2097152000] [T: 5:40:24] [ETA: 0:33:39] [loss: 3.360] [tokens/s: 72590.713] [batches/s: 0.035] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 19:09:00][utils:57][INFO] [P: 92.00%] [S: 1929379840/2097152000] [T: 5:43:36] [ETA: 0:29:52] [loss: 3.372] [tokens/s: 78076.895] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 19:09:00][train:194][INFO] Running validation... 
+[2025-10-17 19:12:48][logger:171][INFO] [step: 1929379840] [val/train_token_count: 1929379840] [val/train_batch_count: 920] [val/train_flop_count: 0] [val/train_total_time: 20616.579] [val/train_update_time: 11866.318] [val/loss: 3.355] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 228.491] [val/val_tokens_per_second: 179262.736] [val/loss_avg_len_2048: 3.355] [val/perplexity_len_2048: 28.659] [val/loss_avg_len_1024: 3.430] [val/perplexity_len_1024: 30.869] [val/loss_avg_len_512: 3.532] [val/perplexity_len_512: 34.208] +[2025-10-17 19:15:34][utils:57][INFO] [P: 93.00%] [S: 1950351360/2097152000] [T: 5:50:10] [ETA: 0:26:21] [loss: 3.365] [tokens/s: 71631.164] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 19:18:10][utils:57][INFO] [P: 94.00%] [S: 1971322880/2097152000] [T: 5:52:46] [ETA: 0:22:31] [loss: 3.320] [tokens/s: 78793.199] [batches/s: 0.038] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 19:18:10][train:194][INFO] Running validation... +[2025-10-17 19:22:07][logger:171][INFO] [step: 1971322880] [val/train_token_count: 1971322880] [val/train_batch_count: 940] [val/train_flop_count: 0] [val/train_total_time: 21166.987] [val/train_update_time: 12187.850] [val/loss: 3.354] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 236.168] [val/val_tokens_per_second: 173436.197] [val/loss_avg_len_2048: 3.354] [val/perplexity_len_2048: 28.631] [val/loss_avg_len_1024: 3.429] [val/perplexity_len_1024: 30.840] [val/loss_avg_len_512: 3.531] [val/perplexity_len_512: 34.175] +[2025-10-17 19:25:13][utils:57][INFO] [P: 95.00%] [S: 1992294400/2097152000] [T: 5:59:49] [ETA: 0:18:56] [loss: 3.327] [tokens/s: 72375.858] [batches/s: 0.035] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 19:25:13][logger:171][INFO] [step: 1992294400] [train_eval/train_token_count: 1992294400] [train_eval/train_batch_count: 950] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 21589.576] [train_eval/train_update_time: 12373.759] 
[train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 3.342] [train_eval/perplexity_len_2048: 28.287] [train_eval/loss_avg_len_1024: 3.414] [train_eval/perplexity_len_1024: 30.397] [train_eval/loss_avg_len_512: 3.514] [train_eval/perplexity_len_512: 33.585] +[2025-10-17 19:28:11][utils:57][INFO] [P: 96.00%] [S: 2013265920/2097152000] [T: 6:02:47] [ETA: 0:15:06] [loss: 3.331] [tokens/s: 78202.226] [batches/s: 0.037] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 19:28:11][train:194][INFO] Running validation... +[2025-10-17 19:32:16][logger:171][INFO] [step: 2013265920] [val/train_token_count: 2013265920] [val/train_batch_count: 960] [val/train_flop_count: 0] [val/train_total_time: 21767.690] [val/train_update_time: 12551.348] [val/loss: 3.354] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 244.702] [val/val_tokens_per_second: 167386.972] [val/loss_avg_len_2048: 3.354] [val/perplexity_len_2048: 28.614] [val/loss_avg_len_1024: 3.428] [val/perplexity_len_1024: 30.821] [val/loss_avg_len_512: 3.531] [val/perplexity_len_512: 34.155] +[2025-10-17 19:34:43][utils:57][INFO] [P: 97.00%] [S: 2034237440/2097152000] [T: 6:09:19] [ETA: 0:11:25] [loss: 3.359] [tokens/s: 72048.637] [batches/s: 0.034] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 19:37:39][utils:57][INFO] [P: 98.00%] [S: 2055208960/2097152000] [T: 6:12:15] [ETA: 0:07:35] [loss: 3.334] [tokens/s: 78695.239] [batches/s: 0.038] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-17 19:37:39][train:194][INFO] Running validation... 
+[2025-10-17 19:41:19][logger:171][INFO] [step: 2055208960] [val/train_token_count: 2055208960] [val/train_batch_count: 980] [val/train_flop_count: 0] [val/train_total_time: 22335.749] [val/train_update_time: 12874.330] [val/loss: 3.354] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 220.122] [val/val_tokens_per_second: 186078.880] [val/loss_avg_len_2048: 3.354] [val/perplexity_len_2048: 28.606] [val/loss_avg_len_1024: 3.428] [val/perplexity_len_1024: 30.812] [val/loss_avg_len_512: 3.531] [val/perplexity_len_512: 34.145] +[2025-10-17 19:41:19][train:854][INFO] Training finished with 2055208960 tokens! diff --git a/metrics/jsonlines/checkpoint.jsonl b/metrics/jsonlines/checkpoint.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2c18902318d6638492dd8aa9d1d2a6fd5620d7a1 --- /dev/null +++ b/metrics/jsonlines/checkpoint.jsonl @@ -0,0 +1,9 @@ +{"step": 209715200, "checkpoint/checkpoint_time": 1.5373147319769487} +{"step": 419430400, "checkpoint/checkpoint_time": 1.572724198922515} +{"step": 629145600, "checkpoint/checkpoint_time": 1.5360865560360253} +{"step": 838860800, "checkpoint/checkpoint_time": 1.5895927330711856} +{"step": 1048576000, "checkpoint/checkpoint_time": 1.6095300159649923} +{"step": 1258291200, "checkpoint/checkpoint_time": 1.5371675649657845} +{"step": 1468006400, "checkpoint/checkpoint_time": 1.5488120779627934} +{"step": 1677721600, "checkpoint/checkpoint_time": 1.5724514800822362} +{"step": 1887436800, "checkpoint/checkpoint_time": 1.5613262739498168} diff --git a/metrics/jsonlines/model_info.jsonl b/metrics/jsonlines/model_info.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bd3c5385dac56f1ecda5ddcfd7ea3a57ea68895e --- /dev/null +++ b/metrics/jsonlines/model_info.jsonl @@ -0,0 +1 @@ +{"step": 0, "model_info/total_params": 71962160, "model_info/trainable_params": 71962160, "model_info/embedding_params": 25741824, "model_info/flops_per_token": 0, 
"model_info/non_embedding_params": 46220336} diff --git a/metrics/jsonlines/norm.jsonl b/metrics/jsonlines/norm.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e0adee361653c998e315cb4914fac4d70e34793f --- /dev/null +++ b/metrics/jsonlines/norm.jsonl @@ -0,0 +1,98 @@ +{"step": 20971520, "pnorm/_forward_module.model.embeddings.weight": 101.81584930419922, "gnorm/_forward_module.model.embeddings.weight": 0.06869454681873322, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.617353439331055, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0016509891720488667, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 10.29729175567627, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.004574252292513847, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 10.30891227722168, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.004276220686733723, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.288520812988281, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.04839729890227318, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.308958053588867, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.045711617916822433, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.290590524673462, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.0063759456388652325, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.009432925842702389, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0029194196686148643, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.644020080566406, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0010992292081937194, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 25.261024475097656, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.03170783072710037, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 17.826223373413086, 
"gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.023950468748807907, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.61376953125, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0010983363026753068, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.277763366699219, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.0028180419467389584, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 10.29810905456543, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.002785311546176672, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.281525611877441, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.029189206659793854, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.294316291809082, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.029758155345916748, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.2956374883651733, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.012822494842112064, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0038774865679442883, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0011934550711885095, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.629898071289062, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0008333436562679708, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 25.243446350097656, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.021816885098814964, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 17.836523056030273, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.016327911987900734, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.617870330810547, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0007926493999548256, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 10.331307411193848, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 
0.0019470715196803212, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 10.299418449401855, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.0018506657797843218, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.290188789367676, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.022145478054881096, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.265390396118164, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.028168268501758575, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 1.2813464403152466, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.010303563438355923, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.00432300241664052, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0006671957089565694, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.625865936279297, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0005588842905126512, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 25.246349334716797, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.01609724946320057, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 17.8576717376709, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.013793759979307652, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.62416648864746, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0006023735622875392, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 10.327676773071289, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.0012406462337821722, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 10.303221702575684, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.0011031782487407327, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.276890754699707, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.0187190230935812, 
"pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 10.281161308288574, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.02795841544866562, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 1.2862451076507568, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.007216785568743944, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.003768933704122901, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0004013867874164134, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.627944946289062, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.00043084961362183094, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 25.22796630859375, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.01563522219657898, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 17.861671447753906, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.01501774974167347, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 22.628366470336914, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.000659311655908823, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 10.318748474121094, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.0006316258222796023, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 10.284245491027832, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.0005636459682136774, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 10.28686237335205, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.018689164891839027, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 10.314923286437988, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.029897956177592278, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 1.2906819581985474, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.004329283721745014, 
"pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.004539692308753729, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0002095368254231289, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.633237838745117, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0005545919993892312, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 25.215951919555664, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.01834973506629467, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 17.84881019592285, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.0184263177216053, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.629703521728516, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0008004084811545908, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 10.302168846130371, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.0003756276855710894, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 10.29596996307373, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.0003510359092615545, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 10.293839454650879, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.022132596001029015, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 10.319414138793945, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.03259901702404022, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 1.3089790344238281, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.0027386643923819065, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.004615094978362322, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.00012222363147884607, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.633485794067383, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0007963152602314949, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 
25.23259162902832, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.02013680897653103, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 17.823854446411133, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.020861024037003517, "pnorm/_forward_module.model.norm.weight": 22.66058349609375, "gnorm/_forward_module.model.norm.weight": 0.0569557286798954, "pnorm/_forward_module.lm_head.weight": 101.96174621582031, "gnorm/_forward_module.lm_head.weight": 0.9876343607902527} +{"step": 41943040, "pnorm/_forward_module.model.embeddings.weight": 103.18702697753906, "gnorm/_forward_module.model.embeddings.weight": 0.19666098058223724, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.655452728271484, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.007538936100900173, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 10.683125495910645, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.009629827924072742, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 10.70488166809082, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.00856009405106306, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.505414009094238, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.19911183416843414, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.514557838439941, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.20896027982234955, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.362101674079895, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.010439998470246792, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.03535962104797363, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0011666429927572608, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.68776512145996, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.006809588987380266, 
"pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 25.788516998291016, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.17908670008182526, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 18.19399642944336, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.14734670519828796, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.609352111816406, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0055291480384767056, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.484845161437988, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.009505169466137886, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 10.512873649597168, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.008511153049767017, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.437174797058105, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.1525234878063202, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.449408531188965, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.14439615607261658, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.3927792310714722, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.007044985890388489, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0062977164052426815, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0003885702171828598, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.64682388305664, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.003475162200629711, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 25.705564498901367, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.0988364964723587, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 18.177331924438477, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.08906685560941696, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 
22.60630989074707, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.00349959684535861, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 10.539627075195312, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.006151949055492878, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 10.507065773010254, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.005213597323745489, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.443143844604492, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.09138711541891098, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.419620513916016, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.08430248498916626, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 1.360091209411621, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.005488235969096422, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.010254627093672752, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.00031645759008824825, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.622915267944336, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.002194090746343136, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 25.669544219970703, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.06746447831392288, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 18.16973876953125, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.05668458715081215, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.611255645751953, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.002679024590179324, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 10.506802558898926, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.005259388126432896, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 10.47215747833252, 
"gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.005143946968019009, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.437226295471191, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.06982395052909851, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 10.446187973022461, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.0627862885594368, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 1.3637146949768066, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.006106208544224501, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.009373653680086136, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.00021486994228325784, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.610443115234375, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0017554149962961674, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 25.63470458984375, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.05413088575005531, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 18.16493034362793, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.04722768813371658, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 22.614177703857422, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.002250468358397484, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 10.479959487915039, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.0038514207117259502, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 10.443544387817383, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.003003370249643922, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 10.455085754394531, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.05936587601900101, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 10.482765197753906, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 
0.06289776414632797, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 1.3570282459259033, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.004071301314979792, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.013678374700248241, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.00024563088663853705, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.601470947265625, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.002035983605310321, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 25.60074806213379, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.0567190907895565, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 18.13294792175293, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.05034729093313217, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.615615844726562, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0019135575275868177, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 10.453218460083008, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.002424027770757675, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 10.444467544555664, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.002523045288398862, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 10.466025352478027, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.051584143191576004, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 10.491785049438477, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.06395812332630157, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 1.3542327880859375, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.0030450790654867887, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.011473423801362514, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.00015824215370230377, 
"pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.592199325561523, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0020691128447651863, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 25.604280471801758, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.05284876748919487, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 18.095787048339844, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.047348182648420334, "pnorm/_forward_module.model.norm.weight": 22.82024383544922, "gnorm/_forward_module.model.norm.weight": 0.02021314948797226, "pnorm/_forward_module.lm_head.weight": 109.1305160522461, "gnorm/_forward_module.lm_head.weight": 0.613010585308075} +{"step": 62914560, "pnorm/_forward_module.model.embeddings.weight": 104.88796997070312, "gnorm/_forward_module.model.embeddings.weight": 0.1025189757347107, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.719860076904297, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.004939389415085316, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 11.128229141235352, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.011005526408553123, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 11.163755416870117, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.008986598812043667, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.692118644714355, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.10085190087556839, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.692058563232422, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.10163657367229462, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.4433810710906982, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.008346364833414555, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.05477088689804077, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 
0.001424093614332378, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.73202896118164, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0054967645555734634, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.1516056060791, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.08966707438230515, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 18.433656692504883, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.08510471880435944, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.62489128112793, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.003507278859615326, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.60677719116211, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.005097005981951952, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 10.63875961303711, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.005461358465254307, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.552125930786133, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.06709662079811096, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.560407638549805, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.06704068183898926, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.4975993633270264, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.004895268008112907, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.012294040992856026, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00029022671515122056, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.661087036132812, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.002694769762456417, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 25.98029136657715, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.05590150132775307, 
"pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 18.379966735839844, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.06475444883108139, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.602642059326172, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0020425161346793175, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 10.660996437072754, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.0035288354847580194, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 10.6280517578125, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.003947612829506397, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.521777153015137, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.041523296386003494, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.5033540725708, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.03891300782561302, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 1.4482871294021606, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.0038307325448840857, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.01765078864991665, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.00025077848113141954, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.624366760253906, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.001541845384053886, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 25.910226821899414, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.038379643112421036, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 18.356290817260742, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.04128168150782585, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.592409133911133, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.001665607444010675, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 
10.610779762268066, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.0038814737927168608, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 10.570647239685059, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.003478498198091984, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.495216369628906, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.03607863560318947, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 10.514289855957031, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.030424360185861588, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 1.4417190551757812, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.004751028958708048, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.017058830708265305, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0002672166156116873, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.600055694580078, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0017562283901497722, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 25.862957000732422, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.03899180144071579, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 18.342039108276367, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.03161240369081497, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 22.587947845458984, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0015075609553605318, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 10.5874662399292, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.0044016242027282715, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 10.546067237854004, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.004575047176331282, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 10.500486373901367, 
"gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.035919733345508575, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 10.536487579345703, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.028524480760097504, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 1.41974937915802, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.006768654100596905, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.02287006936967373, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0004476241010706872, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.57424545288086, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0027162842452526093, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 25.807453155517578, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.05025612935423851, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 18.28801155090332, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.03361076861619949, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.585412979125977, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0015436011599376798, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 10.561579704284668, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.006316781509667635, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 10.555158615112305, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.006311678793281317, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 10.514095306396484, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.04088365659117699, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 10.548105239868164, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.03263513371348381, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 1.3972113132476807, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 
0.005797999911010265, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.015622096136212349, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0002732254797592759, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.54882049560547, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.003141095396131277, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 25.795900344848633, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.05698899179697037, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 18.22639274597168, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.03376070782542229, "pnorm/_forward_module.model.norm.weight": 22.894630432128906, "gnorm/_forward_module.model.norm.weight": 0.01964632235467434, "pnorm/_forward_module.lm_head.weight": 115.21121978759766, "gnorm/_forward_module.lm_head.weight": 0.31669971346855164} +{"step": 83886080, "pnorm/_forward_module.model.embeddings.weight": 106.83600616455078, "gnorm/_forward_module.model.embeddings.weight": 0.1316344439983368, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.775516510009766, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0033321096561849117, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 11.481205940246582, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.015032563358545303, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 11.5246000289917, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.010887769982218742, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.847792625427246, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.08262234181165695, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.841447830200195, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.08642872422933578, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.4939213991165161, 
"gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.011053667403757572, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0642610415816307, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0006266768323257565, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.760208129882812, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0024449783377349377, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.4133358001709, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.0635531097650528, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 18.60830307006836, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.05493597686290741, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.642797470092773, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0019923350773751736, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.704547882080078, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.014762197621166706, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 10.74113655090332, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.015568328090012074, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.639788627624512, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.03926309943199158, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.645692825317383, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.041322268545627594, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.5617039203643799, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.01914738118648529, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.013953110203146935, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0008981976425275207, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.65042495727539, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 
0.0012231635628268123, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 26.139623641967773, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.032895758748054504, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 18.50960350036621, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03192935511469841, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.60746192932129, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0009000562131404877, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 10.793282508850098, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.005929093342274427, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 10.75982666015625, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.005741813685745001, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.574904441833496, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.021997131407260895, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.562763214111328, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.022791419178247452, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 1.5088341236114502, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.005339370109140873, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.02315177395939827, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0003472678945399821, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.603839874267578, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.00073584308847785, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 26.047420501708984, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.020620718598365784, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 18.472381591796875, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.017804833129048347, 
"pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.586973190307617, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0006952830590307713, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 10.73270320892334, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.0026739079039543867, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 10.690000534057617, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.002474699169397354, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.533257484436035, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.019105004146695137, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 10.56136417388916, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.017869656905531883, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 1.4938703775405884, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.00265257665887475, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.02260523848235607, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.00013999533257447183, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.5675106048584, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0006456687697209418, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 25.9818115234375, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.0169414933770895, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 18.438438415527344, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.01506109070032835, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 22.576337814331055, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0006540013127960265, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 10.707620620727539, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.004992831498384476, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 
10.663607597351074, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.004952191840857267, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 10.53066349029541, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.017709946259856224, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 10.574719429016113, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.016204170882701874, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 1.4692668914794922, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.00508389575406909, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.027870845049619675, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.00021639927581418306, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.532045364379883, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0008960103150457144, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 25.9154052734375, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.020438408479094505, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 18.358776092529297, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.01479102112352848, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.569896697998047, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0007873183349147439, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 10.709626197814941, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.002932377392426133, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 10.698485374450684, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.0031206784769892693, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 10.543062210083008, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.021326713263988495, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 10.58206558227539, 
"gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.017185091972351074, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 1.4422718286514282, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.003336995141580701, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.021667849272489548, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.00016211142065003514, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.508878707885742, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0012765902793034911, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 25.91353416442871, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.02539779432117939, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 18.286029815673828, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.013882339000701904, "pnorm/_forward_module.model.norm.weight": 22.96924591064453, "gnorm/_forward_module.model.norm.weight": 0.013659648597240448, "pnorm/_forward_module.lm_head.weight": 119.22555541992188, "gnorm/_forward_module.lm_head.weight": 0.2103087306022644} +{"step": 104857600, "pnorm/_forward_module.model.embeddings.weight": 108.76527404785156, "gnorm/_forward_module.model.embeddings.weight": 0.0988059863448143, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.824087142944336, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.003368658944964409, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 11.794903755187988, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.016469884663820267, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 11.851043701171875, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.014318790286779404, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.964425086975098, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.07618066668510437, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 
10.950562477111816, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.08261243999004364, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.5321335792541504, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.01662900671362877, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.07160966098308563, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0009615309536457062, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.778173446655273, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0035780102480202913, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.60634994506836, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.06280063837766647, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 18.74416160583496, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.06500120460987091, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.650901794433594, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.002324324334040284, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.808197021484375, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.013642110861837864, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 10.84225082397461, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.013165615499019623, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.686192512512207, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.0460541695356369, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.694801330566406, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.05384279415011406, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.6083064079284668, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.01458628848195076, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.018821043893694878, 
"gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0006952470866963267, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.633716583251953, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.001352748367935419, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 26.243896484375, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.03713667392730713, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 18.607742309570312, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.04014023393392563, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.60975456237793, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0012576788431033492, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 10.933894157409668, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.006281932350248098, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 10.892952919006348, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.006226254627108574, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.602370262145996, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.030910717323422432, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.598128318786621, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.028728479519486427, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 1.557499885559082, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.005386714823544025, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.031153308227658272, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.000558873696718365, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.582597732543945, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0007571959285996854, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 26.146347045898438, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 
0.024403586983680725, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 18.564661026000977, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.022008227184414864, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.583702087402344, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0010362648172304034, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 10.8764009475708, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.0037515603471547365, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 10.826032638549805, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.004400154110044241, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.554410934448242, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.03166214004158974, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 10.589902877807617, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.024834230542182922, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 1.5280916690826416, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.006276309490203857, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.029609646648168564, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0007125699776224792, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.543535232543945, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.001113652135245502, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 26.077869415283203, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.02389681339263916, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 18.52174949645996, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.020319726318120956, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 22.572967529296875, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0013868431560695171, 
"pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 10.812032699584961, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.006953163538128138, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 10.773788452148438, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.007686286699026823, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 10.558189392089844, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.03382859751582146, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 10.60929012298584, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.025318192318081856, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 1.5158807039260864, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.006155950948596001, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.03211659938097, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0004619324463419616, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.50444793701172, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0016975083854049444, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 26.005090713500977, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.031058358028531075, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 18.419004440307617, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.019784197211265564, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.568954467773438, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0016793108079582453, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 10.851700782775879, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.008938158862292767, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 10.831888198852539, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.007943904027342796, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 
10.578812599182129, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.0427875779569149, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 10.620943069458008, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.03135610371828079, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 1.481346607208252, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.009231198579072952, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.030236247926950455, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0006436349940486252, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.48604393005371, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.003069445490837097, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 26.00773811340332, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.05107402428984642, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 18.33025360107422, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.024478260427713394, "pnorm/_forward_module.model.norm.weight": 23.0690975189209, "gnorm/_forward_module.model.norm.weight": 0.011584791354835033, "pnorm/_forward_module.lm_head.weight": 122.4480972290039, "gnorm/_forward_module.lm_head.weight": 0.2163497358560562} +{"step": 125829120, "pnorm/_forward_module.model.embeddings.weight": 110.539794921875, "gnorm/_forward_module.model.embeddings.weight": 0.13281475007534027, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.868183135986328, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.004224075004458427, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 12.056949615478516, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.015236386097967625, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 12.124784469604492, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.016914954409003258, 
"pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.059662818908691, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.1036851555109024, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.037720680236816, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.11615312099456787, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.574399471282959, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.01207314245402813, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.07757940143346786, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.00048717574100010097, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.784244537353516, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.00437500374391675, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.73851203918457, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.09157276898622513, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 18.84303855895996, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.10378007590770721, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.65324592590332, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0030961965676397085, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.907236099243164, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.010707822628319263, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 10.932843208312988, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.013345225714147091, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.707816123962402, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.08005566895008087, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.72025203704834, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.09847377985715866, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 
1.6529532670974731, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.008171234279870987, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.02568211406469345, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0006001560832373798, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.613595962524414, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.001832918031141162, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 26.313701629638672, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.060162253677845, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 18.677331924438477, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.06206365302205086, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.610483169555664, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0025177819188684225, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 11.062769889831543, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.012116669677197933, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.016406059265137, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.01594688557088375, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.61920166015625, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.058585118502378464, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.623127937316895, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.05276278406381607, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 1.593552827835083, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.01483570970594883, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.040804117918014526, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0012305979616940022, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.567291259765625, 
"gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0012905660551041365, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 26.232860565185547, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.04012874513864517, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 18.639114379882812, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03616517782211304, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.585166931152344, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.001965865259990096, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 11.015167236328125, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.00862634927034378, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 10.9642915725708, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.011770686134696007, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.575623512268066, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.057836540043354034, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 10.618704795837402, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.043824005872011185, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 1.5611770153045654, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.008931995369493961, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.03629131615161896, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0007217960665002465, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.529340744018555, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0018260062206536531, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 26.176589965820312, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.03611484915018082, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 18.60235595703125, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 
0.03170987591147423, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 22.574865341186523, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0022885026410222054, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 10.925511360168457, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.012125948444008827, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 10.89113998413086, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.014009195379912853, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 10.583314895629883, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.05077369883656502, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 10.64307975769043, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.04018859937787056, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 1.5575289726257324, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.010789363645017147, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.03770965337753296, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.000804465205874294, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.493160247802734, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0022306276950985193, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 26.110389709472656, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.04047847539186478, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 18.491744995117188, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.029377514496445656, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.578500747680664, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.003157768864184618, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 10.98249340057373, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.017340989783406258, 
"pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 10.953262329101562, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.01773030497133732, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 10.623862266540527, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.0705447718501091, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 10.671080589294434, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.04896058142185211, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 1.5343643426895142, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.026394348591566086, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.03853106498718262, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0023084457498043776, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.475759506225586, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.005168651230633259, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 26.098526000976562, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.07813906669616699, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 18.379335403442383, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.038071539252996445, "pnorm/_forward_module.model.norm.weight": 23.194360733032227, "gnorm/_forward_module.model.norm.weight": 0.011216702871024609, "pnorm/_forward_module.lm_head.weight": 125.59259033203125, "gnorm/_forward_module.lm_head.weight": 0.23988300561904907} +{"step": 146800640, "pnorm/_forward_module.model.embeddings.weight": 112.16051483154297, "gnorm/_forward_module.model.embeddings.weight": 0.17487072944641113, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.905677795410156, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0045369332656264305, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 12.260844230651855, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 
0.04342787340283394, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 12.33402156829834, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.0445626825094223, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.13553237915039, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.10954815149307251, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.106432914733887, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.12415492534637451, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.620123028755188, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.04885854199528694, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.08336569368839264, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0032211076468229294, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.781539916992188, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.003704165341332555, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.826221466064453, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.0784304141998291, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 18.91067123413086, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.07833585143089294, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.65464973449707, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0033845240250229836, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 10.998577117919922, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.03461149334907532, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.013505935668945, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.027810871601104736, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.720010757446289, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.06799273192882538, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 
10.736266136169434, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.07398910075426102, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.695178508758545, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.030899381265044212, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.03349019214510918, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0013029540423303843, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.594846725463867, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0016050589038059115, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 26.369813919067383, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.04369986429810524, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 18.725732803344727, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.04256639629602432, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.615812301635742, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0020030729938298464, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 11.166619300842285, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.011723566800355911, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.11417293548584, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.016622940078377724, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.642720222473145, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.046804070472717285, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.653688430786133, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.04402254521846771, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 1.6359471082687378, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.016323016956448555, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.050704002380371094, 
"gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0014328381512314081, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.553266525268555, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0009945271303877234, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 26.309879302978516, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.03273516520857811, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 18.696025848388672, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.026959910988807678, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.59424591064453, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0013484113151207566, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 11.132234573364258, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.00568043626844883, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 11.073589324951172, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.007774591911584139, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.60438346862793, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.041327282786369324, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 10.655449867248535, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.03833989053964615, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 1.6075273752212524, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.008202757686376572, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.043013233691453934, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0009167056996375322, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.521343231201172, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0009043567697517574, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 26.277976989746094, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 
0.028673401102423668, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 18.67426109313965, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.025584401562809944, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 22.58023452758789, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0016388295916840434, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 11.02031421661377, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.009991725906729698, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 10.985873222351074, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.01203190442174673, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 10.611977577209473, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.03942820429801941, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 10.681294441223145, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.035549141466617584, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 1.604661226272583, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.01585080660879612, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.043270133435726166, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0016493391012772918, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.492408752441406, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0016559226205572486, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 26.232797622680664, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.036421943455934525, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 18.57244873046875, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.026001645252108574, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.581708908081055, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.00200664927251637, 
"pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 11.077229499816895, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.020689914003014565, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 11.040599822998047, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.019657179713249207, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 10.658724784851074, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.050644248723983765, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 10.714216232299805, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.03804061934351921, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 1.6003793478012085, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.011447093449532986, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.04623497277498245, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0009182156063616276, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.4737606048584, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0047119236551225185, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 26.20450782775879, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.0780734196305275, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 18.44142723083496, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.03543779253959656, "pnorm/_forward_module.model.norm.weight": 23.335927963256836, "gnorm/_forward_module.model.norm.weight": 0.01244575809687376, "pnorm/_forward_module.lm_head.weight": 128.84165954589844, "gnorm/_forward_module.lm_head.weight": 0.19579890370368958} +{"step": 167772160, "pnorm/_forward_module.model.embeddings.weight": 113.65425109863281, "gnorm/_forward_module.model.embeddings.weight": 0.3752332925796509, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.937597274780273, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 
0.013263835571706295, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 12.430756568908691, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.0384797640144825, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 12.5009126663208, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.037632327526807785, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.196552276611328, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.314021497964859, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.160433769226074, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.36097291111946106, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.6697561740875244, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.043641384690999985, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.08951397985219955, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.003036167938262224, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.777305603027344, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.008808135986328125, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.896268844604492, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.26096218824386597, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 18.962692260742188, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.3214764893054962, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.65239906311035, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.009959651157259941, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.064876556396484, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.040866605937480927, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.075906753540039, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.05842449143528938, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 
10.722832679748535, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.24192827939987183, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.743001937866211, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.2419004589319229, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.7475247383117676, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.03196559473872185, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.04115737974643707, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0023183522280305624, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.579782485961914, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.004759897943586111, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 26.422643661499023, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.1375977098941803, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 18.764278411865234, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.1492980420589447, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.616697311401367, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.006893648765981197, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 11.23178768157959, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.027214152738451958, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.176909446716309, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.04267067834734917, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.666264533996582, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.18305528163909912, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.683701515197754, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.13463473320007324, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 1.6933085918426514, 
"gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.022443555295467377, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.05853838473558426, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0020074823405593634, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.547780990600586, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0029985280707478523, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 26.39693260192871, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.09887542575597763, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 18.75298309326172, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.10017292201519012, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.599571228027344, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.007837506011128426, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 11.219182968139648, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.03124256059527397, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 11.157421112060547, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.0766039788722992, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.633135795593262, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.16620101034641266, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 10.690659523010254, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.11244045197963715, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 1.6677882671356201, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.030966544523835182, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.04971807450056076, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0020819769706577063, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.527803421020508, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 
0.0023732015397399664, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 26.403688430786133, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.08238833397626877, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 18.754724502563477, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.08284381777048111, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 22.592710494995117, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.005242627114057541, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 11.11439037322998, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.035145070403814316, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 11.07318115234375, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.03883512318134308, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 10.648566246032715, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.1228802427649498, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 10.72793960571289, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.0995510146021843, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 1.6642427444458008, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.02441808395087719, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.05036652833223343, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.002336392644792795, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.501981735229492, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0025580525398254395, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 26.378047943115234, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.07401462644338608, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 18.664852142333984, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.07434221357107162, 
"pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.585594177246094, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0068656872026622295, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 11.165380477905273, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.06707088649272919, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 11.114307403564453, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.08433990180492401, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 10.684772491455078, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.14204658567905426, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 10.751830101013184, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.11026781797409058, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 1.6728826761245728, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.03933202847838402, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.054224275052547455, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0036527039483189583, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.475038528442383, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.00513248099014163, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 26.326316833496094, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.09200069308280945, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 18.51121711730957, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.06650051474571228, "pnorm/_forward_module.model.norm.weight": 23.490999221801758, "gnorm/_forward_module.model.norm.weight": 0.01681654341518879, "pnorm/_forward_module.lm_head.weight": 132.37454223632812, "gnorm/_forward_module.lm_head.weight": 0.23402036726474762} +{"step": 188743680, "pnorm/_forward_module.model.embeddings.weight": 115.03633117675781, "gnorm/_forward_module.model.embeddings.weight": 
0.08376393467187881, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.970783233642578, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0032585039734840393, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 12.574190139770508, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.014919369481503963, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 12.638427734375, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.015795374289155006, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.263532638549805, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.08675462752580643, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.22181510925293, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.09267345815896988, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.7154415845870972, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.015888985246419907, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.09650439769029617, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0016836163122206926, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.77973747253418, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0029643329326063395, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.974180221557617, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.06862007826566696, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.011850357055664, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.07340862601995468, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.65519905090332, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0021133432164788246, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.122611999511719, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.01301821693778038, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 
11.132399559020996, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.014455176889896393, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.739371299743652, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.0519380122423172, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.760787010192871, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.05444948747754097, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.798226237297058, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.008024908602237701, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.04852423444390297, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0009567710221745074, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.560087203979492, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0012348982272669673, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 26.464757919311523, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.03433221951127052, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 18.791027069091797, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03334961086511612, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.62276268005371, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0014553562505170703, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 11.288455963134766, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.007076281122863293, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.234171867370605, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.010397701524198055, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.698945045471191, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.036375608295202255, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.72293472290039, 
"gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.03278509899973869, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 1.7506951093673706, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.007318103685975075, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.06700449436903, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0006827554316259921, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.52984046936035, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0008589018252678216, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 26.463544845581055, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.026315895840525627, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 18.791440963745117, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.02174973674118519, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.601211547851562, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0010722638107836246, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 11.289315223693848, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.005969169083982706, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 11.22476863861084, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.008934075944125652, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.66433334350586, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.02896047569811344, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 10.7267427444458, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.025402246043086052, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 1.7188732624053955, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.006227957550436258, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.055233605206012726, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 
0.0005018361262045801, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.5206356048584, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.000763125135563314, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 26.506664276123047, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.023599296808242798, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 18.815479278564453, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.019996507093310356, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 22.601593017578125, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0009989457903429866, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 11.19720458984375, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.008404534310102463, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 11.150843620300293, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.01021958701312542, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 10.686054229736328, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.026951512321829796, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 10.7737455368042, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.02374197356402874, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 1.7205321788787842, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.009674391709268093, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.057904232293367386, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.001013408531434834, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.506567001342773, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.001000195275992155, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 26.515380859375, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.0251039769500494, 
"pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 18.744518280029297, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.021046068519353867, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.578516006469727, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0012637685285881162, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 11.242677688598633, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.012781188823282719, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 11.179377555847168, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.01338895969092846, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 10.701088905334473, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.029667265713214874, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 10.778005599975586, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.02660593017935753, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 1.7192858457565308, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.010645592585206032, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.05948524922132492, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0008564431918784976, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.475412368774414, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0028047822415828705, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 26.44715690612793, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.04446825385093689, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 18.57662582397461, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.023175247013568878, "pnorm/_forward_module.model.norm.weight": 23.63797378540039, "gnorm/_forward_module.model.norm.weight": 0.01328919269144535, "pnorm/_forward_module.lm_head.weight": 135.87295532226562, "gnorm/_forward_module.lm_head.weight": 
0.15062718093395233} +{"step": 209715200, "pnorm/_forward_module.model.embeddings.weight": 116.34912109375, "gnorm/_forward_module.model.embeddings.weight": 0.13371901214122772, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.99783706665039, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.005790181457996368, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 12.712936401367188, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.01744506135582924, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 12.767948150634766, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.021696941927075386, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.312206268310547, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.13804848492145538, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.266064643859863, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.149854376912117, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.7596806287765503, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.018873820081353188, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.1041477620601654, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.002403165912255645, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.772520065307617, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.004605800844728947, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.026714324951172, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.1259504109621048, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.047922134399414, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.16037921607494354, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.65134048461914, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.005060211755335331, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 
11.162675857543945, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.026778852567076683, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.173456192016602, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.044322166591882706, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.739363670349121, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.1287258267402649, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.763998031616211, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.11466874182224274, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.8608654737472534, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.00974093098193407, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0569148063659668, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0003519757592584938, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.543174743652344, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.001980964094400406, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 26.507627487182617, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.05981297791004181, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 18.81755256652832, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.06528261303901672, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.620853424072266, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0040210853330791, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 11.31894302368164, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.015362744219601154, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.26956844329834, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.023686038330197334, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.718331336975098, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 
0.09621306508779526, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.749685287475586, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.06048718839883804, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 1.816667079925537, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.009321732446551323, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.07516387104988098, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0008463766425848007, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.52362632751465, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0015609717229381204, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 26.54401969909668, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.04419491067528725, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 18.837007522583008, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.04129757359623909, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.604591369628906, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.003032674780115485, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 11.348023414611816, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.014145085588097572, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 11.279391288757324, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.026296906173229218, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.693532943725586, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.08278848975896835, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 10.763762474060059, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.04668000340461731, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 1.7839628458023071, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.007528932765126228, 
"pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.06183319166302681, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0005908478633500636, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.530227661132812, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0016129405703395605, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 26.634843826293945, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.039430033415555954, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 18.891010284423828, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.035680368542671204, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 22.621212005615234, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0024913137312978506, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 11.283201217651367, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.012109853327274323, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 11.232443809509277, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.016965115442872047, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 10.731470108032227, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.06364411115646362, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 10.829790115356445, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.04144534096121788, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 1.7886402606964111, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.008455894887447357, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0660754069685936, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0006458215648308396, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.5244083404541, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0018280846998095512, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 
26.682645797729492, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.03800693526864052, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 18.84081268310547, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.03393349424004555, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.584501266479492, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0032346206717193127, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 11.316814422607422, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.021123290061950684, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 11.244068145751953, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.03018214926123619, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 10.72836971282959, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.0732160359621048, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 10.817883491516113, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.042871225625276566, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 1.7883265018463135, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.014478277415037155, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.06651607900857925, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0014598111156374216, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.482778549194336, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.003975541330873966, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 26.593006134033203, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.059744756668806076, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 18.657867431640625, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.03441070020198822, "pnorm/_forward_module.model.norm.weight": 23.80154800415039, "gnorm/_forward_module.model.norm.weight": 0.0160821545869112, 
"pnorm/_forward_module.lm_head.weight": 139.5150909423828, "gnorm/_forward_module.lm_head.weight": 0.15685692429542542} +{"step": 230686720, "pnorm/_forward_module.model.embeddings.weight": 117.59530639648438, "gnorm/_forward_module.model.embeddings.weight": 0.19049403071403503, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 23.01949119567871, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.008033240213990211, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 12.831002235412598, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.017128897830843925, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 12.8766450881958, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.019032102078199387, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.348626136779785, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.20309196412563324, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.299198150634766, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.21811527013778687, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.7971235513687134, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.013582793064415455, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.11142747104167938, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0012898566201329231, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.762617111206055, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.005687080789357424, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.064123153686523, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.1755734533071518, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.074607849121094, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.23553292453289032, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.64662742614746, 
"gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.008592969737946987, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.206832885742188, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.03197294473648071, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.2162504196167, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.06188829988241196, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.734309196472168, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.1994755119085312, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.76162052154541, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.18193203210830688, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.913301706314087, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.01388774998486042, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0636541023850441, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0006036779377609491, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.534732818603516, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0025495989248156548, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 26.55596351623535, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.09598905593156815, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 18.845869064331055, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.10054946690797806, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.617097854614258, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.005318331066519022, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 11.345049858093262, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.023664621636271477, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.298416137695312, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.03715529292821884, 
"pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.732030868530273, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.15770231187343597, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.771069526672363, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.10204190760850906, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 1.881794810295105, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.013457235880196095, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.08174212276935577, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0008635143167339265, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.527463912963867, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.002073934068903327, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 26.63053321838379, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.0732022374868393, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 18.88658332824707, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.07394726574420929, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.61311912536621, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0049414727836847305, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 11.419389724731445, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.02139521948993206, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 11.339095115661621, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.03744104132056236, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.720309257507324, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.14235864579677582, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 10.799025535583496, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.07976144552230835, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 
1.850890874862671, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.011397928930819035, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.06791403144598007, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0009546764777041972, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.546815872192383, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.002513659419491887, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 26.769325256347656, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.0722820833325386, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 18.969547271728516, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.06860664486885071, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 22.643346786499023, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.00604212936013937, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 11.371956825256348, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.026979347690939903, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 11.309248924255371, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.043459463864564896, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 10.773518562316895, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.14914603531360626, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 10.884276390075684, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.0711166113615036, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 1.8562583923339844, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.013241935521364212, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.07341331988573074, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0008141965372487903, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.547651290893555, 
"gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0031956692691892385, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 26.855274200439453, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.06870071589946747, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 18.939620971679688, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.062133338302373886, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.596630096435547, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0060209245420992374, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 11.39108943939209, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.0404476672410965, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 11.311890602111816, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.054257411509752274, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 10.751691818237305, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.12477506697177887, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 10.854666709899902, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.06664307415485382, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 1.8716579675674438, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.021433386951684952, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.07459148019552231, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0022607718128710985, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.49197769165039, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.004628044553101063, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 26.73834800720215, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.08054588735103607, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 18.737686157226562, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 
0.05005943775177002, "pnorm/_forward_module.model.norm.weight": 23.97566032409668, "gnorm/_forward_module.model.norm.weight": 0.012181926518678665, "pnorm/_forward_module.lm_head.weight": 143.2727813720703, "gnorm/_forward_module.lm_head.weight": 0.24428319931030273} +{"step": 251658240, "pnorm/_forward_module.model.embeddings.weight": 118.77630615234375, "gnorm/_forward_module.model.embeddings.weight": 0.09104131907224655, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 23.043256759643555, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.003967117518186569, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 12.93606185913086, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.016746236011385918, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 12.97180461883545, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.017688849940896034, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.389973640441895, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.10109597444534302, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.337420463562012, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.10743393748998642, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.8331917524337769, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.018047207966446877, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.1185770109295845, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0009192582801915705, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.757070541381836, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0023324466310441494, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.108543395996094, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.06591136753559113, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.101581573486328, 
"gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.07353788614273071, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.648448944091797, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0027899721171706915, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.257966041564941, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.012535429559648037, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.26070785522461, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.017693717032670975, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.737060546875, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.06864340603351593, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.765826225280762, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.0697237104177475, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 1.9694334268569946, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.008365937508642673, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.07080504298210144, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0006330774631351233, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.524751663208008, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0016419473104178905, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 26.600032806396484, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.04205929487943649, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 18.870288848876953, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.035878922790288925, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.621288299560547, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0017433192115277052, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 11.387290000915527, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 
0.009558811783790588, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.3395414352417, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.013158700428903103, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.749876022338867, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.04824104160070419, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.794900894165039, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.043127186596393585, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 1.9576321840286255, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.00603956775739789, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.08955115079879761, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0005996979307383299, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.52739906311035, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0011436814675107598, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 26.706390380859375, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.03283266723155975, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 18.929319381713867, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.025952156633138657, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.62242317199707, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0015175408916547894, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 11.487507820129395, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.008609105832874775, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 11.397679328918457, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.011436790227890015, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.744427680969238, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.041046444326639175, 
"pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 10.830910682678223, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.03392907977104187, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 1.9233272075653076, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.006664702668786049, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0749426931142807, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0005981608992442489, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.56074333190918, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0010038301115855575, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 26.892406463623047, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.03043271228671074, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.039464950561523, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.024798719212412834, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 22.662601470947266, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0013744536554440856, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 11.459198951721191, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.011094560846686363, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 11.382780075073242, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.013025806285440922, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 10.809196472167969, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.0368102490901947, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 10.930768966674805, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.03257548809051514, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 1.919845461845398, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.007494404446333647, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 
0.0807492583990097, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.000537809741217643, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.567773818969727, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.001170991687104106, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 27.014596939086914, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.031339697539806366, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 19.030467987060547, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.027022453024983406, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.597213745117188, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.001864320831373334, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 11.46182918548584, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.013967448845505714, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 11.377432823181152, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.016739584505558014, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 10.763339042663574, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.042556922882795334, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 10.87402629852295, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.029189875349402428, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 1.947001338005066, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.005487373564392328, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0821760818362236, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.000577339029405266, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.502880096435547, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0028949470724910498, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 26.88933563232422, 
"gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.048810314387083054, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 18.822097778320312, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.02678196132183075, "pnorm/_forward_module.model.norm.weight": 24.14000129699707, "gnorm/_forward_module.model.norm.weight": 0.014620570465922356, "pnorm/_forward_module.lm_head.weight": 146.97328186035156, "gnorm/_forward_module.lm_head.weight": 0.11145640164613724} +{"step": 272629760, "pnorm/_forward_module.model.embeddings.weight": 119.91284942626953, "gnorm/_forward_module.model.embeddings.weight": 0.10889001935720444, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 23.057653427124023, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.004217384848743677, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 13.033028602600098, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.013483759947121143, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 13.059002876281738, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.017534516751766205, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.412915229797363, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.13076737523078918, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.358333587646484, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.13625681400299072, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.8598153591156006, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.013878464698791504, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.1263749897480011, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0015820229891687632, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.743017196655273, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.004496718756854534, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 
27.132381439208984, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.09118454903364182, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.119535446166992, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.11226189881563187, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.647310256958008, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.004121502861380577, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.306846618652344, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.01542238611727953, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.302882194519043, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.02447574958205223, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.731199264526367, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.10719011723995209, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.762475967407227, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.09258536994457245, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.022031545639038, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.010991967283189297, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.07788518816232681, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0004613384953700006, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.514833450317383, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0018790309550240636, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 26.643281936645508, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.050080325454473495, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 18.893898010253906, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.044430848211050034, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.62422752380371, 
"gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0022747351322323084, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 11.427118301391602, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.010895579122006893, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.376411437988281, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.01489811297506094, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.763297080993652, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.0740971788764, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.816730499267578, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.04862150549888611, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.030853509902954, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.006349043920636177, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.09594131261110306, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0006057805730961263, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.53068733215332, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.001366862328723073, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 26.78592872619629, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.04131748899817467, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 18.973661422729492, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.033244188874959946, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.636764526367188, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0019422370241954923, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 11.560235023498535, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.010788674466311932, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 11.458044052124023, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 
0.014126082882285118, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.770689010620117, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.06462884694337845, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 10.868244171142578, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.0433826819062233, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 1.9914355278015137, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.007485238369554281, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.08127256482839584, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0006745359860360622, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.577972412109375, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0015585002256557345, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.02077865600586, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.041612815111875534, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.112918853759766, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.032181914895772934, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 22.69171905517578, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0024419575929641724, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 11.555557250976562, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.02050498127937317, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 11.464851379394531, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.02218904159963131, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 10.852001190185547, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.06302734464406967, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 10.989665031433105, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.039792250841856, 
"pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 1.973231554031372, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.01414131373167038, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.08629895746707916, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0014770098496228456, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.59722328186035, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0017882657703012228, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 27.189908981323242, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.04029306769371033, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 19.131345748901367, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.0316234789788723, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.60917854309082, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.002278014784678817, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 11.54100513458252, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.021365994587540627, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 11.449332237243652, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.028771869838237762, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 10.786969184875488, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.056546103209257126, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 10.91482162475586, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.03543837368488312, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.005755662918091, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.010855098254978657, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.08846894651651382, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.001047951402142644, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 
22.513490676879883, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.002303200075402856, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 27.039731979370117, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.043763983994722366, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 18.904651641845703, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.027595791965723038, "pnorm/_forward_module.model.norm.weight": 24.317291259765625, "gnorm/_forward_module.model.norm.weight": 0.01278015412390232, "pnorm/_forward_module.lm_head.weight": 150.7801513671875, "gnorm/_forward_module.lm_head.weight": 0.10523701459169388} +{"step": 293601280, "pnorm/_forward_module.model.embeddings.weight": 120.99600982666016, "gnorm/_forward_module.model.embeddings.weight": 0.11147966235876083, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 23.070520401000977, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.005462083965539932, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 13.113507270812988, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.013726666569709778, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 13.130859375, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.017138920724391937, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.43447494506836, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.13642817735671997, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.37843132019043, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.14352861046791077, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.8813782930374146, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.01240807119756937, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.13357405364513397, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.002736865309998393, 
"pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.732301712036133, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0028684858698397875, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.15867805480957, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.09181730449199677, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.137418746948242, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.11017195135354996, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.647695541381836, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.004199353978037834, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.361574172973633, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.018798930570483208, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.348026275634766, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.030113019049167633, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.726417541503906, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.10414307564496994, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.759902954101562, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.1031198501586914, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.057053804397583, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.009412898682057858, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.08254634588956833, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0008818696951493621, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.507240295410156, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.001957631204277277, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 26.685997009277344, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.06048803776502609, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 
18.91669273376465, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.05287410318851471, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.63071060180664, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.002713034860789776, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 11.480287551879883, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.013688579201698303, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.4203519821167, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.024272989481687546, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.774970054626465, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.0804024413228035, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.835330963134766, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.06201629713177681, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.095649242401123, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.006920207291841507, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.10218387842178345, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0005676428554579616, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.537717819213867, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0013855933211743832, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 26.866369247436523, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.046325843781232834, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.019298553466797, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03845065459609032, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.6495361328125, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.002471981104463339, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 11.630498886108398, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 
0.013287698850035667, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 11.515692710876465, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.02102171629667282, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.791457176208496, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.0728471428155899, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 10.89852237701416, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.0533377043902874, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.052717924118042, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.006540372502058744, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.08681830018758774, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0005456437938846648, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.596288681030273, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0013716727262362838, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.14311981201172, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.047228723764419556, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.18341827392578, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.03883901238441467, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 22.719341278076172, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0030395379289984703, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 11.653237342834473, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.02125188335776329, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 11.547159194946289, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.032916925847530365, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 10.885393142700195, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.07673543691635132, 
"pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 11.037700653076172, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.05071266368031502, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.0186901092529297, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.014704009518027306, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0911145731806755, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0016136798076331615, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.623960494995117, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0017263386398553848, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 27.35148048400879, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.05246324837207794, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 19.22401237487793, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.04170777648687363, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.620813369750977, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.002819473622366786, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 11.621431350708008, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.025993449613451958, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 11.522273063659668, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.036378324031829834, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 10.805458068847656, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.06723666936159134, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 10.94849681854248, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.04792013764381409, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.0576281547546387, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.015223890542984009, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 
0.094670869410038, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0016411906108260155, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.52789306640625, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.004479128867387772, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 27.193988800048828, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.07742790877819061, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 18.993030548095703, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.04462972283363342, "pnorm/_forward_module.model.norm.weight": 24.4943790435791, "gnorm/_forward_module.model.norm.weight": 0.015598480589687824, "pnorm/_forward_module.lm_head.weight": 154.57345581054688, "gnorm/_forward_module.lm_head.weight": 0.1563960164785385} +{"step": 314572800, "pnorm/_forward_module.model.embeddings.weight": 122.03596496582031, "gnorm/_forward_module.model.embeddings.weight": 0.09771708399057388, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 23.07990837097168, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.004299760330468416, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 13.18920612335205, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.012488343752920628, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 13.19742488861084, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.015127616003155708, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.449588775634766, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.10629958659410477, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.392823219299316, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.11635883897542953, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.9008351564407349, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.010763601399958134, 
"pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.1404450237751007, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0007686673779971898, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.720945358276367, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0023362282663583755, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.183303833007812, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.07568563520908356, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.154064178466797, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.0858607292175293, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.647563934326172, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0033518148120492697, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.407238960266113, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.013268765062093735, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.385930061340332, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.021018048748373985, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.720319747924805, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.08972633630037308, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.756085395812988, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.07212679088115692, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.0994181632995605, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.007395288906991482, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.08930277824401855, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0005197248538024724, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.501869201660156, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.001387468772009015, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 
26.732139587402344, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.04439787566661835, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 18.94086456298828, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03961736336350441, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.63694953918457, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0021608914248645306, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 11.536081314086914, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.010279212146997452, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.46448802947998, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.015968598425388336, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.783308029174805, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.060639046132564545, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.850088119506836, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.043926484882831573, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.1546823978424072, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.007969115860760212, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.10765751451253891, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0005742908688262105, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.543148040771484, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.001024339348077774, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 26.942567825317383, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.036548733711242676, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.063627243041992, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.02992447279393673, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.66213607788086, 
"gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0017592781223356724, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 11.69373893737793, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.009843999519944191, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 11.566996574401855, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.013545127585530281, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.809877395629883, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.05416324734687805, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 10.925602912902832, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.036741457879543304, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.1151516437530518, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.009200768545269966, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.09237707406282425, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0009994391584768891, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.616409301757812, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0009982774499803782, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.26201629638672, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.03482495993375778, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.252370834350586, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.028577987104654312, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 22.758689880371094, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0020569968037307262, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 11.756866455078125, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.01428201049566269, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 11.628239631652832, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 
0.026935212314128876, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 10.923151016235352, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.04641883820295334, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 11.093290328979492, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.03354502469301224, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.07488751411438, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.010041143745183945, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.09710695594549179, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0012082296889275312, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.656204223632812, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0010768487118184566, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 27.51453971862793, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.032654475420713425, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 19.318321228027344, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.02869519218802452, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.636314392089844, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0020829124841839075, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 11.702534675598145, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.01811295561492443, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 11.593524932861328, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.025772061198949814, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 10.822979927062988, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.04568447917699814, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 10.98041820526123, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.029198022559285164, 
"pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.1227545738220215, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.00972615834325552, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.10187438875436783, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0010930602438747883, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.540971755981445, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0022629289887845516, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 27.341197967529297, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.04361443594098091, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 19.073518753051758, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.027927054092288017, "pnorm/_forward_module.model.norm.weight": 24.668575286865234, "gnorm/_forward_module.model.norm.weight": 0.011973658576607704, "pnorm/_forward_module.lm_head.weight": 158.26121520996094, "gnorm/_forward_module.lm_head.weight": 0.11205610632896423} +{"step": 335544320, "pnorm/_forward_module.model.embeddings.weight": 123.0317153930664, "gnorm/_forward_module.model.embeddings.weight": 0.11661148816347122, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 23.08358383178711, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.004108330700546503, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 13.253385543823242, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.014832854270935059, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 13.25259017944336, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.01756509765982628, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.45580005645752, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.11946862190961838, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.398685455322266, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 
0.13505417108535767, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.9144059419631958, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.018491661176085472, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.14765243232250214, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0011536992387846112, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.706457138061523, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0024992553517222404, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.198484420776367, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.08869421482086182, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.166118621826172, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.10052657127380371, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.647356033325195, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0034114900045096874, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.45528793334961, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.015373595058918, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.424327850341797, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.021816883236169815, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.713380813598633, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.1040230467915535, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.752172470092773, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.09149830043315887, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.1300063133239746, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.012305195443332195, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.09466448426246643, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0008743847720324993, 
"pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.499984741210938, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0013437627349048853, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 26.78034210205078, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.053050097078084946, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 18.968019485473633, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.04848535358905792, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.643774032592773, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0030816011130809784, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 11.590971946716309, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.013264995068311691, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.507065773010254, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.02112061157822609, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.791428565979004, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.08815094828605652, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.86434555053711, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.057391565293073654, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.204148054122925, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.009049582295119762, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.11255798488855362, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.00043346991878934205, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.551950454711914, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0014432207681238651, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 27.02067756652832, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.0458691343665123, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 
19.109678268432617, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03791375458240509, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.673358917236328, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0025880825705826283, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 11.752862930297852, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.012410185299813747, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 11.61631965637207, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.019679062068462372, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.826204299926758, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.0764010101556778, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 10.950239181518555, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.04693560674786568, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.1684327125549316, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.010842060670256615, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.09722232818603516, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0008347769035026431, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.63558578491211, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0014482917031273246, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.37534523010254, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.046487435698509216, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.31822395324707, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.03788452968001366, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 22.804636001586914, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0029504697304219007, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 11.864716529846191, 
"gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.020435545593500137, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 11.709628105163574, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.03519668057560921, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 10.960811614990234, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.07568573206663132, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 11.14995288848877, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.04125481843948364, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.124318838119507, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.012867894023656845, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.10214037448167801, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0015190762933343649, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.690589904785156, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0018807696178555489, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 27.672582626342773, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.04699908569455147, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 19.410154342651367, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.03901050612330437, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.652408599853516, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0029035035986453295, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 11.785618782043457, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.020871706306934357, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 11.664782524108887, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.025180019438266754, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 10.839564323425293, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 
0.06871379911899567, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 11.010252952575684, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.03877045586705208, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.177987575531006, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.013871962204575539, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.10800457000732422, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0015760917449370027, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.55708885192871, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0027401261031627655, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 27.49110984802246, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.05128858610987663, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 19.157978057861328, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.03378387168049812, "pnorm/_forward_module.model.norm.weight": 24.843355178833008, "gnorm/_forward_module.model.norm.weight": 0.012321566231548786, "pnorm/_forward_module.lm_head.weight": 161.8473358154297, "gnorm/_forward_module.lm_head.weight": 0.12996439635753632} +{"step": 356515840, "pnorm/_forward_module.model.embeddings.weight": 123.98330688476562, "gnorm/_forward_module.model.embeddings.weight": 0.10730889439582825, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 23.08510398864746, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.004508263431489468, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 13.31334400177002, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.014395227655768394, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 13.304265022277832, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.017157213762402534, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.458649635314941, 
"gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.12883594632148743, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.401488304138184, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.14043456315994263, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.9236663579940796, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.011593151837587357, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.15469171106815338, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0007897707400843501, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.691715240478516, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0031417664140462875, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.211524963378906, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.08846230804920197, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.176698684692383, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.09974571317434311, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.645889282226562, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.003687808057293296, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.498858451843262, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.012657041661441326, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.458312034606934, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.0191914364695549, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.705687522888184, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.09943155944347382, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.746960639953613, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.08730655163526535, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.158503770828247, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 
0.00887899100780487, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.09998590499162674, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0004716853436548263, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.49564552307129, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0015040352009236813, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 26.823041915893555, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.05234513804316521, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 18.991851806640625, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.04561625048518181, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.650527954101562, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.00248261378146708, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 11.644381523132324, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.012047269381582737, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.547119140625, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.018545400351285934, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.79857063293457, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.07257091253995895, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.876603126525879, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.05391809716820717, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.251948118209839, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.008329235017299652, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.11731315404176712, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0005663410993292928, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.560457229614258, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.001202534418553114, 
"pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 27.09560203552246, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.04238276183605194, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.154769897460938, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.035078421235084534, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.685503005981445, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.00201427168212831, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 11.812843322753906, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.011553647927939892, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 11.665576934814453, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.01501484215259552, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.84318733215332, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.06301015615463257, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 10.975397109985352, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.045525092631578445, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.2159578800201416, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.008298151195049286, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.10203133523464203, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0006281682872213423, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.654605865478516, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.001439082552678883, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.48304557800293, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.043323762714862823, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.3809757232666, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.035409312695264816, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 
22.856101989746094, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.003030424239113927, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 11.975990295410156, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.022105321288108826, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 11.79249095916748, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.03977242112159729, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 10.998800277709961, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.06701275706291199, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 11.2079496383667, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.04137362912297249, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.1760923862457275, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.015276911668479443, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.10715842992067337, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.001990947872400284, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.727018356323242, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0015610696282237768, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 27.827381134033203, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.04062538221478462, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 19.499956130981445, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.036933645606040955, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.671510696411133, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.002845051698386669, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 11.875776290893555, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.029735036194324493, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 11.742036819458008, 
"gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.035470038652420044, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 10.857763290405273, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.06154875084757805, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 11.04102611541748, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.03733653575181961, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.233957529067993, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.015533343888819218, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.1143498420715332, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0013977715279906988, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.574983596801758, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0022534248419106007, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 27.639223098754883, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.04701949656009674, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 19.2421875, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.032660916447639465, "pnorm/_forward_module.model.norm.weight": 25.01641082763672, "gnorm/_forward_module.model.norm.weight": 0.010202317498624325, "pnorm/_forward_module.lm_head.weight": 165.26121520996094, "gnorm/_forward_module.lm_head.weight": 0.12439104169607162} +{"step": 377487360, "pnorm/_forward_module.model.embeddings.weight": 124.89445495605469, "gnorm/_forward_module.model.embeddings.weight": 0.0783911645412445, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 23.081172943115234, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.003394216997548938, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 13.362104415893555, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.009574750438332558, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 
13.345293045043945, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.010895627550780773, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.454590797424316, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.10197697579860687, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.397655487060547, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.10649345815181732, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.9312808513641357, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.006689104717224836, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.1623755544424057, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0007538440986536443, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.674407958984375, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0020088849123567343, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.21968650817871, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.06694821268320084, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.185239791870117, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.0760410875082016, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.643272399902344, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.002992226742208004, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.536368370056152, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.01218761783093214, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.486722946166992, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.020178502425551414, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.69869613647461, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.08252700418233871, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.742256164550781, 
"gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.07019130140542984, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.1754813194274902, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.008499414660036564, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1046927273273468, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0009231276926584542, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.49241065979004, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0013545402325689793, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 26.86638832092285, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.040555838495492935, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.017414093017578, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.034779105335474014, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.655832290649414, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.00183225201908499, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 11.697392463684082, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.008640754967927933, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.58662223815918, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.014242188073694706, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.803635597229004, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.058095432817935944, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.88759708404541, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.04346470162272453, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.297619104385376, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.005888940766453743, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.12264548987150192, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 
0.00048669864190742373, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.569225311279297, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0010000746697187424, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 27.17021942138672, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.03464248403906822, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.200864791870117, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.027864953503012657, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.698333740234375, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.001609499566257, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 11.872076034545898, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.009497537277638912, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 11.713635444641113, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.01233475748449564, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.861428260803223, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.05158692225813866, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.003145217895508, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.03735635429620743, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.26071834564209, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.0071451980620622635, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.10706187784671783, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0008418260258622468, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.671621322631836, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0009782410925254226, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.585403442382812, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.034832727164030075, 
"pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.44188117980957, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.028255803510546684, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 22.918428421020508, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0023360333871096373, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 12.095951080322266, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.017280058935284615, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 11.878045082092285, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.03010326251387596, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.040064811706543, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.05807463452219963, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 11.272201538085938, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.034780412912368774, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.231570243835449, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.00816722959280014, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.1126532331109047, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0007694617379456758, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.76466178894043, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0012303570983931422, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 27.978649139404297, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.035187214612960815, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 19.588115692138672, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.028927108272910118, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.693622589111328, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.002000664360821247, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 
11.964360237121582, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.014818428084254265, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 11.815727233886719, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.0197223499417305, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 10.879727363586426, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.05328282341361046, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 11.079030990600586, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.03123931586742401, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.2867488861083984, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.007295742630958557, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.12066281586885452, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0007664074073545635, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.59544563293457, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0029928458388894796, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 27.785655975341797, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.055556103587150574, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 19.329317092895508, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.03669915348291397, "pnorm/_forward_module.model.norm.weight": 25.191997528076172, "gnorm/_forward_module.model.norm.weight": 0.013388078659772873, "pnorm/_forward_module.lm_head.weight": 168.49778747558594, "gnorm/_forward_module.lm_head.weight": 0.10367895662784576} +{"step": 398458880, "pnorm/_forward_module.model.embeddings.weight": 125.76701354980469, "gnorm/_forward_module.model.embeddings.weight": 0.10229793190956116, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 23.077865600585938, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.004486253950744867, 
"pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 13.412444114685059, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.011220389977097511, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 13.387136459350586, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.012700512073934078, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.451393127441406, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.13266228139400482, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.394583702087402, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.13834714889526367, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.9375767707824707, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.009291688911616802, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.1703713834285736, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0005065248114988208, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.65518569946289, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.002534394385293126, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.225217819213867, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.08674588799476624, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.192516326904297, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.09443842619657516, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.638912200927734, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0034320892300456762, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.569879531860352, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.014047432690858841, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.513006210327148, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.02414454147219658, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 
10.69328784942627, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.10381689667701721, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.739217758178711, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.08664577454328537, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.1869494915008545, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.013242070563137531, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.10920026898384094, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0014650889206677675, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.488483428955078, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0013829271774739027, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 26.90978240966797, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.0496591217815876, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.04371452331543, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.04333323612809181, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.66161346435547, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0023446278646588326, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 11.755073547363281, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.012665360234677792, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.628942489624023, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.018958792090415955, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.808263778686523, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.07735806703567505, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.89801025390625, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.05314937233924866, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.3372395038604736, 
"gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.00977968517690897, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.1274646669626236, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0007879264885559678, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.577993392944336, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0012200692435726523, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 27.244123458862305, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.042551565915346146, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.247478485107422, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.034735891968011856, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.710710525512695, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.002170894993469119, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 11.93533706665039, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.012660070322453976, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 11.764045715332031, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.017846597358584404, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.88010311126709, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.06823495775461197, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.031176567077637, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.044544193893671036, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.296987533569336, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.010384132154285908, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.11125534772872925, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0012625920353457332, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.685447692871094, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 
0.0012402565917000175, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.6811580657959, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.04138964042067528, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.500568389892578, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.03405993431806564, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 22.985971450805664, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0030886614695191383, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 12.221569061279297, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.018554937094449997, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 11.967947959899902, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.035285770893096924, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.081694602966309, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.06854552030563354, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 11.33785629272461, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.04105832427740097, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.281315326690674, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.009273367933928967, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.11736711114645004, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0009778881212696433, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.803176879882812, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.001476565725170076, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 28.127840042114258, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.03970320522785187, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 19.674652099609375, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.034778691828250885, 
"pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.712421417236328, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0022951026912778616, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 12.048412322998047, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.014200737699866295, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 11.88521957397461, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.024841193109750748, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 10.899712562561035, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.05760574713349342, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 11.113861083984375, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.03478769212961197, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.327662706375122, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.008615827187895775, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.12528900802135468, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.001023236894980073, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.614728927612305, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.001988058676943183, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 27.932491302490234, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.0440484881401062, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 19.411945343017578, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.034004904329776764, "pnorm/_forward_module.model.norm.weight": 25.362735748291016, "gnorm/_forward_module.model.norm.weight": 0.009785422123968601, "pnorm/_forward_module.lm_head.weight": 171.56622314453125, "gnorm/_forward_module.lm_head.weight": 0.11838195472955704} +{"step": 419430400, "pnorm/_forward_module.model.embeddings.weight": 126.6026382446289, "gnorm/_forward_module.model.embeddings.weight": 
0.13300304114818573, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 23.070653915405273, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.006131074391305447, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 13.460570335388184, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.014509015716612339, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 13.427083969116211, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.017392786219716072, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.442947387695312, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.17182698845863342, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.386990547180176, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.17770014703273773, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.9418224096298218, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.014241264201700687, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.17787866294384003, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0015122892800718546, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.631864547729492, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.004065715242177248, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.22486686706543, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.11835425347089767, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.197128295898438, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.1331779658794403, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.631771087646484, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.005674952641129494, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.599457740783691, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.018233221024274826, 
"pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.536073684692383, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.030067410320043564, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.687495231628418, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.15131744742393494, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.736363410949707, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.11323127895593643, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.192054510116577, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.015085827559232712, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.11287225782871246, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0007716157706454396, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.48261260986328, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0017733098939061165, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 26.951976776123047, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.06397969275712967, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.070005416870117, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.057329267263412476, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.667369842529297, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.003432836849242449, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 11.816295623779297, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.014523073099553585, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.673852920532227, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.020957399159669876, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.812700271606445, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.11097199469804764, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 
10.90843391418457, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.06701502948999405, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.3743228912353516, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.010087914764881134, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.13210712373256683, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0008170761866495013, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.583576202392578, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0017507713055238128, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 27.313764572143555, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.05828706920146942, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.292850494384766, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.04833988845348358, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.72382354736328, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0032901419326663017, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 12.003243446350098, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.015179264359176159, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 11.817774772644043, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.022486506029963493, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.89942741394043, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.10027962177991867, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.060054779052734, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.05650703236460686, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.3308398723602295, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.013345101848244667, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.1152004599571228, 
"gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0017104634316638112, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.695497512817383, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0015815087826922536, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.77082633972168, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.056871239095926285, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.5561466217041, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.04732450842857361, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.061893463134766, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.004773326218128204, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 12.351630210876465, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.02523997239768505, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 12.058046340942383, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.04637295752763748, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.127816200256348, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.11313251405954361, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 11.412273406982422, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.04920203983783722, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.3231849670410156, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.009759616106748581, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.12134481966495514, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0009079553419724107, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.843002319335938, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.001707739313133061, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 28.27666664123535, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 
0.04993486404418945, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 19.761449813842773, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.044634658843278885, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.73823356628418, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.003006188664585352, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 12.140711784362793, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.017246492207050323, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 11.96092700958252, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.024104928597807884, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 10.924047470092773, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.08378183841705322, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 11.156548500061035, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.039526697248220444, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.377810001373291, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.00859067589044571, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.1305346041917801, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0009147594100795686, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.63437843322754, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.00240719155408442, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 28.077661514282227, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.04892037808895111, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 19.494508743286133, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.04002435877919197, "pnorm/_forward_module.model.norm.weight": 25.534820556640625, "gnorm/_forward_module.model.norm.weight": 0.012046243995428085, "pnorm/_forward_module.lm_head.weight": 174.47003173828125, 
"gnorm/_forward_module.lm_head.weight": 0.10241372138261795} +{"step": 440401920, "pnorm/_forward_module.model.embeddings.weight": 127.4016342163086, "gnorm/_forward_module.model.embeddings.weight": 0.11076513677835464, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 23.06170654296875, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.004029294941574335, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 13.513799667358398, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.01381214614957571, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 13.47204875946045, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.020139018073678017, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.430787086486816, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.12355135381221771, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.376423835754395, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.12641681730747223, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.944541096687317, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.013009830377995968, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.18556486070156097, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0010258110705763102, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.605783462524414, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0022881680633872747, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.22022247314453, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.07767193764448166, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.19944953918457, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.07856135070323944, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.621179580688477, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0032384961377829313, 
"pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.618587493896484, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.013946810737252235, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.550588607788086, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.02028597705066204, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.681197166442871, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.08873898535966873, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.732852935791016, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.07521510124206543, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.193178653717041, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.013630231842398643, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.11632797867059708, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0014285554643720388, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.475997924804688, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0012849320191890001, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 26.992389678955078, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.04595048353075981, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.09649085998535, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03734510391950607, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.672325134277344, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0019351942464709282, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 11.873587608337402, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.008784042671322823, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.715360641479492, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.01311560533940792, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 
10.817234992980957, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.06578682363033295, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.91893482208252, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.04760182276368141, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.417750120162964, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.007681317627429962, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.13722524046897888, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.00038758653681725264, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.58790397644043, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0012469823705032468, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 27.381587982177734, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.04162292182445526, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.337039947509766, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03244561329483986, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.737201690673828, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.00197144434787333, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 12.072874069213867, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.010059165768325329, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 11.871269226074219, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.013795804232358932, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.92042064666748, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.062404632568359375, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.091043472290039, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.04027608036994934, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.367361545562744, 
"gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.01099981926381588, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.11999338865280151, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0011717057786881924, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.699188232421875, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0012494259281083941, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.850879669189453, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.04131341725587845, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.605276107788086, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.0316472090780735, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.14288330078125, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0034407766070216894, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 12.490116119384766, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.01755926012992859, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 12.150766372680664, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.03629198670387268, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.178740501403809, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.06111367791891098, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 11.495649337768555, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.036341749131679535, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.366691827774048, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.010901357978582382, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.12553489208221436, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0014835481997579336, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.879703521728516, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 
0.001212718547321856, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 28.418354034423828, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.03613821789622307, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 19.844619750976562, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.030756203457713127, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.763370513916016, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0018267427803948522, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 12.233549118041992, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.013278229162096977, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 12.036519050598145, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.018160704523324966, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 10.94697093963623, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.05144893378019333, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 11.197552680969238, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.028207333758473396, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.4238955974578857, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.012628191150724888, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.13559430837631226, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0014414290199056268, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.65908432006836, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0024690988939255476, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 28.22845458984375, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.04656601324677467, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 19.58501434326172, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.0319368951022625, "pnorm/_forward_module.model.norm.weight": 
25.70808219909668, "gnorm/_forward_module.model.norm.weight": 0.008805469609797001, "pnorm/_forward_module.lm_head.weight": 177.2296905517578, "gnorm/_forward_module.lm_head.weight": 0.07623133063316345} +{"step": 461373440, "pnorm/_forward_module.model.embeddings.weight": 128.1724090576172, "gnorm/_forward_module.model.embeddings.weight": 0.12523862719535828, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 23.048799514770508, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.005390833131968975, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 13.566010475158691, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.013237104751169682, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 13.516889572143555, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.017227182164788246, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.412449836730957, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.1670149713754654, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.360329627990723, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.17639660835266113, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.9455666542053223, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.012661349959671497, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.19395820796489716, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0010538218775764108, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.576231002807617, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.003232979215681553, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.208864212036133, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.10796771198511124, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.198638916015625, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.10780072212219238, 
"pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.606977462768555, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.003987675998359919, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.636107444763184, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.01595599576830864, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.565070152282715, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.02553144469857216, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.67153263092041, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.1292364001274109, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.726133346557617, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.10312586277723312, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.1872410774230957, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.015089266933500767, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.11862709373235703, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.000948338070884347, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.46748924255371, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0019612854812294245, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.030229568481445, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.06542326509952545, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.122385025024414, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.0532471165060997, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.67634391784668, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0027894650120288134, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 11.931078910827637, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.013544655404984951, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.756717681884766, 
"gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.024341100826859474, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.820140838623047, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.09871938079595566, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.927481651306152, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.06956463307142258, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.456946611404419, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.010382549837231636, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.14165529608726501, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0006981990300118923, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.586442947387695, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0016269857296720147, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 27.440488815307617, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.0586865171790123, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.376224517822266, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.0471416637301445, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.752363204956055, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.002664086641743779, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 12.146353721618652, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.014614103361964226, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 11.928153991699219, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.018540602177381516, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.942986488342285, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.08965379744768143, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.123725891113281, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 
0.06263314932584763, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.399639129638672, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.011962451972067356, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.12401176989078522, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0011897742515429854, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.69557762145996, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.002032896736636758, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.91726303100586, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.06059258058667183, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.64629554748535, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.048397861421108246, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.23037338256836, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.005058838985860348, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 12.62539291381836, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.024377651512622833, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 12.237906455993652, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.05449729785323143, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.237589836120605, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.11371665447950363, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 11.592327117919922, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.05512974411249161, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.407928228378296, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.013631529174745083, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.12940861284732819, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0014630239456892014, 
"pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.917041778564453, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0019719020929187536, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 28.559476852416992, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.056435856968164444, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 19.92876434326172, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.04769906401634216, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.788619995117188, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.003709380514919758, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 12.321540832519531, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.022254955023527145, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 12.107276916503906, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.03400900959968567, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 10.973063468933105, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.10076599568128586, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 11.243049621582031, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.04722192510962486, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.4675943851470947, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.016595885157585144, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.13980677723884583, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.002076784148812294, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.68401336669922, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.004418401978909969, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 28.377302169799805, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.08198182284832001, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 
19.675207138061523, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.05650278553366661, "pnorm/_forward_module.model.norm.weight": 25.880910873413086, "gnorm/_forward_module.model.norm.weight": 0.010104636661708355, "pnorm/_forward_module.lm_head.weight": 179.88380432128906, "gnorm/_forward_module.lm_head.weight": 0.1825292706489563} +{"step": 482344960, "pnorm/_forward_module.model.embeddings.weight": 128.91761779785156, "gnorm/_forward_module.model.embeddings.weight": 0.07941078394651413, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 23.038623809814453, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.003282698802649975, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 13.628076553344727, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.009418168105185032, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 13.569841384887695, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.011454613879323006, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.396394729614258, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.10145469009876251, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.34578800201416, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.10773970931768417, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.9501638412475586, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.00759013881906867, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.20295970141887665, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.00042762921657413244, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.5490779876709, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0020800193306058645, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.202030181884766, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.06346683204174042, 
"pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.199390411376953, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.06650038063526154, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.58868980407715, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0025066910311579704, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.647156715393066, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.010268224403262138, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.574130058288574, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.0156781654804945, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.659266471862793, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.07625259459018707, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.716650009155273, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.07127705216407776, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.178622245788574, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.012812024913728237, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.11984903365373611, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0013509339187294245, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.45794105529785, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0010739907156676054, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.065784454345703, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.041742466390132904, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.146100997924805, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03322777897119522, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.678531646728516, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0017709573730826378, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 
11.986621856689453, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.008668308146297932, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.795585632324219, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.012895965948700905, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.822261810302734, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.060405004769563675, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.935050964355469, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.0466947928071022, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.490936517715454, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.008430423215031624, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.14567789435386658, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0006683855899609625, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.58112907409668, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.001057197107002139, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 27.492250442504883, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.03901446610689163, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.410844802856445, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.0297229066491127, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.76519012451172, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.00165904953610152, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 12.221142768859863, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.008766558952629566, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 11.986505508422852, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.011165237985551357, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.961343765258789, 
"gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.051882535219192505, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.150932312011719, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.04091638699173927, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.424013137817383, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.009269880130887032, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.12714214622974396, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0008291368139907718, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.686769485473633, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0012920401059091091, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.973365783691406, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.04078851267695427, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.682048797607422, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.029162226244807243, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.315710067749023, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.003245763713493943, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 12.761555671691895, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.01830100454390049, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 12.32479476928711, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.043573927134275436, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.29765510559082, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.06341782957315445, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 11.695272445678711, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.03413818031549454, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.4398341178894043, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 
0.0077997418120503426, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.13206131756305695, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0009214167948812246, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.952688217163086, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.001108091906644404, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 28.69431495666504, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.03169718012213707, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.00983238220215, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.02624671533703804, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.81229591369629, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0012663041707128286, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 12.409306526184082, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.008709406480193138, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 12.176384925842285, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.012837238609790802, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 10.99650764465332, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.040160976350307465, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 11.284608840942383, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.024372175335884094, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.5174713134765625, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.005101846065372229, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.14461356401443481, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0005494621582329273, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.70825958251953, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0021813595667481422, 
"pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 28.52202606201172, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.03639062121510506, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 19.76184844970703, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.028391750529408455, "pnorm/_forward_module.model.norm.weight": 26.050922393798828, "gnorm/_forward_module.model.norm.weight": 0.010517852380871773, "pnorm/_forward_module.lm_head.weight": 182.4493865966797, "gnorm/_forward_module.lm_head.weight": 0.07045239210128784} +{"step": 503316480, "pnorm/_forward_module.model.embeddings.weight": 129.635009765625, "gnorm/_forward_module.model.embeddings.weight": 0.09357499331235886, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 23.028308868408203, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.00429779477417469, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 13.695470809936523, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.010200158692896366, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 13.62750244140625, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.011931635439395905, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.378880500793457, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.13020466268062592, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.329827308654785, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.13408531248569489, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.9567428827285767, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.008078246377408504, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.21186669170856476, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0008454202325083315, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.522031784057617, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 
0.002170364372432232, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.194091796875, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.0777055025100708, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.20043182373047, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.07885469496250153, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.572345733642578, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.003222769359126687, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.658615112304688, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.014787564054131508, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.582852363586426, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.023053305223584175, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.64881706237793, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.09353551268577576, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.708791732788086, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.08385801315307617, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.1759722232818604, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.022075429558753967, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.12166144698858261, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0023700303863734007, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.447824478149414, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0013429097598418593, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.09895896911621, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.05071127042174339, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.169015884399414, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.04037129506468773, 
"pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.679086685180664, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.002046923851594329, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.036238670349121, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.011288803070783615, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.830131530761719, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.017390495166182518, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.823448181152344, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.07190453261137009, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.940119743347168, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.0589553564786911, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.5252881050109863, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.009941854514181614, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.14974819123744965, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0008659526356495917, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.574708938598633, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0011679284507408738, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 27.542089462280273, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.04564827308058739, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.4443302154541, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03652728348970413, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.77825355529785, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.002037184080109, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 12.298134803771973, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.009877245873212814, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 12.046442985534668, 
"gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.012644934467971325, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.978778839111328, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.06604481488466263, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.17583179473877, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.05102993920445442, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.4474117755889893, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.00910518690943718, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.13018135726451874, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0008306254749186337, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.672197341918945, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0014678023289889097, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 28.019630432128906, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.04786617308855057, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.711528778076172, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.035746607929468155, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.394445419311523, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.004217156674712896, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 12.88501262664795, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.019829733297228813, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 12.401474952697754, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.04734686762094498, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.357048034667969, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.07696544378995895, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 11.800097465515137, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 
0.04197488725185394, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.4650626182556152, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.010916869156062603, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.13449563086032867, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.001422666129656136, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 22.98513412475586, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0013103536330163479, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 28.821144104003906, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.04095424711704254, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.0872745513916, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.034986190497875214, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.836925506591797, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.002148519968613982, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 12.496907234191895, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.012808754108846188, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 12.245428085327148, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.019492298364639282, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.020607948303223, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.050648823380470276, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 11.327685356140137, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.031762003898620605, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.5613644123077393, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.010421128012239933, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.1488582193851471, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.001345705590210855, 
"pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.73277473449707, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0019287464674562216, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 28.663793563842773, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.04469606652855873, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 19.84614372253418, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.03542183339595795, "pnorm/_forward_module.model.norm.weight": 26.222204208374023, "gnorm/_forward_module.model.norm.weight": 0.008269052021205425, "pnorm/_forward_module.lm_head.weight": 184.9420166015625, "gnorm/_forward_module.lm_head.weight": 0.09206656366586685} +{"step": 524288000, "pnorm/_forward_module.model.embeddings.weight": 130.3246612548828, "gnorm/_forward_module.model.embeddings.weight": 0.08718787878751755, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 23.019155502319336, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0035892766900360584, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 13.765745162963867, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.01043260470032692, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 13.687149047851562, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.012564009055495262, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.361692428588867, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.11395692825317383, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.314373970031738, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.12380198389291763, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.9658564329147339, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.009793245233595371, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.22093811631202698, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 
0.0010595459025353193, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.49486541748047, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.001968561904504895, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.185304641723633, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.06936425715684891, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.200292587280273, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.06982214003801346, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.554048538208008, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0022324086166918278, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.663812637329102, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.010417691431939602, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.586557388305664, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.013971212320029736, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.636076927185059, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.0808873102068901, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.698802947998047, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.07886005192995071, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.176032543182373, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.010697307996451855, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1237124502658844, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0006265264819376171, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.4364070892334, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0014104443835094571, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.12786865234375, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.04911782965064049, 
"pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.189735412597656, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.039300717413425446, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.67761993408203, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0021521225571632385, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.079867362976074, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.01214568316936493, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.859943389892578, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.018303576856851578, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.822196960449219, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.07044097036123276, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.94301700592041, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.05430533364415169, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.567878007888794, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.00873540248721838, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.15417517721652985, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0006440966972149909, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.56618881225586, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.001291750930249691, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 27.586475372314453, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.04710237681865692, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.47489356994629, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03540424257516861, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.789806365966797, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0019171671010553837, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 
12.373250007629395, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.009871567599475384, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 12.104911804199219, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.013470706529915333, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 10.993290901184082, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.06493211537599564, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.196977615356445, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.048449914902448654, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.4741241931915283, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.009654193185269833, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.13339169323444366, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0005450038006529212, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.652278900146484, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0025097932666540146, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 28.05372428894043, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.057725075632333755, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.732797622680664, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.03605244681239128, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.468564987182617, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.005106343887746334, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 12.99743366241455, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.022382739931344986, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 12.472896575927734, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.061500176787376404, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.418662071228027, 
"gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.07594190537929535, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 11.910921096801758, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.03892766684293747, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.486382246017456, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.014046944677829742, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.13636769354343414, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.002507086144760251, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.01252555847168, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0013176919892430305, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 28.937028884887695, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.03875409811735153, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.15878677368164, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.03288609907031059, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.860864639282227, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.002323599997907877, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 12.581269264221191, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.01610160805284977, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 12.31103801727295, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.025142844766378403, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.043536186218262, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.06123049929738045, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 11.369857788085938, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.03061932511627674, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.604661703109741, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 
0.011684687808156013, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.1532377004623413, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0015633400762453675, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.757295608520508, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0035232477821409702, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 28.801347732543945, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.06292885541915894, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 19.928030014038086, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.04750709608197212, "pnorm/_forward_module.model.norm.weight": 26.392784118652344, "gnorm/_forward_module.model.norm.weight": 0.006388008128851652, "pnorm/_forward_module.lm_head.weight": 187.33482360839844, "gnorm/_forward_module.lm_head.weight": 0.0961197018623352} +{"step": 545259520, "pnorm/_forward_module.model.embeddings.weight": 130.9851531982422, "gnorm/_forward_module.model.embeddings.weight": 0.09866008162498474, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 23.010822296142578, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.004089560825377703, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 13.840600967407227, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.010384276509284973, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 13.751606941223145, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.012585465796291828, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.344374656677246, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.13228513300418854, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.298503875732422, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.1396215409040451, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.9735642671585083, 
"gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.010566944256424904, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.22870320081710815, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0005894952337257564, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.469507217407227, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0023533094208687544, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.17711067199707, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.08006951212882996, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.2001953125, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.07893776148557663, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.537981033325195, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0027331802994012833, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.671140670776367, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.011806734837591648, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.591340065002441, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.016273802146315575, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.624966621398926, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.09463205933570862, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.689739227294922, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.09234125167131424, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.177952289581299, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.017008071765303612, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.12554606795310974, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0018205269007012248, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.424983978271484, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 
0.0016119088977575302, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.153779983520508, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.05514190346002579, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.208681106567383, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.04312380030751228, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.677536010742188, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.002266249153763056, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.125182151794434, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.011184202507138252, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.889050483703613, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.015512706711888313, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.821208000183105, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.08082005381584167, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.945377349853516, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.06610430777072906, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.603516101837158, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.011960332281887531, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.15766242146492004, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0011429593432694674, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.559635162353516, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0016767495544627309, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 27.629194259643555, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.05404861271381378, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.503725051879883, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.0419466570019722, 
"pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.801664352416992, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0025711010675877333, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 12.451581954956055, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.013163045980036259, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 12.16599178314209, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.017794452607631683, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.005953788757324, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.08220379799604416, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.214759826660156, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.06008578836917877, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.49410080909729, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.014011350460350513, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.13564267754554749, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0017591594951227307, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.629804611206055, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0020516999065876007, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 28.078433990478516, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.05817018076777458, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.746984481811523, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.04044386371970177, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.52790641784668, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.004822211340069771, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 13.090774536132812, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.021762659773230553, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 
12.534430503845215, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.05578288808465004, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.473750114440918, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.09728507697582245, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 12.0155668258667, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.04584130272269249, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.502387285232544, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.011761712841689587, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.13796770572662354, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.001554311253130436, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.039270401000977, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0014152801595628262, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 29.048904418945312, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.044542036950588226, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.22747802734375, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.038972947746515274, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.88224220275879, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0022819829173386097, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 12.665375709533691, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.0186097901314497, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 12.377079010009766, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.025023356080055237, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.06595516204834, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.05892543867230415, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 11.410065650939941, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 
0.03527301549911499, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.639575719833374, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.022267838940024376, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.1564987748861313, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0025904770009219646, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.78061294555664, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.002335346769541502, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 28.9324951171875, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.049437109380960464, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.006345748901367, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.039113517850637436, "pnorm/_forward_module.model.norm.weight": 26.560176849365234, "gnorm/_forward_module.model.norm.weight": 0.009667621925473213, "pnorm/_forward_module.lm_head.weight": 189.63722229003906, "gnorm/_forward_module.lm_head.weight": 0.09062054008245468} +{"step": 566231040, "pnorm/_forward_module.model.embeddings.weight": 131.616943359375, "gnorm/_forward_module.model.embeddings.weight": 0.09257137775421143, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 23.002197265625, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.004241005051881075, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 13.91598129272461, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.010182862170040607, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 13.815650939941406, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.011858198791742325, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.325624465942383, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.1274988204240799, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.281806945800781, 
"gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.13117951154708862, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.9838422536849976, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.008924507535994053, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.23655036091804504, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0010059983469545841, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.44268035888672, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0023919311352074146, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.16490936279297, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.07394254207611084, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.197500228881836, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.07484568655490875, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.5207576751709, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0026270339731127024, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.681082725524902, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.011670769192278385, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.599470138549805, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.01925392635166645, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.610475540161133, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.08974747359752655, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.67731761932373, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.08121136575937271, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.1834654808044434, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.013904587365686893, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.12758590281009674, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 
0.0007926201215013862, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.415035247802734, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.00142424157820642, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.17845344543457, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.05246545001864433, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.22659683227539, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.040093131363391876, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.67397689819336, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.002328513190150261, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.164811134338379, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.012087262235581875, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.915053367614746, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.018740398809313774, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.8154296875, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.07583335787057877, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.94276237487793, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.05822914093732834, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.638157367706299, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.013864818960428238, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.16086547076702118, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0007925962563604116, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.55381202697754, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0013794010737910867, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 27.66927146911621, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.04926542565226555, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 
19.53152084350586, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03743276745080948, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.812376022338867, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0019770576618611813, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 12.52827262878418, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.00947288516908884, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 12.226068496704102, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.013163702562451363, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.014494895935059, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.06827860325574875, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.228662490844727, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.052021004259586334, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.5103113651275635, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.010600379668176174, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.13697491586208344, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0012660971842706203, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.607025146484375, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.001842746278271079, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 28.096996307373047, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.05720341578125954, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.75773811340332, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.03783155605196953, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.58294105529785, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.00474676163867116, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 13.173293113708496, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 
0.021747080609202385, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 12.590808868408203, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.06024261936545372, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.529168128967285, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.09579355269670486, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 12.124587059020996, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.04195033386349678, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.516636371612549, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.012874141335487366, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.1391449123620987, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0017544724978506565, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.064897537231445, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.001527615706436336, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 29.154083251953125, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.04257133975625038, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.291881561279297, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.03556307032704353, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.90379524230957, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.00222900346852839, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 12.739968299865723, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.014222950674593449, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 12.436185836791992, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.0247640460729599, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.08928108215332, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.06532065570354462, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 
11.451973915100098, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.03269795700907707, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.6775357723236084, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.01236772257834673, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.15983805060386658, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0015891651855781674, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.803518295288086, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0023891828022897243, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 29.057613372802734, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.050376880913972855, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.079425811767578, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.036837853491306305, "pnorm/_forward_module.model.norm.weight": 26.72469139099121, "gnorm/_forward_module.model.norm.weight": 0.010827641934156418, "pnorm/_forward_module.lm_head.weight": 191.869384765625, "gnorm/_forward_module.lm_head.weight": 0.11172198504209518} +{"step": 587202560, "pnorm/_forward_module.model.embeddings.weight": 132.2198486328125, "gnorm/_forward_module.model.embeddings.weight": 0.08466605842113495, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.993871688842773, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0034304216969758272, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 13.985758781433105, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.010022037662565708, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 13.875059127807617, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.01246686838567257, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.307923316955566, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.11080293357372284, 
"pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.265799522399902, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.1179494708776474, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 1.9922033548355103, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.008832824416458607, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.24362407624721527, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0007083591190166771, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.42000961303711, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0019828847143799067, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.15715980529785, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.06831581145524979, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.196374893188477, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.06779512017965317, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.504674911499023, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.002034249948337674, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.691499710083008, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.010650614276528358, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.608168601989746, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.015366698615252972, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.596370697021484, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.07792558521032333, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.665386199951172, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.07603589445352554, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.1879355907440186, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.013391397893428802, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 
0.12948215007781982, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0008732040878385305, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.40667152404785, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.001271469402126968, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.202617645263672, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.04833031818270683, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.243663787841797, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03774266690015793, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.670780181884766, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0017527805175632238, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.200374603271484, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.00908010546118021, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.939257621765137, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.012638548389077187, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.811546325683594, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.06713636219501495, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.941020965576172, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.05391576141119003, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.6614010334014893, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.008641388267278671, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.16312715411186218, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0007683138246648014, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.548702239990234, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.001451854594051838, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 27.70584487915039, 
"gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.04754660651087761, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.557207107543945, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03594350442290306, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.825590133666992, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.001962720649316907, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 12.608642578125, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.008342716842889786, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 12.290182113647461, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.012195616029202938, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.02466106414795, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.06873508542776108, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.24301528930664, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.04984309896826744, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.5239193439483643, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.008120325393974781, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.1382778286933899, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0005642768810503185, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.5872802734375, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.003617769805714488, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 28.1104679107666, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.06855738908052444, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.76482582092285, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.04044617712497711, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.621042251586914, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.006745758466422558, 
"pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 13.237469673156738, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.03647832199931145, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 12.639154434204102, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.0770118311047554, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.578289985656738, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.08561540395021439, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 12.224433898925781, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.037459395825862885, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.5290982723236084, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.011814338155090809, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.14033697545528412, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0019129421561956406, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.086326599121094, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0013586321147158742, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 29.24787712097168, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.040182214230298996, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.348831176757812, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.03227870166301727, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.925457000732422, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.001972934463992715, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 12.814481735229492, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.012482201680541039, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 12.493943214416504, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.018223850056529045, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 
11.112107276916504, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.06287754327058792, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 11.493610382080078, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.029877040535211563, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.7098093032836914, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.008406882174313068, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.16264715790748596, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0010658090468496084, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.826942443847656, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0030368538573384285, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 29.175853729248047, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.054855670779943466, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.149642944335938, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.04392606392502785, "pnorm/_forward_module.model.norm.weight": 26.88644790649414, "gnorm/_forward_module.model.norm.weight": 0.006414105650037527, "pnorm/_forward_module.lm_head.weight": 194.02978515625, "gnorm/_forward_module.lm_head.weight": 0.07362798601388931} +{"step": 608174080, "pnorm/_forward_module.model.embeddings.weight": 132.7937774658203, "gnorm/_forward_module.model.embeddings.weight": 0.09324617683887482, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.982986450195312, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.003494745586067438, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 14.049301147460938, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.007852527312934399, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 13.929021835327148, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.009071563370525837, 
"pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.287379264831543, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.11840179562568665, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.24687385559082, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.1226731613278389, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0000369548797607, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.006339225452393293, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.25010600686073303, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.00037607696140185, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.39397430419922, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0019934631418436766, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.14256477355957, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.07063397765159607, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.19086456298828, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.08344034850597382, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.49014663696289, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0026774427387863398, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.703042030334473, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.010726544074714184, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.617230415344238, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.01696922816336155, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.58362865447998, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.08981208503246307, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.654370307922363, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.08598429709672928, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.18705415725708, 
"gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.01251278817653656, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.13053485751152039, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0011396242771297693, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.395992279052734, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.001521183643490076, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.220935821533203, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.05373520031571388, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.25668716430664, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.04495440050959587, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.667098999023438, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0020174921955913305, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.231640815734863, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.009911121800541878, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.960105895996094, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.014930813573300838, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.807196617126465, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.07760138809680939, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.938215255737305, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.06159059330821037, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.6775355339050293, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.011447370983660221, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.16438478231430054, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0009011897491291165, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.5418643951416, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 
0.001857844996266067, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 27.73666763305664, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.05617256462574005, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.580018997192383, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.04491807147860527, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.83747100830078, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.002078188117593527, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 12.682576179504395, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.01001430582255125, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 12.348503112792969, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.01324539352208376, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.033214569091797, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.0779595673084259, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.255447387695312, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.058040834963321686, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.5319933891296387, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.01112950686365366, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.1387225240468979, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0008082672138698399, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.565059661865234, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.005388081539422274, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 28.11310577392578, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.09838428348302841, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.764129638671875, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.058750614523887634, 
"pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.663280487060547, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.010053387843072414, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 13.305691719055176, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.08825492113828659, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 12.690781593322754, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.13109861314296722, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.628827095031738, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.11418887227773666, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 12.331921577453613, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.035602446645498276, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.5377237796783447, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.018618421629071236, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.14095674455165863, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0034696385264396667, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.105987548828125, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0012278016656637192, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 29.334823608398438, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.036237411201000214, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.401865005493164, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.03090221807360649, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.94660186767578, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0018456674879416823, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 12.884806632995605, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.013428877107799053, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 
12.547882080078125, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.018325606361031532, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.13548755645752, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.051216933876276016, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 11.53606128692627, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.026097623631358147, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.7310571670532227, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.015668107196688652, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.16479219496250153, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0019545224495232105, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.850759506225586, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0020487052388489246, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 29.290739059448242, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.03926685079932213, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.217151641845703, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.031516872346401215, "pnorm/_forward_module.model.norm.weight": 27.044815063476562, "gnorm/_forward_module.model.norm.weight": 0.008031395263969898, "pnorm/_forward_module.lm_head.weight": 196.09226989746094, "gnorm/_forward_module.lm_head.weight": 0.08150552213191986} +{"step": 629145600, "pnorm/_forward_module.model.embeddings.weight": 133.3385772705078, "gnorm/_forward_module.model.embeddings.weight": 0.10836733877658844, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.974563598632812, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.004690782632678747, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 14.113333702087402, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.012650684453547001, 
"pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 13.983172416687012, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.015555073507130146, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.269970893859863, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.1544216424226761, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.231014251708984, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.1499316543340683, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0074679851531982, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.01250538881868124, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.25581094622612, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0009166421950794756, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.37277603149414, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.002318240934982896, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.13494110107422, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.08529126644134521, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.189340591430664, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.09264523535966873, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.47748565673828, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0032654614187777042, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.719788551330566, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.012197526171803474, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.631291389465332, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.022149186581373215, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.571518898010254, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.11056976765394211, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.643708229064941, 
"gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.09578881412744522, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.193901777267456, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.015614592470228672, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.13215608894824982, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0009852549992501736, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.38869857788086, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0016956684412434697, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.24299430847168, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.06243203580379486, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.272274017333984, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.0499282144010067, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.66256332397461, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.00281900935806334, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.258501052856445, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.012032398022711277, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.978117942810059, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.02391112595796585, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.802173614501953, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.09776932001113892, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.934769630432129, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.06709928810596466, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.6955175399780273, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.010648575611412525, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.16610410809516907, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 
0.0009204015950672328, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.537900924682617, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0019029694376513362, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 27.769189834594727, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.05620720237493515, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.60368537902832, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.04538873955607414, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.84868621826172, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0024154481943696737, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 12.756836891174316, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.009764833375811577, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 12.407959938049316, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.0147289102897048, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.039427757263184, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.08897218853235245, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.265048027038574, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.0627334713935852, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.543116807937622, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.011759890243411064, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.13941286504268646, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0012215664610266685, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.54686737060547, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.004831778351217508, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 28.115848541259766, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.08649338036775589, 
"pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.763137817382812, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.05610034614801407, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.695987701416016, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.009589050896465778, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 13.366514205932617, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.07373480498790741, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 12.738666534423828, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.1178293228149414, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.673970222473145, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.1159856840968132, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 12.429946899414062, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.04170767590403557, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.546757698059082, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.02106037549674511, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.14174695312976837, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.003995003644376993, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.124046325683594, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.001438274746760726, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 29.416336059570312, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.04187346249818802, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.452037811279297, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.03597191348671913, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 22.966150283813477, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.002060454338788986, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 12.95345401763916, 
"gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.011093342676758766, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 12.601015090942383, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.01517839077860117, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.157509803771973, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.06098154932260513, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 11.576085090637207, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.03079625405371189, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.7534050941467285, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.013347704894840717, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.16677144169807434, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0017338368343189359, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.873069763183594, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0016621540999040008, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 29.397188186645508, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.03955509141087532, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.280237197875977, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.032456040382385254, "pnorm/_forward_module.model.norm.weight": 27.198617935180664, "gnorm/_forward_module.model.norm.weight": 0.00675298972055316, "pnorm/_forward_module.lm_head.weight": 198.05392456054688, "gnorm/_forward_module.lm_head.weight": 0.07027759402990341} +{"step": 650117120, "pnorm/_forward_module.model.embeddings.weight": 133.85455322265625, "gnorm/_forward_module.model.embeddings.weight": 0.09970211982727051, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.96574592590332, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.004195492714643478, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 
14.170971870422363, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.011438763700425625, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.031952857971191, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.015019737184047699, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.252745628356934, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.1420404613018036, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.215036392211914, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.14201407134532928, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.013953924179077, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.011194159276783466, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.2606754004955292, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.00040203757816925645, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.352523803710938, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.002484887605533004, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.127490997314453, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.0817767009139061, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.187217712402344, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.08855811506509781, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.46613883972168, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.002913471544161439, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.737299919128418, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.011170684359967709, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.644852638244629, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.01703702285885811, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.560013771057129, 
"gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.09987596422433853, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.63351821899414, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.08720856159925461, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.2025234699249268, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.012439129874110222, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.13377317786216736, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0010997412027791142, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.38090705871582, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0018190500559285283, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.261865615844727, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.058157749474048615, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.28566551208496, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.046226900070905685, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.660293579101562, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0022633341141045094, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.287147521972656, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.011948765255510807, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 11.997123718261719, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.01982947252690792, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.798735618591309, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.08062399178743362, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.933127403259277, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.06034021079540253, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.715135097503662, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 
0.010950464755296707, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.16797882318496704, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0013053604634478688, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.53336524963379, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0018183885840699077, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 27.79769515991211, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.05581940710544586, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.624948501586914, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.04442049190402031, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.860965728759766, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.002422739053145051, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 12.827947616577148, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.010530954226851463, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 12.464631080627441, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.015794388949871063, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.047651290893555, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.08449961990118027, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.276342391967773, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.058088939636945724, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.554626226425171, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.01034450065344572, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.14027006924152374, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0005988876800984144, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.53005599975586, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.004483302589505911, 
"pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 28.115646362304688, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.08249295502901077, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.761119842529297, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.0570177398622036, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.72606658935547, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.009209664538502693, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 13.422709465026855, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.06558062881231308, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 12.783963203430176, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.10313582420349121, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.713570594787598, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.13215680420398712, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 12.521140098571777, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.044025763869285583, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.5541129112243652, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.020528966560959816, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.14235559105873108, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.004015712533146143, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.139238357543945, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0014953905483707786, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 29.489532470703125, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.04719872772693634, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.497446060180664, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.038936685770750046, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 
22.986154556274414, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.002269535791128874, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.016969680786133, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.01553372759371996, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 12.649957656860352, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.02458859421312809, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.180953025817871, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.06933137029409409, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 11.61815357208252, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.03522561863064766, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.7769992351531982, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.01943039707839489, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.1689138412475586, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0027445531450212, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.89436912536621, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0025103711523115635, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 29.496397018432617, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.05317879468202591, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.33913230895996, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.04049231484532356, "pnorm/_forward_module.model.norm.weight": 27.34844398498535, "gnorm/_forward_module.model.norm.weight": 0.006223059259355068, "pnorm/_forward_module.lm_head.weight": 199.92138671875, "gnorm/_forward_module.lm_head.weight": 0.10315292328596115} +{"step": 671088640, "pnorm/_forward_module.model.embeddings.weight": 134.34368896484375, "gnorm/_forward_module.model.embeddings.weight": 0.0846937745809555, 
"pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.960023880004883, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.003437833394855261, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 14.230409622192383, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.009906584396958351, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.082666397094727, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.012052349746227264, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.23956298828125, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.12025292217731476, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.203167915344238, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.12798042595386505, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0208969116210938, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.010388758964836597, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.26542940735816956, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0019085647072643042, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.33293914794922, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0020618666894733906, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.120115280151367, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.07124169915914536, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.184185028076172, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.08709387481212616, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.455699920654297, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.002756789093837142, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.753728866577148, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.010076207108795643, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 
11.658032417297363, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.01671478897333145, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.54946231842041, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.09131456911563873, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.623910903930664, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.08907715976238251, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.2140066623687744, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.012817910872399807, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.13589230179786682, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0013894832227379084, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.3750057220459, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0016371897654607892, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.281246185302734, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.053360357880592346, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.298925399780273, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.048127397894859314, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.65839958190918, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0021985783241689205, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.317134857177734, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.010844511911273003, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.017585754394531, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.01712421327829361, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.79453182220459, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.07893633842468262, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.929996490478516, 
"gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.06411401927471161, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.737887382507324, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.010321340523660183, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.17011654376983643, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0008286783122457564, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.53103256225586, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0021373501513153315, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 27.826196670532227, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.05719630792737007, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.645950317382812, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.0493239089846611, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.872238159179688, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0022966659162193537, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 12.897600173950195, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.009323777630925179, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 12.520064353942871, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.01317282672971487, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.053683280944824, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.07927346974611282, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.285006523132324, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.06342759728431702, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.5672054290771484, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.009879220277071, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.14156347513198853, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 
0.001128380885347724, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.51361846923828, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.010579629801213741, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 28.111576080322266, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.1473333090543747, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.756555557250977, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.08773857355117798, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.754615783691406, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.01797524280846119, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 13.480687141418457, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.15974865853786469, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 12.830183982849121, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.23904019594192505, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.749584197998047, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.10495985299348831, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 12.607535362243652, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.03531548008322716, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.559389591217041, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.029821598902344704, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.1428786963224411, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0058329044841229916, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.151151657104492, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.001184387831017375, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 29.55353355407715, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.03511161729693413, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 
20.53696632385254, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.029869263991713524, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.005184173583984, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.001977429259568453, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.077054977416992, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.013847687281668186, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 12.695968627929688, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.02086167223751545, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.203461647033691, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.04887531325221062, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 11.658379554748535, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.025423688814044, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.797100305557251, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.01776924915611744, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.17091162502765656, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0019246295560151339, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.910486221313477, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.00213628844358027, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 29.579984664916992, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.04139848053455353, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.388519287109375, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.03337901458144188, "pnorm/_forward_module.model.norm.weight": 27.492111206054688, "gnorm/_forward_module.model.norm.weight": 0.007569338660687208, "pnorm/_forward_module.lm_head.weight": 201.6817626953125, "gnorm/_forward_module.lm_head.weight": 0.0697295293211937} +{"step": 692060160, 
"pnorm/_forward_module.model.embeddings.weight": 134.8092803955078, "gnorm/_forward_module.model.embeddings.weight": 0.08929193764925003, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.95271873474121, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.003702018177136779, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 14.287496566772461, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.009377451613545418, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.130566596984863, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.01139868050813675, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.22375202178955, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.12306412309408188, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.188481330871582, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.12143574655056, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0259532928466797, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.008768623694777489, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.268900066614151, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0006963612977415323, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.314979553222656, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.001863899640738964, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.113679885864258, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.06723637878894806, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.1821346282959, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.07045397907495499, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.444143295288086, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0023465261328965425, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.767558097839355, 
"gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.009487834759056568, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.6689453125, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.013917691074311733, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.537044525146484, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.08402647823095322, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.61259651184082, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.0735391154885292, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.2248544692993164, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.011156493797898293, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1377658247947693, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00043780551641248167, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.370161056518555, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0013803554465994239, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.30032730102539, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.049490608274936676, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.31199836730957, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03818056359887123, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.656328201293945, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0020427382551133633, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.342205047607422, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.010822472162544727, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.033893585205078, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.016468271613121033, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.790641784667969, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 
0.06933659315109253, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.927129745483398, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.05107486620545387, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.759277582168579, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.011231140233576298, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.17191559076309204, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0009110230021178722, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.530750274658203, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0013648406602442265, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 27.855342864990234, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.04589239880442619, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.667016983032227, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03618251904845238, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.883272171020508, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.001859284471720457, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 12.962814331054688, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.008915445767343044, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 12.572606086730957, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.01283302716910839, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.058589935302734, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.06489621102809906, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.293240547180176, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.04664922133088112, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.580382823944092, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.009206647984683514, 
"pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.14251630008220673, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0010339347645640373, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.502357482910156, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.002086567459627986, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 28.111757278442383, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.05230613052845001, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.755197525024414, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.042504873126745224, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.773799896240234, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.005116415675729513, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 13.52672004699707, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.03278525546193123, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 12.868165969848633, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.05273598060011864, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.782397270202637, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.10807187110185623, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 12.684576034545898, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.03691733628511429, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.5662596225738525, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.016490206122398376, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.1434997171163559, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.002536082174628973, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.165729522705078, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0012231811415404081, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 
29.617046356201172, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.03892673924565315, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.576860427856445, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.03922191634774208, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.024139404296875, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0030940501019358635, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.130508422851562, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.03363807871937752, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 12.736977577209473, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.04354558140039444, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.227702140808105, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.0637601986527443, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 11.701126098632812, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.03448743745684624, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.818361759185791, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.047110579907894135, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.17286644876003265, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.006085975095629692, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.92862892150879, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.002984287915751338, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 29.663928985595703, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.05215289071202278, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.439010620117188, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.03861791267991066, "pnorm/_forward_module.model.norm.weight": 27.63249969482422, "gnorm/_forward_module.model.norm.weight": 0.006089869886636734, 
"pnorm/_forward_module.lm_head.weight": 203.3670654296875, "gnorm/_forward_module.lm_head.weight": 0.0928235799074173} +{"step": 713031680, "pnorm/_forward_module.model.embeddings.weight": 135.25180053710938, "gnorm/_forward_module.model.embeddings.weight": 0.0843285322189331, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.942996978759766, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.003430221462622285, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 14.337125778198242, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.010223323479294777, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.172155380249023, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.01344671193510294, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.205398559570312, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.11585825681686401, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.171051025390625, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.11454811692237854, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0283870697021484, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.009221279993653297, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.2718670666217804, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.000892713840585202, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.29642677307129, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0018370113102719188, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.10466194152832, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.06413992494344711, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.178146362304688, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.07044485211372375, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.433055877685547, 
"gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.002148950705304742, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.78374195098877, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.009820623323321342, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.680913925170898, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.014268044382333755, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.524354934692383, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.07715494930744171, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.60086441040039, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.07190807163715363, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.230830430984497, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.01210275199264288, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.13904543220996857, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0009636766626499593, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.365514755249023, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0012127895606681705, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.317771911621094, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.047282226383686066, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.323959350585938, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03823622688651085, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.654603958129883, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0019197195069864392, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.36749267578125, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.00931661669164896, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.050914764404297, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 
0.014471595175564289, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.786714553833008, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.06824307888746262, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.924304962158203, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.052845802158117294, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.7750260829925537, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.008208861574530602, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.1728971004486084, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0008198431460186839, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.53038787841797, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0014916908694431186, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 27.882265090942383, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.04683845117688179, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.6868896484375, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03774050995707512, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.895828247070312, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0018206179374828935, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 13.028973579406738, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.008112844079732895, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 12.62612247467041, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.011301987804472446, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.064562797546387, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.06587550044059753, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.30220890045166, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.05009397864341736, 
"pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.5926127433776855, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.006535707972943783, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.1433146893978119, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0004406883963383734, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.488460540771484, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0033001156989485025, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 28.102720260620117, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.06384614109992981, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.748334884643555, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.0492081455886364, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.79669761657715, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0071243117563426495, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 13.573709487915039, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.052821096032857895, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 12.906181335449219, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.08689448237419128, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.815587043762207, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.12187599390745163, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 12.764604568481445, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.03672080114483833, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.5706210136413574, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.014068582095205784, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.14384454488754272, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0021231865976005793, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 
23.177291870117188, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0013400536263361573, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 29.672954559326172, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.04063091427087784, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.612407684326172, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.03430015221238136, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.043554306030273, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.002261859131976962, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.182646751403809, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.017538679763674736, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 12.77649211883545, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.02552494965493679, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.252890586853027, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.06143781170248985, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 11.745694160461426, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.030048469081521034, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.842107057571411, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.023395542055368423, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.17508681118488312, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0030105591285973787, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.945880889892578, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0025402538012713194, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 29.74236297607422, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.049291957169771194, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.487258911132812, 
"gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.03721962124109268, "pnorm/_forward_module.model.norm.weight": 27.771671295166016, "gnorm/_forward_module.model.norm.weight": 0.007488827686756849, "pnorm/_forward_module.lm_head.weight": 204.98007202148438, "gnorm/_forward_module.lm_head.weight": 0.08621737360954285} +{"step": 734003200, "pnorm/_forward_module.model.embeddings.weight": 135.67218017578125, "gnorm/_forward_module.model.embeddings.weight": 0.09797845035791397, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.935222625732422, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0042325761169195175, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 14.385210990905762, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.010008271783590317, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.21323013305664, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.012262441217899323, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.189654350280762, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.13689392805099487, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.15589427947998, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.13134519755840302, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.030973196029663, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.009610936976969242, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.27541565895080566, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.000648517336230725, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.279727935791016, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.002154143527150154, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.098203659057617, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.07807356864213943, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 
19.1755313873291, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.0863145962357521, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.423696517944336, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0029226297046989202, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.800469398498535, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.011845839209854603, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.694205284118652, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.019389821216464043, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.512797355651855, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.09587068110704422, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.58995246887207, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.0819927304983139, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.2381293773651123, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.01637057587504387, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.14024761319160461, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0014985011657699943, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.36261558532715, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0015060951700434089, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.337013244628906, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.05649150162935257, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.337108612060547, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.045231644064188004, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.65395736694336, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.002409205539152026, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.393370628356934, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 
0.01227942667901516, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.068059921264648, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.02117595262825489, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.783492088317871, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.0810232013463974, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.922270774841309, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.056500144302845, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.7944509983062744, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.01150998380035162, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.17465318739414215, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0007179492968134582, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.529285430908203, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.001459812861867249, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 27.90689468383789, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.05392614006996155, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.70524024963379, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.04434484988451004, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.90747833251953, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0021234890446066856, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 13.091876029968262, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.008881035260856152, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 12.675975799560547, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.01366397924721241, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.070077896118164, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.07936809211969376, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 
11.310572624206543, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.054264724254608154, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.602910280227661, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.008299482986330986, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.14377465844154358, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0006166360690258443, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.474451065063477, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0024151629768311977, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 28.09170913696289, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.06485194712877274, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.740224838256836, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.06089286878705025, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.817415237426758, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.007204937282949686, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 13.623347282409668, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.04574649780988693, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 12.946109771728516, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.07227539271116257, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.845100402832031, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.15660282969474792, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 12.836613655090332, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.0376940593123436, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.574814558029175, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.009147176519036293, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.14430370926856995, 
"gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0010895373998209834, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.189128875732422, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0013427375815808773, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 29.727130889892578, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.04020010307431221, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.646780014038086, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.03395417705178261, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.062808990478516, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0020382797811180353, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.233829498291016, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.011584911495447159, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 12.81599235534668, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.016036294400691986, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.277209281921387, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.05971907824277878, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 11.789109230041504, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.02824847400188446, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.8578908443450928, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.013760969042778015, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.1764129102230072, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0018451636424288154, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.96306800842285, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0024017433170229197, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 29.816665649414062, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 
0.04546718671917915, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.533390045166016, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.03587469086050987, "pnorm/_forward_module.model.norm.weight": 27.909011840820312, "gnorm/_forward_module.model.norm.weight": 0.0077365986071527, "pnorm/_forward_module.lm_head.weight": 206.50244140625, "gnorm/_forward_module.lm_head.weight": 0.0653042197227478} +{"step": 754974720, "pnorm/_forward_module.model.embeddings.weight": 136.07118225097656, "gnorm/_forward_module.model.embeddings.weight": 0.09665370732545853, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.928749084472656, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.004038245417177677, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 14.43343734741211, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.011147202923893929, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.254497528076172, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.013266954571008682, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.17577838897705, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.14149537682533264, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.142879486083984, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.14144586026668549, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.033261299133301, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.009792673401534557, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.27849239110946655, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.00156656454782933, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.264204025268555, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.002007000846788287, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.093563079833984, 
"gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.0789017304778099, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.1737117767334, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.08237800002098083, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.415058135986328, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0029452999588102102, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.818763732910156, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.012610075995326042, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.708589553833008, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.018520155921578407, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.502131462097168, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.1029830053448677, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.579854965209961, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.09005337953567505, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.2487385272979736, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.021148694679141045, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.14180296659469604, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.003473537042737007, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.361530303955078, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0018854454392567277, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.35826301574707, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.059692032635211945, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.35123062133789, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.046181656420230865, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.653247833251953, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 
0.003549326444044709, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.419600486755371, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.01933566853404045, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.085742950439453, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.03172016888856888, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.780291557312012, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.09428482502698898, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.91946792602539, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.05947402864694595, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.8112356662750244, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.011218794621527195, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.17617270350456238, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0011402466334402561, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.53093719482422, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0016213735798373818, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 27.934123992919922, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.053675755858421326, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.72575569152832, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.0413532480597496, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.92035484313965, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.002221454866230488, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 13.158726692199707, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.010351826436817646, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 12.730569839477539, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.013927721418440342, 
"pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.076579093933105, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.07810936868190765, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.319425582885742, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.054055262356996536, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.6128735542297363, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.011155880987644196, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.1445290595293045, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.001297173323109746, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.46291160583496, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.005117642227560282, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 28.086503982543945, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.08388591557741165, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.737369537353516, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.055967479944229126, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.835233688354492, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.009401131421327591, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 13.668595314025879, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.06999868899583817, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 12.983047485351562, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.11377683281898499, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.87049388885498, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.11390859633684158, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 12.899470329284668, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.04136918485164642, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 
2.581475257873535, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.016778109595179558, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.14490912854671478, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0029635983519256115, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.20096778869629, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.001547745312564075, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 29.779525756835938, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.04443550109863281, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.67987632751465, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.036157261580228806, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.081758499145508, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0023551390040665865, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.28552532196045, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.014331337064504623, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 12.854262351989746, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.023810744285583496, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.30121898651123, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.06704218685626984, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 11.832493782043457, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.030763130635023117, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.872176170349121, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.015274704433977604, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.17765051126480103, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0018217448377981782, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.97845458984375, 
"gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.00269183237105608, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 29.88664436340332, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.05265451967716217, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.57429313659668, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.04224680736660957, "pnorm/_forward_module.model.norm.weight": 28.04055404663086, "gnorm/_forward_module.model.norm.weight": 0.005349005572497845, "pnorm/_forward_module.lm_head.weight": 207.9368896484375, "gnorm/_forward_module.lm_head.weight": 0.08904996514320374} +{"step": 775946240, "pnorm/_forward_module.model.embeddings.weight": 136.4510498046875, "gnorm/_forward_module.model.embeddings.weight": 0.06702663749456406, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.924442291259766, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.002772872569039464, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 14.48292350769043, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.007852975279092789, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.29662799835205, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.009227829985320568, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.16481876373291, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.09252835065126419, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.132954597473145, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.09666883200407028, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.037780523300171, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.006196698173880577, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.28171810507774353, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0008407388813793659, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 
22.25069236755371, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.00144023762550205, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.0922908782959, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.05162239819765091, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.17380714416504, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.05353643372654915, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.406391143798828, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.001691583194769919, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.838111877441406, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.00833314098417759, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.723450660705566, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.011154571548104286, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.491730690002441, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.06417413800954819, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.569997787475586, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.061382997781038284, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.256359100341797, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.01023099198937416, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1432262659072876, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.001046627527102828, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.357324600219727, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.001106876297853887, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.375213623046875, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.03975257650017738, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.36290740966797, 
"gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.0312957763671875, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.652956008911133, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0015097418799996376, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.446064949035645, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.008198934607207775, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.104392051696777, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.0118746692314744, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.77802562713623, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.05310474708676338, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.917634963989258, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.04326550289988518, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.8287906646728516, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.007582833990454674, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.17783313989639282, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0009439038694836199, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.5314998626709, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.00109114870429039, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 27.959911346435547, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.038537293672561646, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.745296478271484, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.030721524730324745, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.9329891204834, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0015021024737507105, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 13.22321891784668, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.006807069759815931, 
"pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 12.783366203308105, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.010234990157186985, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.081950187683105, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.05122341960668564, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.32691764831543, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.03892672061920166, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.6285645961761475, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.0059224264696240425, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.145813450217247, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0005452464683912694, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.45343589782715, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.001834383117966354, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 28.082847595214844, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.04507686570286751, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.734317779541016, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.03906684368848801, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.847808837890625, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.004432880785316229, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 13.710033416748047, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.034978367388248444, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.018083572387695, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.04563557356595993, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.894393920898438, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.09197548776865005, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 
12.958243370056152, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.02964828349649906, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.5882375240325928, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.006567356642335653, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.14560608565807343, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0004977292264811695, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.21078109741211, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0009657903574407101, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 29.82769775390625, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.031321533024311066, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.711078643798828, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.026971373707056046, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.102161407470703, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0013352916575968266, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.338165283203125, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.00841483473777771, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 12.893717765808105, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.012700546532869339, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.324860572814941, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.03870975971221924, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 11.875473022460938, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.02258933149278164, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.8886866569519043, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.010698539204895496, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.1792582869529724, 
"gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.001250137691386044, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 22.991636276245117, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0017109822947531939, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 29.94814682006836, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.035355377942323685, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.612878799438477, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.029855884611606598, "pnorm/_forward_module.model.norm.weight": 28.16875648498535, "gnorm/_forward_module.model.norm.weight": 0.006586446426808834, "pnorm/_forward_module.lm_head.weight": 209.2964630126953, "gnorm/_forward_module.lm_head.weight": 0.05597059428691864} +{"step": 796917760, "pnorm/_forward_module.model.embeddings.weight": 136.81356811523438, "gnorm/_forward_module.model.embeddings.weight": 0.08052036166191101, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.9164981842041, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0034140809439122677, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 14.528804779052734, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.008100314997136593, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.335999488830566, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.009168094024062157, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.149456977844238, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.12190574407577515, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.118474960327148, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.11563693732023239, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0381813049316406, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.00678676925599575, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 
0.2840569019317627, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0009607343818061054, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.232818603515625, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0018513007089495659, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.083593368530273, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.06649553775787354, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.169845581054688, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.07527559250593185, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.39675521850586, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0025563635863363743, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.854055404663086, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.009285985492169857, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.735601425170898, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.016212014481425285, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.480488777160645, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.08777830004692078, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.5595703125, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.06972800195217133, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.2647485733032227, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.010953822173178196, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1445082426071167, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0009063383913598955, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.353683471679688, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0013597582001239061, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.39214324951172, 
"gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.04984012991189957, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.374515533447266, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.040256716310977936, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.65257453918457, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0019008142407983541, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.470624923706055, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.010694299824535847, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.121496200561523, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.01677878201007843, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.77488899230957, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.0706184133887291, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.915802955627441, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.04907408356666565, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.8470351696014404, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.010611104778945446, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.17924396693706512, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0009145158692263067, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.530994415283203, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0015618964098393917, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 27.982799530029297, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.047711893916130066, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.76336669921875, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.039370957762002945, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.944488525390625, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 
0.0018061203882098198, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 13.284429550170898, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.007812881842255592, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 12.833401679992676, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.01265917532145977, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.08626651763916, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.06614292412996292, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.334331512451172, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.047124557197093964, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.641245126724243, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.007162606343626976, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.14668002724647522, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0007564174593426287, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.442354202270508, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.003691149177029729, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 28.073816299438477, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.06233417987823486, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.728336334228516, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.06009867042303085, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.864547729492188, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.007935965433716774, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 13.750018119812012, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.049821723252534866, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.050531387329102, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.07849938422441483, 
"pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.920889854431152, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.14597684144973755, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.024733543395996, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.035511769354343414, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.5918869972229004, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.012190493755042553, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.1458360254764557, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.002054681070148945, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.220924377441406, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0011634617112576962, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 29.874568939208984, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.03737368807196617, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.741676330566406, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.03141983598470688, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.123369216918945, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.001929140416905284, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.387592315673828, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.012281330302357674, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 12.930367469787598, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.02187546156346798, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.350813865661621, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.058840278536081314, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 11.921873092651367, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.027115095406770706, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 
2.909567356109619, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.012964163906872272, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.18104052543640137, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.001661529066041112, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.00706672668457, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0027390264440327883, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.0133056640625, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.04890192672610283, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.65381622314453, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.03935280069708824, "pnorm/_forward_module.model.norm.weight": 28.297029495239258, "gnorm/_forward_module.model.norm.weight": 0.007687462959438562, "pnorm/_forward_module.lm_head.weight": 210.59397888183594, "gnorm/_forward_module.lm_head.weight": 0.08222661912441254} +{"step": 817889280, "pnorm/_forward_module.model.embeddings.weight": 137.15567016601562, "gnorm/_forward_module.model.embeddings.weight": 0.07306475192308426, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.911684036254883, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0029923662077635527, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 14.572299003601074, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.007676415611058474, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.373515129089355, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.009735281579196453, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.138505935668945, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.09928611665964127, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.10787296295166, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.09428305178880692, 
"pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0419728755950928, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.006082577630877495, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.2869444787502289, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0004130627494305372, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.21906852722168, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.001391372294165194, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.08026885986328, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.053593724966049194, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.168737411499023, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.057493679225444794, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.38874053955078, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0019823594484478235, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.871614456176758, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.008674097247421741, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.749263763427734, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.011728547513484955, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.470274925231934, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.06578321754932404, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.54985237121582, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.05714655667543411, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.271392583847046, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.009599674493074417, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1457970291376114, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0010318869026377797, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 
22.3531436920166, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0011097453534603119, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.412086486816406, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.04084980860352516, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.387971878051758, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03246660158038139, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.652427673339844, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0015836319653317332, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.4940767288208, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.009547404013574123, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.137714385986328, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.013843519613146782, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.77287483215332, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.05532437562942505, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.914285659790039, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.041457608342170715, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.8592519760131836, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.007080974522978067, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.1802183836698532, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0005010329186916351, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.53291130065918, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.001173767144791782, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.00729751586914, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.03921736031770706, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.7824764251709, 
"gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.0325540266931057, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.955636978149414, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.001469228183850646, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 13.342194557189941, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.006825422868132591, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 12.880573272705078, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.010694421827793121, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.091304779052734, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.052316002547740936, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.341573715209961, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.03867855295538902, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.6573097705841064, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.006216119509190321, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.1481006145477295, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0005404132534749806, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.4321346282959, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.002337028505280614, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 28.065975189208984, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.04865556210279465, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.723478317260742, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.04619142785668373, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.87859535217285, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.005220006685703993, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 13.78917407989502, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.03685825690627098, 
"pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.082464218139648, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.051995955407619476, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.942051887512207, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.10951948910951614, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.077922821044922, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.02899787947535515, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.596938371658325, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.00681950943544507, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.1463233381509781, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0006562539492733777, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.227378845214844, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0009639169438742101, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 29.913646697998047, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.03146327659487724, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.767486572265625, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.02676074579358101, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.143016815185547, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0014917904045432806, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.433774948120117, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.009250866249203682, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 12.964123725891113, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.013757641427218914, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.37610149383545, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.040997765958309174, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 
11.966365814208984, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.02159370295703411, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.9272210597991943, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.005521716084331274, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.18270361423492432, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0008046979201026261, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.02057647705078, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0017612571828067303, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.072376251220703, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.034296926110982895, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.69012451171875, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.029622385278344154, "pnorm/_forward_module.model.norm.weight": 28.418928146362305, "gnorm/_forward_module.model.norm.weight": 0.005179626867175102, "pnorm/_forward_module.lm_head.weight": 211.81265258789062, "gnorm/_forward_module.lm_head.weight": 0.05952082946896553} +{"step": 838860800, "pnorm/_forward_module.model.embeddings.weight": 137.48036193847656, "gnorm/_forward_module.model.embeddings.weight": 0.09313131868839264, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.904102325439453, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0038445978425443172, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 14.611473083496094, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.009492487646639347, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.407427787780762, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.012324853800237179, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.124113082885742, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.1301378756761551, 
"pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.093955993652344, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.12265733629465103, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0434606075286865, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.008068287745118141, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.28935256600379944, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.000808750803116709, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.20477867126465, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0019600018858909607, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.0754451751709, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.07178672403097153, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.166736602783203, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.08670590817928314, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.38014030456543, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0025244757998734713, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.88676643371582, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.009810651652514935, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.760666847229004, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.014785516075789928, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.45954418182373, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.09047947078943253, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.539627075195312, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.0755116418004036, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.2792694568634033, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.013937395997345448, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 
0.1469656080007553, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0018349724123254418, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.351612091064453, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.001438191975466907, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.42933464050293, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.05550767481327057, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.400068283081055, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.046776220202445984, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.652629852294922, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.002020950196310878, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.514494895935059, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.010147780179977417, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.151631355285645, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.015554082579910755, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.771052360534668, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.07916654646396637, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.913415908813477, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.05663143843412399, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.874633312225342, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.007942156866192818, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.18169501423835754, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0007079153438098729, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.534521102905273, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0014439038932323456, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.030336380004883, 
"gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.05308162793517113, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.800739288330078, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.04778396338224411, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.967052459716797, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0019272251520305872, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 13.401864051818848, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.008206233382225037, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 12.929579734802246, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.013249550946056843, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.095524787902832, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.07447800040245056, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.349016189575195, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.054812002927064896, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.66736102104187, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.007930290885269642, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.14861464500427246, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0008915287908166647, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.422279357910156, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.002310918876901269, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 28.05714225769043, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.06057966500520706, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.718547821044922, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.07826043665409088, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.892946243286133, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 
0.008712111972272396, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 13.828168869018555, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.029701588675379753, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.114505767822266, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.04988124221563339, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.964015007019043, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.20141983032226562, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.133099555969238, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.042315687984228134, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6013293266296387, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.008064966648817062, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.14660102128982544, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0009609289700165391, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.235692977905273, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.001311396132223308, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 29.953371047973633, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.04444218799471855, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.794160842895508, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.037128567695617676, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.163787841796875, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0023145084269344807, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.480284690856934, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.014406885951757431, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 12.99943733215332, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.020625775679945946, 
"pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.402484893798828, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.0665602907538414, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.013307571411133, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.031850580126047134, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.941514492034912, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.01993757300078869, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.18392597138881683, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.002546690870076418, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.034284591674805, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.001917787827551365, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.129621505737305, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.04730075225234032, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.72633171081543, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.03783448040485382, "pnorm/_forward_module.model.norm.weight": 28.54103660583496, "gnorm/_forward_module.model.norm.weight": 0.004493799060583115, "pnorm/_forward_module.lm_head.weight": 212.9751739501953, "gnorm/_forward_module.lm_head.weight": 0.07397575676441193} +{"step": 859832320, "pnorm/_forward_module.model.embeddings.weight": 137.78717041015625, "gnorm/_forward_module.model.embeddings.weight": 0.07790575176477432, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.898059844970703, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0028329272754490376, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 14.650580406188965, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.010270954109728336, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.440414428710938, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 
0.012349704280495644, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.111501693725586, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.10177499055862427, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.082037925720215, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.10354442149400711, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0450613498687744, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.009072750806808472, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.2918230891227722, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0017048836452886462, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.19234848022461, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0018490381771698594, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.07198715209961, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.056418292224407196, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.165573120117188, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.06694705784320831, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.373323440551758, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.002049371600151062, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.905255317687988, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.007867963053286076, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.774893760681152, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.011686655692756176, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.449787139892578, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.06882378458976746, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.530340194702148, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.06434841454029083, 
"pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.2873880863189697, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.009577876888215542, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1481987088918686, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0009587508393451571, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.35097312927246, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.001249333145096898, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.44635009765625, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.045225050300359726, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.412385940551758, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03806111589074135, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.653186798095703, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0017880358500406146, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.537456512451172, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.01044592447578907, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.16769790649414, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.01672854833304882, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.768730163574219, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.058966685086488724, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.912076950073242, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.04515902325510979, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.8876445293426514, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.012124202214181423, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.1828436702489853, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0013642574194818735, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 
22.535856246948242, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0016433449927717447, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.0516300201416, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.043468303978443146, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.8176212310791, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03748737648129463, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.97846221923828, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0016480914782732725, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 13.458708763122559, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.006852276623249054, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 12.976369857788086, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.009503553621470928, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.099708557128906, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.05512210726737976, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.356176376342773, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.042836956679821014, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.6783370971679688, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.006694842129945755, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.14942270517349243, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.000580135325435549, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.41396713256836, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.005634963978081942, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 28.0504150390625, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.07492616027593613, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.714599609375, 
"gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.06694575399160385, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.904212951660156, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.010878123342990875, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 13.863441467285156, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.08675549924373627, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.143143653869629, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.12792693078517914, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 11.983341217041016, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.12309857457876205, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.180700302124023, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.03138117119669914, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6056783199310303, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.010836937464773655, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.14692369103431702, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0019584959372878075, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.242874145507812, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0010223044082522392, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 29.989566802978516, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.03304382413625717, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.818445205688477, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.027277396991848946, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.183143615722656, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0013851848198100924, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.520170211791992, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 
0.007553393952548504, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.029318809509277, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.012782802805304527, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.428756713867188, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.043019574135541916, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.059635162353516, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.023323651403188705, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.9574320316314697, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.004945255815982819, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.18545708060264587, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.000463858712464571, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.04660415649414, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0015706942649558187, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.18092155456543, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.035254400223493576, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.758848190307617, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.02908034808933735, "pnorm/_forward_module.model.norm.weight": 28.657909393310547, "gnorm/_forward_module.model.norm.weight": 0.006206231191754341, "pnorm/_forward_module.lm_head.weight": 214.06825256347656, "gnorm/_forward_module.lm_head.weight": 0.06830868124961853} +{"step": 880803840, "pnorm/_forward_module.model.embeddings.weight": 138.0771942138672, "gnorm/_forward_module.model.embeddings.weight": 0.07754462212324142, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.890871047973633, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.002969298278912902, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 14.687477111816406, 
"gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.009073873050510883, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.47184944152832, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.010606960393488407, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.097187995910645, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.10125336796045303, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.068337440490723, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.10308533161878586, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0472774505615234, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.008020934648811817, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.29391297698020935, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0014815657632425427, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.178890228271484, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0014439505757763982, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.06619644165039, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.05621187761425972, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.162906646728516, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.06921005249023438, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.36644172668457, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0019526705145835876, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.921900749206543, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.008567255921661854, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.78735637664795, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.012029530480504036, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.440206527709961, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 
0.0717366635799408, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.521330833435059, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.0635373666882515, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.2920939922332764, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.010416864417493343, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.14897780120372772, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0010864452924579382, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.349994659423828, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0015165774384513497, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.461584091186523, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.0470227375626564, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.42327308654785, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.04030859097838402, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.653400421142578, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0024126956705003977, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.557628631591797, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.013639801181852818, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.182415962219238, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.02259022928774357, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.766500473022461, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.06299328058958054, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.91079044342041, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.04650496318936348, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.8965961933135986, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.010797441937029362, 
"pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.18371102213859558, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0008548683836124837, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.537261962890625, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.001784618361853063, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.071964263916016, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.04673202708363533, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.83412742614746, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03996175155043602, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.989818572998047, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0017111891647800803, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 13.512073516845703, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.007782041560858488, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.019919395446777, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.010800608433783054, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.104558944702148, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.06054375693202019, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.363916397094727, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.04633874073624611, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.689622640609741, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.007139412686228752, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.15011364221572876, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0007528822752647102, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.405494689941406, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.006427795626223087, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 
28.041872024536133, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.08871942013502121, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.709300994873047, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.07311833649873734, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.915834426879883, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.011619100347161293, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 13.899264335632324, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.103671595454216, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.172654151916504, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.15635330975055695, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.003082275390625, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.1267338991165161, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.229049682617188, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.033501673489809036, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.612168073654175, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.017327819019556046, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.14747312664985657, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0032758451998233795, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.250064849853516, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0010450172703713179, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.02389907836914, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.034848641604185104, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.84201431274414, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.029173744842410088, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.204444885253906, 
"gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.001582830329425633, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.562796592712402, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.00954193715006113, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.06179428100586, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.015483209863305092, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.455950736999512, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.048251863569021225, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.106938362121582, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.025236256420612335, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.9697415828704834, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.008209239691495895, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.18652121722698212, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0011435297783464193, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.058616638183594, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0017791048157960176, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.229463577270508, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.040095873177051544, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.789758682250977, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.033052071928977966, "pnorm/_forward_module.model.norm.weight": 28.77206802368164, "gnorm/_forward_module.model.norm.weight": 0.006384439300745726, "pnorm/_forward_module.lm_head.weight": 215.10980224609375, "gnorm/_forward_module.lm_head.weight": 0.06594003736972809} +{"step": 901775360, "pnorm/_forward_module.model.embeddings.weight": 138.3507537841797, "gnorm/_forward_module.model.embeddings.weight": 0.07962066680192947, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 
22.884685516357422, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.003393898718059063, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 14.722733497619629, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.008997690863907337, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.501851081848145, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.010751748457551003, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.084761619567871, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.11401937156915665, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.056323051452637, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.10906094312667847, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.050029993057251, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.007609283551573753, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.29607462882995605, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0005226320354267955, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.16652488708496, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0016849333187565207, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.061954498291016, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.06175040453672409, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.16067886352539, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.07130613178014755, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.35801124572754, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.002139550633728504, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.935014724731445, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.00838969275355339, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.797394752502441, 
"gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.014542078599333763, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.429153442382812, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.07660187035799026, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.510754585266113, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.06591671705245972, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.298287868499756, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.010088038630783558, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.14998693764209747, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0006047643255442381, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.349336624145508, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0010848947567865252, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.47783851623535, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.04561617597937584, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.43431854248047, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.039701882749795914, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.652725219726562, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.001762392115779221, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.577519416809082, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.009919407777488232, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.196560859680176, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.015906114131212234, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.762578964233398, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.06347990781068802, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.90758991241455, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 
0.0470866784453392, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.9039008617401123, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.008879845961928368, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.1842811405658722, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.000620411999989301, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.53880500793457, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0013260712148621678, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.091814041137695, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.04767102748155594, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.850297927856445, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.039937350898981094, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 22.998720169067383, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0018637333996593952, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 13.563468933105469, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.00731686782091856, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.061847686767578, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.01148783229291439, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.10639762878418, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.06562943011522293, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.368574142456055, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.04693746939301491, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.696267604827881, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.006115013733506203, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.1503981202840805, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0007711807847954333, 
"pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.39792823791504, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0022575175389647484, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 28.034936904907227, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.0568813756108284, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.705678939819336, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.06595637649297714, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.926895141601562, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0070278942584991455, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 13.933919906616211, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.035525571554899216, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.20022201538086, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.05504473298788071, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.021612167358398, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.1563316285610199, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.27539348602295, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.0368221178650856, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6145365238189697, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.007754233665764332, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.14768607914447784, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0008954785880632699, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.25664710998535, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.00115203857421875, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.055824279785156, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.038664184510707855, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.864500045776367, 
"gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.03388933837413788, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.225255966186523, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0019938009791076183, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.604026794433594, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.014227744191884995, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.091958045959473, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.022390512749552727, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.482406616210938, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.05683751031756401, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.152392387390137, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.029256543144583702, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.9851722717285156, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.011692659929394722, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.18791569769382477, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0015655255410820246, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.071069717407227, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.003198722843080759, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.278606414794922, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.05507264658808708, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.820466995239258, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.043784767389297485, "pnorm/_forward_module.model.norm.weight": 28.88334846496582, "gnorm/_forward_module.model.norm.weight": 0.00421614246442914, "pnorm/_forward_module.lm_head.weight": 216.08831787109375, "gnorm/_forward_module.lm_head.weight": 0.08744513243436813} +{"step": 922746880, 
"pnorm/_forward_module.model.embeddings.weight": 138.60733032226562, "gnorm/_forward_module.model.embeddings.weight": 0.08322753757238388, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.879640579223633, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.003111126134172082, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 14.760520935058594, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.011305867694318295, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.534171104431152, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.014675194397568703, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.073094367980957, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.11106320470571518, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.045234680175781, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.11165151000022888, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0524580478668213, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.011733824387192726, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.29827117919921875, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.001755175762809813, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.1544132232666, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.001771178562194109, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.058053970336914, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.06274783611297607, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.158641815185547, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.08033251762390137, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.350727081298828, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0021980642341077328, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.949515342712402, 
"gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.008747179061174393, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.80878734588623, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.013611843809485435, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.41847038269043, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.08018230646848679, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.500429153442383, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.07108448445796967, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3065409660339355, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.010188620537519455, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.15118350088596344, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.000924665539059788, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.348684310913086, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.001344715477898717, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.492963790893555, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.04858439788222313, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.444839477539062, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.045708067715168, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.652889251708984, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0019817634020000696, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.597678184509277, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.01216124091297388, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.210389137268066, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.01919635944068432, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.759647369384766, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 
0.06971848011016846, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.905094146728516, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.05237172171473503, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.911529064178467, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.0074843461625278, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.18490399420261383, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0007115071057341993, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.539724349975586, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.002168291946873069, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.109777450561523, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.05195530876517296, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.86505699157715, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.0480070561170578, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.010297775268555, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0021961042657494545, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 13.618274688720703, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.0077175572514534, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.107499122619629, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.011345582082867622, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.110452651977539, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.07448416948318481, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.375205039978027, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.05388481914997101, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.7069060802459717, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.006850524339824915, 
"pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.1512531340122223, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0008338516345247626, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.388946533203125, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.010369345545768738, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 28.02493667602539, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.13141953945159912, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.70037269592285, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.10383348166942596, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.93909454345703, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.018692364916205406, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 13.96845817565918, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.16226662695407867, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.2284574508667, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.24402998387813568, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.039422988891602, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.15160495042800903, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.319926261901855, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.040018342435359955, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6171517372131348, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.025252051651477814, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.14780670404434204, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.004819393157958984, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.26189613342285, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0013223597779870033, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.084327697753906, 
"gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.04251554235816002, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.884870529174805, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.038249291479587555, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.243213653564453, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0028273393400013447, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.641621589660645, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.02764587104320526, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.118905067443848, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.040406979620456696, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.507889747619629, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.0674942135810852, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.196720123291016, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.03240145370364189, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 2.9991023540496826, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.04004433751106262, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.1890658140182495, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.005457755643874407, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.081066131591797, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0034475282300263643, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.320173263549805, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.05643065646290779, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.847640991210938, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.04473719373345375, "pnorm/_forward_module.model.norm.weight": 28.990018844604492, "gnorm/_forward_module.model.norm.weight": 0.005949206650257111, 
"pnorm/_forward_module.lm_head.weight": 217.01785278320312, "gnorm/_forward_module.lm_head.weight": 0.07953161001205444} +{"step": 943718400, "pnorm/_forward_module.model.embeddings.weight": 138.84921264648438, "gnorm/_forward_module.model.embeddings.weight": 0.07624714821577072, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.875213623046875, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0031460104510188103, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 14.798386573791504, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.010196392424404621, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.56687068939209, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.012099139392375946, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.061956405639648, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.10841397196054459, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.034480094909668, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.10437111556529999, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0534472465515137, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.00916266068816185, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3003646731376648, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0017530412878841162, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.143449783325195, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0020874300971627235, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.054874420166016, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.06170782074332237, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.15726089477539, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.07001261413097382, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.34522819519043, 
"gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0020450351294130087, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.967585563659668, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.008602139540016651, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.82288646697998, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.012607133015990257, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.409636497497559, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.07303415238857269, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.491878509521484, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.06264852732419968, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3118090629577637, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.011371572501957417, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1521574854850769, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0009651107247918844, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.34918975830078, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0011919132666662335, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.508615493774414, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.044933196157217026, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.455951690673828, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03935163468122482, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.654170989990234, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0016324358293786645, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.618616104125977, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.00976874865591526, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.225708961486816, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 
0.013661352917551994, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.757463455200195, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.06335407495498657, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.903708457946777, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.045463770627975464, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.920682191848755, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.007150215562433004, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.1858544498682022, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0006556467269547284, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.54138946533203, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0016074825543910265, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.12820816040039, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.0451648011803627, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.880126953125, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03980864956974983, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.020584106445312, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0017151037463918328, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 13.665728569030762, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.007428342010825872, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.147782325744629, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.011506403796374798, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.114429473876953, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.060709718614816666, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.38184642791748, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.04445919021964073, 
"pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.7189066410064697, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.006058309692889452, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.15211966633796692, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0007163059781305492, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.382266998291016, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.005861154291778803, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 28.019508361816406, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.08036880195140839, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.69823455810547, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.07608520984649658, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.948331832885742, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.010756433941423893, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.000246047973633, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.09124147891998291, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.254637718200684, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.13794298470020294, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.055002212524414, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.14676719903945923, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.356669425964355, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.03307241573929787, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6211512088775635, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.014119943603873253, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.14827148616313934, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0025893948040902615, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 
23.266157150268555, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0012068506330251694, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.1107234954834, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.03442693501710892, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.903644561767578, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.028321463614702225, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.261348724365234, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0016691071214154363, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.680490493774414, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.009269801899790764, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.147109031677246, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.013604335486888885, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.53183650970459, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.04353237897157669, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.2379789352417, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.024037575349211693, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.0096969604492188, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.01194051280617714, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.18992476165294647, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0013074303278699517, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.091339111328125, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0017243754118680954, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.36115074157715, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.0374210923910141, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.874229431152344, 
"gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.031261712312698364, "pnorm/_forward_module.model.norm.weight": 29.09408187866211, "gnorm/_forward_module.model.norm.weight": 0.005774033721536398, "pnorm/_forward_module.lm_head.weight": 217.89305114746094, "gnorm/_forward_module.lm_head.weight": 0.05785857141017914} +{"step": 964689920, "pnorm/_forward_module.model.embeddings.weight": 139.07591247558594, "gnorm/_forward_module.model.embeddings.weight": 0.07028784602880478, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.87076187133789, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0027845825534313917, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 14.834098815917969, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.007678710389882326, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.597390174865723, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.00864249374717474, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.051107406616211, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.09591658413410187, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.023979187011719, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.09282176941633224, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0551347732543945, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.00551924854516983, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.30247342586517334, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0006491534877568483, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.133930206298828, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0016389281954616308, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.05354881286621, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.05343392863869667, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 
19.156158447265625, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.06181265786290169, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.33982276916504, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0015886547043919563, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.98274040222168, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.007887713611125946, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.83408260345459, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.010775907896459103, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.401453971862793, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.06305572390556335, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.483895301818848, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.05757890269160271, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.319146156311035, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.008875452913343906, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.15314261615276337, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0006941997562535107, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.34961700439453, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0010493416339159012, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.52386474609375, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.041883330792188644, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.466291427612305, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03540477529168129, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.655302047729492, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0014208590146154165, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.639963150024414, 
"gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.008231588639318943, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.240742683410645, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.012410277500748634, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.754780769348145, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.05363842099905014, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.901590347290039, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.04165550321340561, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.9327147006988525, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.008343932218849659, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.1869235634803772, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.000678669661283493, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.542905807495117, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0012342343106865883, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.145462036132812, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.041036248207092285, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.894533157348633, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.035484977066516876, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.030536651611328, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0014286070363596082, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 13.711682319641113, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.006647253874689341, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.186254501342773, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.009806735441088676, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.11785888671875, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 
0.05166175216436386, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.388224601745605, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.03994278982281685, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.7296149730682373, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.005646468605846167, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.15282922983169556, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0004459668416529894, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.37553596496582, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.002794423373416066, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 28.013690948486328, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.05249432101845741, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.69529151916504, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.06250771135091782, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.958187103271484, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.00719974422827363, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.03244686126709, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.0464463047683239, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.281266212463379, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.07107429206371307, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.069811820983887, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.13490496575832367, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.392578125, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.030409960076212883, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6252400875091553, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.007678812835365534, 
"pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.1485067456960678, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0011455094208940864, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.27158546447754, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0010899900225922465, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.1375789642334, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.031992290169000626, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.92276382446289, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.027345510199666023, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.278623580932617, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0014427416026592255, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.714934349060059, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.008447523228824139, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.172462463378906, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.014718485064804554, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.554821968078613, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.041622892022132874, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.277848243713379, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.022308126091957092, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.0209076404571533, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.007975267246365547, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.1910637617111206, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0011165319010615349, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.10101890563965, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0014861759264022112, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 
30.401168823242188, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.033463478088378906, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.899372100830078, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.02892688848078251, "pnorm/_forward_module.model.norm.weight": 29.194299697875977, "gnorm/_forward_module.model.norm.weight": 0.0040795705281198025, "pnorm/_forward_module.lm_head.weight": 218.72731018066406, "gnorm/_forward_module.lm_head.weight": 0.05521472543478012} +{"step": 985661440, "pnorm/_forward_module.model.embeddings.weight": 139.2886199951172, "gnorm/_forward_module.model.embeddings.weight": 0.07668391615152359, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.865022659301758, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0032003733795136213, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 14.865357398986816, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.008960888721048832, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.624307632446289, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.010302674025297165, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.038922309875488, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.11064145714044571, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 11.012149810791016, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.10355636477470398, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0570433139801025, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.007219790946692228, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.30455416440963745, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0013168640434741974, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.123552322387695, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0015921760350465775, 
"pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.0497989654541, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.05815395340323448, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.154268264770508, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.06552004814147949, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.333303451538086, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0020972895435988903, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 11.994985580444336, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.00859068799763918, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.843391418457031, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.013676229864358902, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.391966819763184, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.07682494819164276, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.474723815917969, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.06329452246427536, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3264646530151367, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.010651466436684132, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.15409380197525024, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.001258069067262113, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.350027084350586, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0013605802087113261, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.53759002685547, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.04636695235967636, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.475919723510742, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03751422092318535, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 
22.656238555908203, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0021541633177548647, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.659721374511719, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.012756227515637875, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.254344940185547, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.020817425101995468, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.752202033996582, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.06748413294553757, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.899840354919434, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.04530300945043564, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.9401068687438965, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.008123235777020454, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.18742074072360992, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0006395771633833647, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.545907974243164, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.001320989802479744, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.16341781616211, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.04423747956752777, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.909122467041016, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03530231490731239, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.040157318115234, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0016309478087350726, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 13.75893497467041, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.006875166669487953, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.22574234008789, 
"gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.010927427560091019, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.120126724243164, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.05977539345622063, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.393068313598633, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.04201560840010643, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.7401058673858643, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.006810539402067661, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.15362486243247986, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0007151199970394373, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.36897087097168, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.004183096811175346, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 28.007558822631836, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.06222745403647423, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.69244956970215, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.059867363423109055, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.96744728088379, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.008287934586405754, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.061335563659668, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.06021680682897568, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.30449104309082, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.08536843955516815, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.083989143371582, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.12364578992128372, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.428215026855469, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 
0.03195783123373985, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6288344860076904, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.010638413019478321, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.14880721271038055, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0018912700470536947, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.27546501159668, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0009797454113140702, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.160137176513672, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.03304336220026016, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.939476013183594, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.028942806646227837, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.298486709594727, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0014759199693799019, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.75019645690918, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.008108648471534252, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.197945594787598, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.013084894977509975, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.579996109008789, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.04238774627447128, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.321454048156738, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.02330756187438965, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.0293281078338623, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.010195871815085411, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.19167649745941162, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0014712996780872345, 
"pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.111074447631836, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0012838125694543123, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.439342498779297, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.03278152644634247, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.92447280883789, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.029324201866984367, "pnorm/_forward_module.model.norm.weight": 29.293825149536133, "gnorm/_forward_module.model.norm.weight": 0.005233574192970991, "pnorm/_forward_module.lm_head.weight": 219.51307678222656, "gnorm/_forward_module.lm_head.weight": 0.05025411397218704} +{"step": 1006632960, "pnorm/_forward_module.model.embeddings.weight": 139.487548828125, "gnorm/_forward_module.model.embeddings.weight": 0.08202654123306274, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.858976364135742, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.003164871595799923, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 14.895506858825684, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.007995068095624447, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.65054988861084, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.009360807947814465, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.026385307312012, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.11553651839494705, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.999734878540039, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.11148514598608017, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0582196712493896, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.0063555436208844185, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3063834607601166, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 
0.0006627185503020883, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.11319923400879, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.001705798669718206, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.045583724975586, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.06261274963617325, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.152156829833984, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.08009245246648788, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.327375411987305, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0022847603540867567, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.007720947265625, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.008796798065304756, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.852749824523926, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.012606353498995304, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.382744789123535, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.08068276941776276, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.465686798095703, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.06942670047283173, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3324317932128906, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.011321838945150375, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1550237536430359, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0008850694284774363, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.35182762145996, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0014828506391495466, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.55220603942871, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.05175565928220749, 
"pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.48614501953125, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.045449331402778625, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.656688690185547, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0023735351860523224, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.677657127380371, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.016884658485651016, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.267401695251465, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.02776111476123333, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.74897575378418, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.0697375237941742, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.897050857543945, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.04853399097919464, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.9460785388946533, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.009139111265540123, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.18808352947235107, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0012117435690015554, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.54708480834961, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.001501772552728653, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.17810821533203, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.05009433254599571, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.921483993530273, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.045857399702072144, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.04958152770996, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0017990035703405738, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 
13.804372787475586, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.008582771755754948, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.264162063598633, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.012497692368924618, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.12248706817627, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.06616392731666565, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.398228645324707, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.04907340556383133, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.7481887340545654, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.009863453917205334, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.15420614182949066, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.00133492739405483, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.361860275268555, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0065680802799761295, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.998058319091797, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.09098481386899948, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.687936782836914, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.09313787519931793, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.976865768432617, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.012751942500472069, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.089095115661621, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.09690000861883163, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.326981544494629, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.1498832106590271, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.09833812713623, 
"gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.17308129370212555, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.464064598083496, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.037693511694669724, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6324639320373535, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.015360482037067413, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.1491069346666336, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0025777288246899843, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.2786865234375, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0011260703904554248, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.180788040161133, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.03884003311395645, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.955284118652344, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.03283848240971565, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.318437576293945, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.001918974332511425, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.784006118774414, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.017630575224757195, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.223045349121094, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.02662418596446514, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.605544090270996, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.058288127183914185, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.365303039550781, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.029117580503225327, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.0392916202545166, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 
0.027073437348008156, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.19249887764453888, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0035678015556186438, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.12173843383789, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0031394329853355885, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.477251052856445, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.05264545977115631, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.949861526489258, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.044774238020181656, "pnorm/_forward_module.model.norm.weight": 29.391998291015625, "gnorm/_forward_module.model.norm.weight": 0.00413890415802598, "pnorm/_forward_module.lm_head.weight": 220.2489471435547, "gnorm/_forward_module.lm_head.weight": 0.0693630799651146} +{"step": 1027604480, "pnorm/_forward_module.model.embeddings.weight": 139.6724090576172, "gnorm/_forward_module.model.embeddings.weight": 0.06637056171894073, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.85521125793457, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0026198329869657755, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 14.928074836730957, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.00786060094833374, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.678533554077148, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.009697912260890007, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.016483306884766, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.08897929638624191, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.99013614654541, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.08761727064847946, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0597028732299805, 
"gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.006456926930695772, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.30845749378204346, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0006942301406525075, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.104955673217773, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.001432037097401917, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.044157028198242, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.0469973161816597, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.151172637939453, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.05353163182735443, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.322050094604492, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0016109725693240762, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.022160530090332, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.008073038421571255, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.863899230957031, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.010826574638485909, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.374053955078125, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.061457615345716476, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.457193374633789, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.05669055134057999, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.336965322494507, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.009808522649109364, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1557633876800537, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.000804198207333684, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.35203742980957, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 
0.0010569911682978272, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.563823699951172, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.04054016247391701, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.494369506835938, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03257935121655464, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.657363891601562, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0016886218218132854, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.69518756866455, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.011359715834259987, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.280423164367676, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.01869390346109867, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.746081352233887, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.052904535084962845, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.894645690917969, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.04003902152180672, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.9523768424987793, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.008204164914786816, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.18876434862613678, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0009876098483800888, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.549205780029297, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.001096532680094242, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.19305419921875, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.03915577009320259, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.93367576599121, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.030995361506938934, 
"pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.05843162536621, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0013426964869722724, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 13.846090316772461, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.006014530546963215, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.299057006835938, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.009035948663949966, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.124900817871094, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.04838737100362778, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.403033256530762, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.03747650608420372, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.756225109100342, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.006279331166297197, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.154885396361351, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0007857793243601918, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.3558349609375, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.002433867659419775, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.99136734008789, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.04852188006043434, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.685016632080078, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.0504339262843132, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.985698699951172, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0053016506135463715, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.116652488708496, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.040092941373586655, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.348920822143555, 
"gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.05807730183005333, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.110468864440918, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.10508173704147339, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.49388313293457, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.028480324894189835, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6360809803009033, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.006032820791006088, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.14939263463020325, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0005252675036899745, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.281238555908203, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0009190381970256567, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.199501037597656, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.030934356153011322, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.96921730041504, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.026251111179590225, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.335908889770508, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0012471135705709457, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.814128875732422, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.008840913884341717, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.244834899902344, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.014404053799808025, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.629244804382324, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.0355646014213562, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.40571403503418, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 
0.020740680396556854, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.0484771728515625, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.015096310526132584, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.19318434596061707, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0019023859640583396, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.129314422607422, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0013083693338558078, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.5081787109375, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.03108375333249569, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.971071243286133, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.028177892789244652, "pnorm/_forward_module.model.norm.weight": 29.48297882080078, "gnorm/_forward_module.model.norm.weight": 0.0033662368077784777, "pnorm/_forward_module.lm_head.weight": 220.94615173339844, "gnorm/_forward_module.lm_head.weight": 0.049628376960754395} +{"step": 1048576000, "pnorm/_forward_module.model.embeddings.weight": 139.84425354003906, "gnorm/_forward_module.model.embeddings.weight": 0.06924313306808472, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.849884033203125, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.002932822797447443, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 14.957304000854492, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.008671816438436508, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.704094886779785, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.010107116773724556, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 11.005009651184082, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.09690851718187332, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.979009628295898, 
"gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.09618823230266571, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0617151260375977, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.008423103019595146, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.31050339341163635, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0014706659130752087, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.095823287963867, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0016583887627348304, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.04048728942871, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.05127527937293053, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.14910888671875, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.0640282928943634, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.316686630249023, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0018589666578918695, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.034772872924805, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.007925018668174744, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.873258590698242, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.010894826613366604, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.365485191345215, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.06369946897029877, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.448673248291016, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.06030477210879326, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.342480182647705, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.009126916527748108, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.15668606758117676, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 
0.0006818855181336403, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.353500366210938, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0012267276179045439, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.57660484313965, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.04333744943141937, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.50357437133789, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03876384720206261, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.65894889831543, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0016904508229345083, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.712225914001465, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.010549033991992474, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.292105674743652, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.016025898978114128, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.74414348602295, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.05743277072906494, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.893017768859863, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.0441085621714592, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.959207773208618, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.008659404702484608, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.1893964409828186, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0009225074900314212, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.551597595214844, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.001777914701960981, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.207677841186523, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.045367415994405746, 
"pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.946067810058594, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.040166568011045456, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.067049026489258, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0017212328966706991, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 13.887012481689453, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.007608877960592508, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.333354949951172, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.010842240415513515, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.127347946166992, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.0594380646944046, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.40748405456543, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.045160941779613495, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.7619638442993164, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.006605206523090601, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.15545381605625153, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0006917466525919735, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.350088119506836, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.008929718285799026, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.98515510559082, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.10872479528188705, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.682205200195312, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.087520070374012, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.993759155273438, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.015413915738463402, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 
14.144143104553223, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.136297345161438, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.37141227722168, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.20360393822193146, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.122781753540039, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.09911473840475082, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.524133682250977, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.031149419024586678, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.639110565185547, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.018066557124257088, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.14967048168182373, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0034967479296028614, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.283578872680664, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0011935734655708075, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.21709442138672, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.03468415513634682, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.982534408569336, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.02825179696083069, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.35418701171875, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0015224748058244586, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.84285831451416, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.010665920563042164, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.265986442565918, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.017186380922794342, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.653043746948242, 
"gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.04091382399201393, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.446565628051758, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.02193254977464676, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.054612159729004, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.0109529634937644, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.19377201795578003, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0014103680150583386, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.137601852416992, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0016478314064443111, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.53940200805664, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.035671498626470566, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 20.992122650146484, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.03232651576399803, "pnorm/_forward_module.model.norm.weight": 29.573715209960938, "gnorm/_forward_module.model.norm.weight": 0.005862156394869089, "pnorm/_forward_module.lm_head.weight": 221.61390686035156, "gnorm/_forward_module.lm_head.weight": 0.06280854344367981} +{"step": 1069547520, "pnorm/_forward_module.model.embeddings.weight": 140.00381469726562, "gnorm/_forward_module.model.embeddings.weight": 0.059206489473581314, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.84528350830078, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0021452675573527813, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 14.986587524414062, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.007748179137706757, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.729864120483398, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.008741500787436962, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 
10.994086265563965, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.07807821780443192, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.968427658081055, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.07737798243761063, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.063023090362549, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.005542707163840532, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.31211745738983154, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0006506852223537862, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.08713722229004, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0011353417066857219, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.037187576293945, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.042963698506355286, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.14740562438965, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.04417276009917259, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.31197166442871, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.001315577421337366, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.047471046447754, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.006785301491618156, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.883245468139648, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.009517776779830456, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.357666969299316, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.05220552533864975, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.441051483154297, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.04720500856637955, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3459794521331787, 
"gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.0085243945941329, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.15711236000061035, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0010079871863126755, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.35409927368164, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0009226043475791812, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.587289810180664, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.03480108082294464, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.511329650878906, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.02752545289695263, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.65998077392578, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0012771766632795334, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.729515075683594, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.007674704305827618, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.304014205932617, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.011065463535487652, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.741445541381836, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.043221235275268555, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.890974044799805, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.033897269517183304, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.9626033306121826, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.006020127795636654, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.1895192563533783, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0003803552535828203, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.553688049316406, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 
0.0009518302977085114, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.221162796020508, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.03357497602701187, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.957557678222656, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.027228349819779396, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.07465934753418, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0011154384119436145, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 13.923624992370605, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.005617361515760422, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.364293098449707, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.008059922605752945, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.129302024841309, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.03908143937587738, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.411881446838379, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.03187764063477516, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.7677037715911865, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.004308291245251894, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.15588678419589996, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0002910517214331776, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.345340728759766, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0023590964265167713, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.979209899902344, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.04427013546228409, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.679227828979492, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.04347294569015503, 
"pnorm/_forward_module.model.layers.4.attn_norm.weight": 23.999900817871094, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.00519893504679203, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.167470932006836, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.040054526180028915, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.390833854675293, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.05669739469885826, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.134101867675781, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.08577784150838852, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.551108360290527, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.025900116190314293, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6422805786132812, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.006039837840944529, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.14992286264896393, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0007434178842231631, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.28639030456543, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0008732025744393468, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.234466552734375, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.028394581750035286, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 20.995880126953125, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.025867855176329613, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.372745513916016, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0014141695573925972, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.872519493103027, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.013680456206202507, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 
13.287510871887207, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.022973693907260895, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.67602252960205, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.03291356563568115, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.486618995666504, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.019865863025188446, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.061497211456299, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.026225514709949493, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.19451096653938293, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.003476930083706975, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.146425247192383, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0013448139652609825, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.5703182220459, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.030642878264188766, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.013200759887695, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.026525191962718964, "pnorm/_forward_module.model.norm.weight": 29.662708282470703, "gnorm/_forward_module.model.norm.weight": 0.0033500581048429012, "pnorm/_forward_module.lm_head.weight": 222.24252319335938, "gnorm/_forward_module.lm_head.weight": 0.05115242302417755} +{"step": 1090519040, "pnorm/_forward_module.model.embeddings.weight": 140.15133666992188, "gnorm/_forward_module.model.embeddings.weight": 0.07922381907701492, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.841238021850586, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0031614152248948812, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.015802383422852, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.008203941397368908, 
"pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.755172729492188, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.010147161781787872, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.983978271484375, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.11816585063934326, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.958621978759766, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.11099910736083984, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.064547300338745, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.006918351165950298, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3139130175113678, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0010512734297662973, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.078943252563477, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0017766974633559585, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.034521102905273, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.06385936588048935, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.145845413208008, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.07658208906650543, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.306766510009766, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0019541517831385136, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.059471130371094, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.008407048881053925, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.892735481262207, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.011816577985882759, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.349262237548828, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.08215217292308807, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 
10.433116912841797, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.06828607618808746, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.349006414413452, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.009299427270889282, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.15771782398223877, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0007032426074147224, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.35382652282715, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0014264181954786181, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.596126556396484, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.05161890387535095, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.51765251159668, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.04407934844493866, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.660720825195312, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.002187730511650443, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.743623733520508, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.012376836501061916, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.313630104064941, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.017311103641986847, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.738736152648926, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.07231032848358154, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.888677597045898, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.04724093899130821, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.966099262237549, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.01121920719742775, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.18980522453784943, 
"gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0009647986735217273, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.55464744567871, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0016695949016138911, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.23238754272461, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.04974454641342163, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.96778678894043, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.04319551959633827, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.082637786865234, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0020019873045384884, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 13.95877742767334, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.008429250679910183, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.394302368164062, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.012955213896930218, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.131775856018066, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.06877273321151733, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.416582107543945, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.04573037475347519, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.7740440368652344, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.008134991861879826, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.1565076857805252, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0007290109060704708, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.339275360107422, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.006424775812774897, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.970130920410156, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 
0.08891242742538452, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.67437171936035, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.08755398541688919, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.007587432861328, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.011972494423389435, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.19236946105957, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.10034631937742233, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.41141128540039, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.15714915096759796, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.146242141723633, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.1556464433670044, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.580925941467285, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.03478928655385971, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6434123516082764, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.014257240109145641, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15001176297664642, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.002584763802587986, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.288318634033203, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0011585642350837588, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.249711990356445, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.03543228656053543, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.008056640625, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.029567334800958633, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.39065170288086, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0014812819426879287, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 
13.901124954223633, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.007057493552565575, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.308096885681152, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.011616470292210579, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.69852352142334, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.04603620991110802, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.524603843688965, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.025378182530403137, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.069857597351074, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.004654102958738804, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.1951240599155426, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0005629057995975018, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.15358543395996, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0018903726013377309, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.597644805908203, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.03797883167862892, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.031841278076172, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.03261088207364082, "pnorm/_forward_module.model.norm.weight": 29.747772216796875, "gnorm/_forward_module.model.norm.weight": 0.00526442751288414, "pnorm/_forward_module.lm_head.weight": 222.83094787597656, "gnorm/_forward_module.lm_head.weight": 0.05147223919630051} +{"step": 1111490560, "pnorm/_forward_module.model.embeddings.weight": 140.2875213623047, "gnorm/_forward_module.model.embeddings.weight": 0.07013580948114395, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.837039947509766, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0025237819645553827, 
"pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.044673919677734, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.008316789753735065, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.780292510986328, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.009410723112523556, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.973408699035645, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.09586941450834274, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.948168754577637, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.09560512006282806, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0659072399139404, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.006793656852096319, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.31534314155578613, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.001306528109125793, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.071001052856445, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0017664078623056412, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.031444549560547, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.05343272164463997, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.143573760986328, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.06548408418893814, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.302040100097656, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0017919084057211876, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.071805000305176, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.009193847887217999, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.90208911895752, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.011363324709236622, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 
10.341365814208984, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.06560688465833664, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.42529296875, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.06156325712800026, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3528308868408203, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.011203402653336525, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.15824754536151886, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0013600373640656471, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.35552978515625, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0012606850359588861, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.60682487487793, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.046089235693216324, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.525503158569336, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.04002489894628525, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.66216278076172, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0024269616696983576, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.759711265563965, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.01789938658475876, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.325387001037598, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.02803194336593151, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.735997200012207, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.06130155920982361, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.886418342590332, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.0449640154838562, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.973254919052124, 
"gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.009551696479320526, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19050060212612152, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0010942243970930576, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.556612014770508, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0015252988087013364, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.244375228881836, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.04529449716210365, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.978147506713867, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.04016077518463135, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.09061050415039, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0017103116260841489, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 13.994242668151855, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.007897221483290195, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.42388916015625, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.01092718355357647, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.13410758972168, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.05822041258215904, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.420636177062988, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.04463972896337509, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.7801096439361572, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.008207290433347225, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.15727290511131287, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0011127183679491282, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.33422088623047, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 
0.008049838244915009, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.96333885192871, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.1026243343949318, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.671886444091797, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.08511684089899063, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.015247344970703, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.013341660611331463, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.215439796447754, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.1259041130542755, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.429594993591309, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.1878938525915146, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.156519889831543, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.11099589616060257, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.6065034866333, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.03015989065170288, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6455416679382324, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.015306876040995121, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15014703571796417, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.002756267786026001, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.2904109954834, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0011002789251506329, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.264053344726562, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.03286460414528847, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.01937484741211, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.028024185448884964, 
"pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.40725326538086, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.001493872026912868, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.926856994628906, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.01259611640125513, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.327155113220215, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.020150810480117798, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.720136642456055, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.04059341922402382, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.560823440551758, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.022278236225247383, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.0772178173065186, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.019622892141342163, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.19582346081733704, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0025265810545533895, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.16101837158203, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0019724464509636164, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.624771118164062, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.03588184714317322, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.04960060119629, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.03108914941549301, "pnorm/_forward_module.model.norm.weight": 29.828536987304688, "gnorm/_forward_module.model.norm.weight": 0.0037915001157671213, "pnorm/_forward_module.lm_head.weight": 223.3860626220703, "gnorm/_forward_module.lm_head.weight": 0.0574253611266613} +{"step": 1132462080, "pnorm/_forward_module.model.embeddings.weight": 140.41250610351562, "gnorm/_forward_module.model.embeddings.weight": 
0.07005149126052856, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.832460403442383, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.002700255950912833, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.071003913879395, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.008405433036386967, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.803256034851074, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.011376790702342987, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.96272087097168, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.096554234623909, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.93791389465332, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.09428264945745468, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.067250967025757, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.006935805082321167, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3169975280761719, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.001108950818888843, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.062971115112305, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0014956947416067123, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.027990341186523, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.05210975930094719, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.14151954650879, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.058231186121702194, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.29786491394043, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0017995196394622326, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.085312843322754, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.007773655001074076, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 
11.912480354309082, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.010244077071547508, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.333538055419922, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.06472166627645493, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.41759967803955, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.05759293586015701, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3558666706085205, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.008852451108396053, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.15881334245204926, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0007335864356718957, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.35672378540039, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.001103862188756466, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.616374969482422, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.042299337685108185, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.532445907592773, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.035978421568870544, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.66423225402832, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.001750954077579081, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.777522087097168, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.010396602563560009, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.338258743286133, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.01565021276473999, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.733648300170898, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.057859763503074646, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.884726524353027, 
"gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.04223340377211571, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.9772961139678955, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.011175453662872314, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19068792462348938, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0010026647942140698, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.557971954345703, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0014360827626660466, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.25456428527832, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.04287739843130112, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.987411499023438, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03668570518493652, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.0987606048584, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0015515448758378625, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.028903007507324, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.006710642483085394, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.453722953796387, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.009555593132972717, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.1367826461792, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.053337499499320984, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.425265312194824, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.04066028445959091, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.7843172550201416, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.006592963822185993, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.1575927734375, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 
0.0009665488614700735, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.3295955657959, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.004914171062409878, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.956836700439453, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.0707348883152008, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.669090270996094, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.07149539887905121, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.02239990234375, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.009273354895412922, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.238860130310059, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.07497520744800568, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.448932647705078, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.11690657585859299, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.167336463928223, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.133189857006073, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.631538391113281, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.03181123360991478, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6487135887145996, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.011915693990886211, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15036359429359436, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0020220731385052204, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.292638778686523, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0009897155687212944, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.277585983276367, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.033604398369789124, 
"pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.03029441833496, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.028782380744814873, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.42422103881836, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0015594623982906342, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.952001571655273, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.01136582251638174, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.345212936401367, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.019411150366067886, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.741652488708496, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.04494774714112282, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.597099304199219, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.024063998833298683, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.085348129272461, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.009353731758892536, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.1965925097465515, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0012521554017439485, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.167526245117188, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0020771340932697058, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.64804458618164, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.041973527520895004, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.066171646118164, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.035456135869026184, "pnorm/_forward_module.model.norm.weight": 29.90931510925293, "gnorm/_forward_module.model.norm.weight": 0.003897160990163684, "pnorm/_forward_module.lm_head.weight": 223.91432189941406, "gnorm/_forward_module.lm_head.weight": 
0.0787363052368164} +{"step": 1153433600, "pnorm/_forward_module.model.embeddings.weight": 140.52688598632812, "gnorm/_forward_module.model.embeddings.weight": 0.06542657315731049, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.8276309967041, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0023350240662693977, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.095189094543457, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.007055103313177824, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.824127197265625, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.008356685750186443, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.952077865600586, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.09232486039400101, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.927433013916016, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.0894157886505127, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.068364381790161, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.00553342467173934, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3184730112552643, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0005585517501458526, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.05598258972168, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0016690699849277735, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.025575637817383, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.04894594848155975, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.13996696472168, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.05593087896704674, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.29285430908203, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0015981782926246524, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 
12.09516429901123, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.00738826859742403, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.920215606689453, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.00949106551706791, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.325562477111816, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.06250336021184921, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.409852981567383, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.054987456649541855, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.358421564102173, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.008650490082800388, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.15916350483894348, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0006118080927990377, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.35833168029785, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.001219879719428718, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.626028060913086, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.042713407427072525, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.539480209350586, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.034278854727745056, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.66520118713379, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.001681598019786179, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.791035652160645, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.010977651923894882, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.34778881072998, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.016045769676566124, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.73039722442627, 
"gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.05610594525933266, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.88193416595459, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.038847435265779495, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.983114242553711, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.009384261444211006, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.1912103295326233, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0009904255857691169, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.5594425201416, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0011901938123628497, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.264535903930664, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.03918493539094925, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 19.996305465698242, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.033191829919815063, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.106124877929688, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0013859064783900976, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.060755729675293, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.007188094779849052, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.481351852416992, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.010476584546267986, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.138773918151855, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.047670308500528336, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.429427146911621, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.03657669574022293, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.7901158332824707, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 
0.0062392158433794975, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.15800878405570984, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0007443256326951087, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.32451057434082, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.004150868859142065, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.95003318786621, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.06020720675587654, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.666358947753906, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.06184646487236023, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.02960205078125, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.007746866438537836, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.260727882385254, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.06404006481170654, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.466926574707031, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.09898947924375534, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.177450180053711, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.11513212323188782, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.656038284301758, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.02886536903679371, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6502676010131836, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.00948623102158308, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.1504761129617691, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.001400056411512196, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.29306983947754, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0009231743752025068, 
"pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.287900924682617, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.030231960117816925, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.03913688659668, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.027671104297041893, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.44093894958496, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0015845431480556726, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.975945472717285, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.014731072820723057, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.3623628616333, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.023638099431991577, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.76258373260498, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.0376235730946064, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.632291793823242, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.021287458017468452, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.0913822650909424, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.031196292489767075, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.19708634912967682, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0039021812845021486, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.174711227416992, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.001524546998552978, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.672576904296875, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.0351446159183979, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.083084106445312, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.030639000236988068, "pnorm/_forward_module.model.norm.weight": 29.987319946289062, 
"gnorm/_forward_module.model.norm.weight": 0.004556950647383928, "pnorm/_forward_module.lm_head.weight": 224.4150848388672, "gnorm/_forward_module.lm_head.weight": 0.057013604789972305} +{"step": 1174405120, "pnorm/_forward_module.model.embeddings.weight": 140.63111877441406, "gnorm/_forward_module.model.embeddings.weight": 0.06350483745336533, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.823081970214844, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0023480236995965242, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.118627548217773, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.006803302094340324, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.844606399536133, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.007823570631444454, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.9418306350708, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.08600746840238571, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.917404174804688, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.0824626088142395, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0703773498535156, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.0054673501290380955, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.31995582580566406, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0002979365235660225, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.050016403198242, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0013084125239402056, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.02397918701172, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.0466214083135128, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.138792037963867, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.054467473179101944, 
"pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.288494110107422, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0014926716685295105, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.106168746948242, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.007602308876812458, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.928791999816895, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.009807465597987175, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.318020820617676, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.056647345423698425, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.402556419372559, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.05159002169966698, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3613884449005127, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.010095944628119469, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.15968014299869537, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0003868946514558047, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.359798431396484, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0010853500571101904, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.634817123413086, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.03873531147837639, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.5457706451416, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03279959037899971, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.666156768798828, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.001767151989042759, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.804082870483398, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.010467530228197575, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 
12.357304573059082, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.017300723120570183, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.727706909179688, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.052001748234033585, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.87964153289795, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.0378662645816803, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.9844725131988525, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.007691247388720512, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19111543893814087, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0007603327976539731, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.56175994873047, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.001080596586689353, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.275146484375, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.037008874118328094, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.00554084777832, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03217816725373268, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.112802505493164, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.001278562587685883, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.09032154083252, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.005765886977314949, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.506616592407227, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.008396543562412262, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.140140533447266, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.04523066058754921, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.43287467956543, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 
0.036358170211315155, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.793832302093506, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.004778907168656588, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.15839727222919464, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0005801309016533196, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.31954002380371, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0017684909980744123, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.943429946899414, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.04153585061430931, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.663700103759766, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.058267101645469666, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.035993576049805, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.005280831828713417, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.280735969543457, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.030128300189971924, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.483148574829102, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.04254244267940521, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.186250686645508, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.12592770159244537, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.676959991455078, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.028317324817180634, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6525418758392334, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.007228670176118612, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15067847073078156, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0009971370454877615, 
"pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.295989990234375, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0009439450222998857, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.300559997558594, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.03108772076666355, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.049409866333008, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.026877041906118393, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.456787109375, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0013797342544421554, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 13.998513221740723, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.012155037373304367, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.378240585327148, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.01951027289032936, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.782970428466797, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.03889621049165726, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.666496276855469, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.021720662713050842, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.096707344055176, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.018112648278474808, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.19759953022003174, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.002550828969106078, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.18120765686035, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0013745242031291127, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.695024490356445, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.03351172059774399, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 
21.098674774169922, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.028699547052383423, "pnorm/_forward_module.model.norm.weight": 30.06251335144043, "gnorm/_forward_module.model.norm.weight": 0.002978721633553505, "pnorm/_forward_module.lm_head.weight": 224.88365173339844, "gnorm/_forward_module.lm_head.weight": 0.05117204412817955} +{"step": 1195376640, "pnorm/_forward_module.model.embeddings.weight": 140.725830078125, "gnorm/_forward_module.model.embeddings.weight": 0.06494330614805222, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.819473266601562, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.002459406852722168, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.14330005645752, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.0069696721620857716, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.866477966308594, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.0077680968679487705, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.932450294494629, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.0911393016576767, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.908172607421875, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.0884045660495758, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0714809894561768, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.005528942681849003, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.32155388593673706, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0005000746459700167, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.043289184570312, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0013527829432860017, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.02115249633789, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.05034896358847618, 
"pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.13680648803711, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.05612659826874733, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.283655166625977, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.001525079132989049, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.115068435668945, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.00812515802681446, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.935296058654785, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.010698404163122177, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.310400009155273, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.061073243618011475, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.394925117492676, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.0538199208676815, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.365161895751953, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.009302590973675251, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16012662649154663, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0005163921159692109, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.360118865966797, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0011881417594850063, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.64143943786621, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.041416559368371964, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.55055046081543, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03439434990286827, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.66729164123535, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0017167292535305023, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 
12.81755542755127, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.010764461010694504, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.367280006408691, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.015863537788391113, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.725038528442383, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.05638404190540314, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.877326011657715, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.039018187671899796, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.9865124225616455, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.006826121360063553, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19134050607681274, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.00038134498754516244, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.563236236572266, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.001210696529597044, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.283723831176758, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.038872286677360535, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.013227462768555, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.033985208719968796, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.120872497558594, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.001242728321813047, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.12183952331543, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.006306661292910576, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.53380298614502, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.009291821159422398, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.142870903015137, 
"gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.047643691301345825, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.437219619750977, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.03793969377875328, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.7968833446502686, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.006391364615410566, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.15865860879421234, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0008067268645390868, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.314659118652344, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0034443815238773823, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.936071395874023, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.05358745902776718, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.660324096679688, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.06263459473848343, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.042970657348633, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.006755499634891748, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.301055908203125, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.05289757251739502, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.499591827392578, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.07540543377399445, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.194709777832031, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.12856218218803406, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.697431564331055, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.02826029248535633, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.655552864074707, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 
0.008208603598177433, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15095531940460205, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0013492002617567778, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.297107696533203, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0009376388043165207, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.309925079345703, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.030241655185818672, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.05777931213379, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.025929324328899384, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.47197151184082, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0012892205268144608, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.020493507385254, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.009652532637119293, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.393706321716309, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.015605742111802101, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.803006172180176, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.0360952690243721, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.699627876281738, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.01996314711868763, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.1017580032348633, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.013842535205185413, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.1981411874294281, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0017976779490709305, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.187320709228516, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0013340591685846448, 
"pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.71595001220703, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.03081437386572361, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.112850189208984, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.028515635058283806, "pnorm/_forward_module.model.norm.weight": 30.13469886779785, "gnorm/_forward_module.model.norm.weight": 0.00422003073617816, "pnorm/_forward_module.lm_head.weight": 225.32162475585938, "gnorm/_forward_module.lm_head.weight": 0.04920729994773865} +{"step": 1216348160, "pnorm/_forward_module.model.embeddings.weight": 140.8114471435547, "gnorm/_forward_module.model.embeddings.weight": 0.0626826286315918, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.815256118774414, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0023456683848053217, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.165800094604492, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.00826957169920206, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.886083602905273, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.009887191466987133, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.922578811645508, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.08878620713949203, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.89871597290039, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.08449744433164597, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0732061862945557, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.007411227561533451, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.32296451926231384, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0013828517403453588, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.03859519958496, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 
0.0012895704712718725, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.020580291748047, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.04802636802196503, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.136234283447266, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.058734022080898285, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.278989791870117, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0017537750536575913, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.123615264892578, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.007729469332844019, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.941476821899414, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.009528870694339275, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.302790641784668, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.06211109831929207, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.387563705444336, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.05310742184519768, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.367689609527588, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.008299448527395725, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16065849363803864, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.000878406222909689, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.362092971801758, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0011105082230642438, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.649892807006836, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.04063405841588974, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.556821823120117, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03552047163248062, 
"pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.668930053710938, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0016897948225960135, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.831611633300781, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.009376207366585732, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.377381324768066, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.013738220557570457, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.722373962402344, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.05798395350575447, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.875232696533203, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.037252478301525116, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.9894704818725586, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.007744812406599522, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19165392220020294, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0006725696730427444, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.56380844116211, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0015269556315615773, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.290443420410156, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.03905156999826431, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.019821166992188, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03532673791050911, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.127216339111328, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.001415669801644981, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.149659156799316, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.005935473833233118, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 
13.557770729064941, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.009271674789488316, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.143608093261719, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.050097595900297165, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.439607620239258, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.03682173416018486, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.8017823696136475, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.005373160354793072, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.15917734801769257, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0005838232464157045, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.310226440429688, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.005881119053810835, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.929418563842773, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.07551079988479614, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.657411575317383, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.07536865770816803, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.048992156982422, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.011006909422576427, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.320734977722168, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.0923554077744484, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.515235900878906, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.1441965103149414, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.203204154968262, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.11370545625686646, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.717062950134277, 
"gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.026641616597771645, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6572399139404297, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.011420720256865025, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.1510688066482544, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.002109326422214508, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.298248291015625, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0010590213350951672, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.31850242614746, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.02893798239529133, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.06535530090332, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.023651868104934692, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.48729705810547, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0011939649702981114, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.041974067687988, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.00638969661667943, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.409256935119629, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.010127800516784191, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.822553634643555, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.03357662260532379, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.732038497924805, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.01874340884387493, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.107903242111206, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.007491940166801214, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.1986711323261261, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 
0.0009266718989238143, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.193706512451172, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0012013771338388324, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.736635208129883, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.028044624254107475, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.12766456604004, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.025445064529776573, "pnorm/_forward_module.model.norm.weight": 30.205249786376953, "gnorm/_forward_module.model.norm.weight": 0.00436502555385232, "pnorm/_forward_module.lm_head.weight": 225.7384490966797, "gnorm/_forward_module.lm_head.weight": 0.04454909265041351} +{"step": 1237319680, "pnorm/_forward_module.model.embeddings.weight": 140.88856506347656, "gnorm/_forward_module.model.embeddings.weight": 0.0634930357336998, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.811906814575195, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.002276252256706357, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.187747955322266, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.008500587195158005, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.905606269836426, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.009774391539394855, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.913702011108398, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.08863546699285507, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.889825820922852, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.08559013903141022, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.073880910873413, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.006835530046373606, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.32437756657600403, 
"gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0018297643400728703, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.032623291015625, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0014122406719252467, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.01785659790039, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.048472434282302856, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.134231567382812, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.05845370143651962, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.27420997619629, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0018561139004305005, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.131107330322266, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.007951578125357628, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.947081565856934, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.01031598262488842, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.295242309570312, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.06294641643762589, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.379984855651855, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.05420217290520668, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3691065311431885, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.010948838666081429, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1609305888414383, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0010666393209248781, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.36431312561035, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0010320116998627782, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.658199310302734, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 
0.040178991854190826, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.562763214111328, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.036282509565353394, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.670513153076172, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0018190190894529223, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.845538139343262, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.01257126871496439, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.387794494628906, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.019655248150229454, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.719339370727539, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.05515744164586067, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.872395515441895, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.039537880569696426, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.992725133895874, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.007054134272038937, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19196820259094238, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.00070453982334584, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.56476593017578, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0014397967606782913, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.297433853149414, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.04111970588564873, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.026233673095703, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03698442503809929, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.133636474609375, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0014907352160662413, 
"pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.17672061920166, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.005961145740002394, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.581433296203613, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.009039917029440403, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.144224166870117, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.054937947541475296, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.441853523254395, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.040717776864767075, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.8075358867645264, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.005290582776069641, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.1596459001302719, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0006201759679242969, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.30588150024414, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.006508999038487673, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.922176361083984, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.08556639403104782, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.6541748046875, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.08161702752113342, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.054922103881836, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.011806984432041645, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.339011192321777, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.10433805733919144, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.530226707458496, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.15424844622612, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.2113037109375, 
"gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.11772645264863968, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.736613273620605, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.028383223339915276, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.658351182937622, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.01260099932551384, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15111024677753448, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.002273100893944502, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.298852920532227, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0009777629747986794, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.326156616210938, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.03105182573199272, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.07256507873535, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.025182297453284264, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.502708435058594, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0011959399562329054, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.063056945800781, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.00811985693871975, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.42474365234375, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.012593698687851429, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.841636657714844, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.035285912454128265, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.763352394104004, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.019012536853551865, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.1150004863739014, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 
0.011600450612604618, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.19927240908145905, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0011927419109269977, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.19883155822754, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0010952370939776301, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.75434112548828, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.02943258173763752, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.140674591064453, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.0256598312407732, "pnorm/_forward_module.model.norm.weight": 30.274080276489258, "gnorm/_forward_module.model.norm.weight": 0.004100180696696043, "pnorm/_forward_module.lm_head.weight": 226.13233947753906, "gnorm/_forward_module.lm_head.weight": 0.04004082828760147} +{"step": 1258291200, "pnorm/_forward_module.model.embeddings.weight": 140.9578094482422, "gnorm/_forward_module.model.embeddings.weight": 0.06063740700483322, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.808208465576172, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0021605438087135553, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.208089828491211, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.007497465703636408, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.923527717590332, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.008646723814308643, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.904733657836914, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.0874607115983963, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.880937576293945, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.08318505436182022, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.075662612915039, 
"gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.006078120321035385, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.32572269439697266, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0005800988292321563, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.027965545654297, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.001282096141949296, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.016193389892578, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.04720378667116165, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.132919311523438, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.055415183305740356, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.270435333251953, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0015385064762085676, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.14002513885498, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.006994299124926329, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.953818321228027, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.009982168674468994, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.288522720336914, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.05948098376393318, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.373271942138672, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.052878957241773605, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3705997467041016, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.007546186447143555, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16129229962825775, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00040453753899782896, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.366878509521484, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 
0.001112981466576457, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.666690826416016, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.03968862444162369, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.568706512451172, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03357897698879242, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.6713924407959, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0014042711118236184, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.856966018676758, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.008254819549620152, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.395910263061523, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.012137032113969326, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.71658706665039, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.05386349931359291, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.869951248168945, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.037743423134088516, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.9933462142944336, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.00630616070702672, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19193291664123535, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.00047271366929635406, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.566991806030273, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0011039265664294362, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.305862426757812, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.03850811347365379, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.03369903564453, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03371669724583626, 
"pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.139089584350586, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0013204512652009726, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.20042610168457, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.005356659647077322, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.601859092712402, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.008685889653861523, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.14430046081543, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.049681346863508224, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.443320274353027, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.03789716958999634, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.812361478805542, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.004759907256811857, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.15999598801136017, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0004021568747702986, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.30185317993164, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0038950382731854916, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.915987014770508, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.05837811157107353, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.651830673217773, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.06660016626119614, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.06048011779785, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.007481568027287722, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.357412338256836, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.061752352863550186, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 
13.545580863952637, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.08818324655294418, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.218530654907227, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.12499909847974777, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.75312328338623, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.029026515781879425, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.658860683441162, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.009001716040074825, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.1511540412902832, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0013007082743570209, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.300247192382812, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0009669375140219927, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.33449363708496, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.03064950555562973, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.080001831054688, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.029021743685007095, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.517377853393555, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0015840606065467, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.08360767364502, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.016498740762472153, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.439476013183594, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.027810007333755493, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.859371185302734, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.03765421733260155, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.792799949645996, 
"gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.020643655210733414, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.1188695430755615, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.028147898614406586, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.19968239963054657, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.003798685036599636, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.20530128479004, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0012400287669152021, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.77415657043457, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.029973367229104042, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.15461540222168, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.025917628780007362, "pnorm/_forward_module.model.norm.weight": 30.339509963989258, "gnorm/_forward_module.model.norm.weight": 0.0035744858905673027, "pnorm/_forward_module.lm_head.weight": 226.49708557128906, "gnorm/_forward_module.lm_head.weight": 0.03992973640561104} +{"step": 1279262720, "pnorm/_forward_module.model.embeddings.weight": 141.0197296142578, "gnorm/_forward_module.model.embeddings.weight": 0.059265125542879105, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.804580688476562, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.00214063236489892, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.228760719299316, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.007351779378950596, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.941893577575684, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.008539681322872639, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.895759582519531, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.08555971086025238, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 
10.872016906738281, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.08264566957950592, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.076246976852417, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.005955498665571213, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.326959490776062, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0009867704939097166, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.023134231567383, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0012404591543599963, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.01430320739746, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.04572124034166336, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.13139533996582, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.05598977953195572, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.266578674316406, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0016678695101290941, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.148319244384766, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.007479813881218433, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.960192680358887, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.009319275617599487, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.281804084777832, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.060610514134168625, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.366503715515137, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.05343422293663025, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3710978031158447, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.008580000139772892, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16148562729358673, 
"gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0010502575896680355, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.368160247802734, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0011445165146142244, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.67296028137207, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.040754079818725586, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.573169708251953, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.034627217799425125, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.673179626464844, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0016448916867375374, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.870048522949219, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.011335162445902824, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.405338287353516, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.01534795481711626, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.71405029296875, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.0577065572142601, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.867740631103516, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.038673605769872665, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.9945075511932373, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.008968210779130459, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19188560545444489, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0008577621192671359, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.56913948059082, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0014671996468678117, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.313526153564453, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 
0.039050377905368805, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.04085922241211, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.034720342606306076, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.145397186279297, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0013484241208061576, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.225082397460938, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.005920679308474064, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.623130798339844, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.008630525320768356, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.145188331604004, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.05139915272593498, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.44591236114502, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.03930145129561424, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.8156096935272217, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.005254392512142658, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16032586991786957, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0006090838578529656, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.29776382446289, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.006601288449019194, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.909460067749023, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.08456316590309143, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.648834228515625, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.07769382745027542, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.066225051879883, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.011373800225555897, 
"pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.37394905090332, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.09747425466775894, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.559517860412598, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.15151214599609375, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.225625991821289, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.10417687147855759, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.77004623413086, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.026985928416252136, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.659838914871216, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.014291839674115181, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.1511983424425125, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.002675868570804596, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.301050186157227, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.000995783251710236, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.3411865234375, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.030422614887356758, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.08625030517578, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.025149697437882423, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.531007766723633, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0011410866864025593, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.101544380187988, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.0083090178668499, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.4524507522583, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.013512028381228447, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.877148628234863, 
"gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.03380439057946205, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.822219848632812, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.018686331808567047, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.1214780807495117, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.012932728976011276, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.19998577237129211, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.001653036568313837, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.210494995117188, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0013835449935868382, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.790372848510742, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.030403560027480125, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.16672706604004, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.027733977884054184, "pnorm/_forward_module.model.norm.weight": 30.403493881225586, "gnorm/_forward_module.model.norm.weight": 0.0023765568621456623, "pnorm/_forward_module.lm_head.weight": 226.83792114257812, "gnorm/_forward_module.lm_head.weight": 0.04687945917248726} +{"step": 1300234240, "pnorm/_forward_module.model.embeddings.weight": 141.07455444335938, "gnorm/_forward_module.model.embeddings.weight": 0.05894826352596283, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.800634384155273, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.00198227446526289, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.247485160827637, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.006856567692011595, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.95867919921875, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.007879210636019707, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 
10.88671875, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.08114643394947052, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.863147735595703, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.07795476168394089, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.07735276222229, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.005295613780617714, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3281277120113373, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0008252895786426961, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.019065856933594, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.001253687427379191, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.01288604736328, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.04418200999498367, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.13018035888672, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.054651472717523575, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.26341438293457, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0016345864860340953, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.155627250671387, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.007111826911568642, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.965503692626953, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.009722121059894562, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.275693893432617, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.056286267936229706, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.360358238220215, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.04981424659490585, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3736448287963867, 
"gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.007802166976034641, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16184251010417938, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00043400374124757946, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.370744705200195, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0009415352833457291, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.680212020874023, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.037703968584537506, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.578351974487305, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03326768800616264, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.673994064331055, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0017113416688516736, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.880741119384766, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.012100561521947384, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.41299819946289, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.018058916553854942, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.710845947265625, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.05177498981356621, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.864806175231934, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.03587711229920387, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.997166872024536, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.006480247713625431, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.1921638697385788, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.000574760022573173, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.570526123046875, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 
0.0011055425275117159, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.319629669189453, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.037419483065605164, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.046588897705078, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03221701830625534, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.1511287689209, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0012934647966176271, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.248129844665527, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.005547110922634602, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.64323902130127, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.00805575866252184, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.145723342895508, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.04491201043128967, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.447602272033691, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.03552280366420746, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.81978702545166, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.004381851758807898, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.1606668084859848, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0005172535311430693, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.293926239013672, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.004104199819266796, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.904159545898438, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.05899817496538162, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.64664077758789, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.06303907185792923, 
"pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.071884155273438, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.008204654790461063, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.391129493713379, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.06568442285060883, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.573802947998047, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.09967351704835892, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.232171058654785, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.1162981167435646, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.785712242126465, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.02688676118850708, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.661700963973999, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.007540133316069841, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15133067965507507, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0011435893829911947, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.30048370361328, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0008265849319286644, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.345314025878906, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.028929945081472397, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.091135025024414, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.02398681826889515, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.544897079467773, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0011622385354712605, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.118243217468262, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.006702927406877279, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 
13.464751243591309, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.010317567735910416, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.8942289352417, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.032687485218048096, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.85085391998291, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.018694717437028885, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.1247189044952393, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.005984886083751917, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.20030008256435394, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0006864492315798998, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.216081619262695, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0012233592569828033, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.806594848632812, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.028138775378465652, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.178909301757812, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.0258586835116148, "pnorm/_forward_module.model.norm.weight": 30.466035842895508, "gnorm/_forward_module.model.norm.weight": 0.003679960733279586, "pnorm/_forward_module.lm_head.weight": 227.16094970703125, "gnorm/_forward_module.lm_head.weight": 0.040333256125450134} +{"step": 1321205760, "pnorm/_forward_module.model.embeddings.weight": 141.12271118164062, "gnorm/_forward_module.model.embeddings.weight": 0.05914343148469925, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.797168731689453, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0023244298063218594, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.265472412109375, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.006826381664723158, 
"pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.974549293518066, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.007800353690981865, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.878178596496582, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.08566804230213165, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.85483169555664, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.07893665879964828, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.078002691268921, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.005218514706939459, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3294304609298706, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0006667140987701714, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.014869689941406, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0013641832629218698, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.0108642578125, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.04499753937125206, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.128765106201172, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.053515877574682236, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.260221481323242, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0016426958609372377, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.16357421875, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.0065560415387153625, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.971574783325195, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.009375795722007751, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.269356727600098, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.05830622836947441, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 
10.354089736938477, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.04973026365041733, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.375105381011963, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.007541948929429054, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16208921372890472, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0008057717350311577, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.371726989746094, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.001013649394735694, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.685026168823242, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.03810235112905502, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.58220100402832, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03246206417679787, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.67453384399414, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.001769588328897953, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.889781951904297, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.013959341682493687, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.41940689086914, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.022305944934487343, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.70811653137207, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.05109071359038353, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.862336158752441, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.035724300891160965, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 2.998246908187866, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.0077556646429002285, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19210615754127502, 
"gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0009923495817929506, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.572105407714844, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0010665490990504622, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.325820922851562, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.03702347353100777, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.05237579345703, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03208387643098831, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.15675926208496, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.001289904466830194, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.269139289855957, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.005564711056649685, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.661782264709473, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.008329257369041443, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.146617889404297, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.0448724627494812, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.44967269897461, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.03578780218958855, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.823983907699585, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.005338352173566818, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16095513105392456, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0005585936596617103, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.290515899658203, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.00459871394559741, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.899364471435547, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 
0.0640660971403122, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.644987106323242, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.06613844633102417, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.07720947265625, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.008581481873989105, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.405803680419922, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.07227980345487595, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.586116790771484, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.10675112158060074, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.238470077514648, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.11024279147386551, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.800271034240723, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.025619395077228546, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.663219928741455, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.007889938540756702, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.1514129638671875, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0013428392121568322, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.300697326660156, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0008640324813313782, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.350128173828125, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.02822836861014366, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.09630012512207, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.023723114281892776, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.5585994720459, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0010130447335541248, 
"pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.13365364074707, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.007073594257235527, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.476156234741211, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.011169329285621643, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.911296844482422, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.032297227531671524, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.87887191772461, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.018121885135769844, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.129603862762451, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.009648087434470654, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.20070742070674896, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0009357236558571458, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.220951080322266, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0012635764433071017, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.821165084838867, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.027674376964569092, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.190048217773438, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.02538728527724743, "pnorm/_forward_module.model.norm.weight": 30.525667190551758, "gnorm/_forward_module.model.norm.weight": 0.0026498560328036547, "pnorm/_forward_module.lm_head.weight": 227.46197509765625, "gnorm/_forward_module.lm_head.weight": 0.039499469101428986} +{"step": 1342177280, "pnorm/_forward_module.model.embeddings.weight": 141.164794921875, "gnorm/_forward_module.model.embeddings.weight": 0.058640748262405396, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.79366111755371, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 
0.00194554531481117, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.283363342285156, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.007533328607678413, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 14.990673065185547, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.009363166987895966, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.869575500488281, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.08201615512371063, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.846343994140625, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.07856862246990204, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0785317420959473, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.006088461261242628, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3304711878299713, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0010135268094018102, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.011581420898438, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0013272017240524292, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.009660720825195, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.044763240963220596, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.12751007080078, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.0507451593875885, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.257417678833008, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0016690606717020273, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.171113014221191, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.007172630168497562, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.977062225341797, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.009726963937282562, 
"pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.263723373413086, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.06069335713982582, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.348464965820312, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.04956686869263649, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3761329650878906, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.009081731550395489, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16226404905319214, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0007989450241439044, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.373754501342773, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0009934010449796915, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.69076156616211, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.03863942250609398, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.586423873901367, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.03142193332314491, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.675485610961914, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0014227313222363591, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.89907169342041, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.009142551571130753, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.426260948181152, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.013192582875490189, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.705615043640137, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.05139648914337158, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.859855651855469, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.03499780595302582, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 
3.000357151031494, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.007301134057343006, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19234442710876465, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0009527565562166274, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.573421478271484, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0010775267146527767, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.33114242553711, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.03591737523674965, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.057395935058594, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03067995235323906, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.162181854248047, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0012994155986234546, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.290377616882324, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.005706945434212685, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.680243492126465, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.009243072010576725, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.146925926208496, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.04662926122546196, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.451044082641602, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.03414077311754227, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.826756715774536, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.0053755370900034904, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.161146879196167, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.000679567048791796, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.287187576293945, 
"gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.003955528140068054, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.893970489501953, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.05527150630950928, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.642980575561523, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.05838468670845032, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.082050323486328, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.007239924743771553, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.420088768005371, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.05999205633997917, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.597932815551758, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.09255199879407883, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.244279861450195, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.10181547701358795, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.813851356506348, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.02616438828408718, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.664363145828247, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.008550380356609821, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.1515038162469864, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0014515647199004889, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.299850463867188, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0009085623314604163, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.353164672851562, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.028072480112314224, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.100099563598633, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 
0.024137774482369423, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.57217788696289, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0011201450834050775, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.148866653442383, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.006679420359432697, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.487471580505371, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.011233535595238209, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.928018569946289, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.033705465495586395, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.905919075012207, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.018850551918148994, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.1334283351898193, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.007182389497756958, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.20107322931289673, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0008393264724873006, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.225757598876953, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0014190205838531256, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.835142135620117, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.02948789857327938, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.200576782226562, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.02816929668188095, "pnorm/_forward_module.model.norm.weight": 30.582427978515625, "gnorm/_forward_module.model.norm.weight": 0.003229900961741805, "pnorm/_forward_module.lm_head.weight": 227.7449951171875, "gnorm/_forward_module.lm_head.weight": 0.04910266399383545} +{"step": 1363148800, "pnorm/_forward_module.model.embeddings.weight": 141.20095825195312, 
"gnorm/_forward_module.model.embeddings.weight": 0.05374623090028763, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.790552139282227, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.001824843231588602, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.299911499023438, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.007191051729023457, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.005489349365234, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.008387953974306583, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.861577987670898, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.0700729712843895, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.838410377502441, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.06799611449241638, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0796329975128174, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.005609361920505762, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.33148473501205444, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0004245341697242111, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.008216857910156, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0009347792947664857, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.00802993774414, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.038205213844776154, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.12608528137207, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.04079481214284897, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.25428009033203, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0011307531967759132, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.177942276000977, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.006765102501958609, 
"pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.982071876525879, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.008715745992958546, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.257672309875488, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.045187290757894516, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.342463493347168, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.04124993085861206, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.377960205078125, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.007256019860506058, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16249772906303406, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00028430490056052804, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.37534523010254, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0008669274393469095, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.695573806762695, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.03294141963124275, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.58985137939453, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.026452306658029556, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.67592430114746, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.000976547715254128, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.906492233276367, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.006685012020170689, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.431175231933594, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.009249130263924599, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.702682495117188, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.03978969529271126, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 
10.857046127319336, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.03087293915450573, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.0009212493896484, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.004819781985133886, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19240565598011017, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.00034661972313188016, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.57468605041504, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0008481626282446086, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.33583641052246, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.03113553673028946, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.061687469482422, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.02588525414466858, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.167390823364258, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0009951089741662145, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.308845520019531, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.00497035589069128, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.696150779724121, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.007168017793446779, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.147887229919434, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.035867076367139816, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.452775001525879, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.02932261861860752, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.8297975063323975, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.003714391030371189, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.1614442765712738, 
"gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0002801416558213532, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.283971786499023, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0014904882991686463, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.888288497924805, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.03503669053316116, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.640836715698242, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.04212933033704758, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.086978912353516, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0037696531508117914, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.433381080627441, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.02505960687994957, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.609001159667969, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.03767385333776474, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.2499418258667, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.08521991968154907, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.826889038085938, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.023117993026971817, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.665050983428955, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.005273185204714537, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15155072510242462, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0005444155540317297, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.30011749267578, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.000710589752998203, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.35724639892578, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 
0.025506025180220604, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.104522705078125, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.02224293164908886, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.585180282592773, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0009651709697209299, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.163945198059082, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.006553421262651682, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.498613357543945, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.011453812010586262, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.943700790405273, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.028452729806303978, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.931145668029785, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.01771644316613674, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.137404680252075, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.006563759874552488, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.20141372084617615, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0007686461322009563, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.229713439941406, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0013562324456870556, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.847579956054688, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.028703317046165466, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.209529876708984, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.027434498071670532, "pnorm/_forward_module.model.norm.weight": 30.637121200561523, "gnorm/_forward_module.model.norm.weight": 0.003315381007269025, "pnorm/_forward_module.lm_head.weight": 228.0102081298828, 
"gnorm/_forward_module.lm_head.weight": 0.04485529288649559} +{"step": 1384120320, "pnorm/_forward_module.model.embeddings.weight": 141.232177734375, "gnorm/_forward_module.model.embeddings.weight": 0.058650482445955276, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.787113189697266, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.002018216298893094, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.315279960632324, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.007539136800915003, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.019359588623047, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.008825286291539669, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.853346824645996, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.08020761609077454, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.83032512664795, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.07771926373243332, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.080162286758423, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.006202638614922762, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.332518070936203, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0009384734439663589, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.00519561767578, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0014836620539426804, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.006547927856445, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.044472403824329376, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.12481117248535, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.05741870403289795, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.251686096191406, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0015970258973538876, 
"pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.185342788696289, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.006840632762759924, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.987771987915039, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.008424407802522182, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.252098083496094, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.05816105753183365, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.336997032165527, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.05021923780441284, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.378664255142212, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.008146763779222965, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1626349687576294, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0007612494518980384, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.376850128173828, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0009943468030542135, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.6998348236084, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.03818826377391815, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.592926025390625, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.035417478531599045, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.67708396911621, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0019145694095641375, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.916311264038086, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.015116201713681221, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.437966346740723, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.022272586822509766, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 
10.699816703796387, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.05453027784824371, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.854414939880371, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.03627659007906914, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.0006039142608643, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.0061686295084655285, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19235721230506897, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0007327236817218363, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.57683563232422, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0013861807528883219, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.34128189086914, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.038125406950712204, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.066442489624023, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03503452613949776, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.17184066772461, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.001256531453691423, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.326518058776855, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.006047028116881847, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.711785316467285, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.008260744623839855, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.14812183380127, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.049443647265434265, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.453910827636719, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.03805055841803551, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.8316752910614014, 
"gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.005349011160433292, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16145026683807373, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0006320069078356028, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.280517578125, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.007043389603495598, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.881977081298828, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.08885577321052551, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.638086318969727, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.08054155111312866, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.09215545654297, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.011790873482823372, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.44782543182373, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.10693379491567612, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.620986938476562, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.16168992221355438, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.255590438842773, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.10495386272668839, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.839397430419922, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.025939002633094788, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6661930084228516, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.011451495811343193, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.151579350233078, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.002061538863927126, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.3001766204834, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 
0.0010234909132122993, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.360416412353516, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.029115770012140274, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.10812759399414, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.024506045505404472, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.59740447998047, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.001214071293361485, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.176595687866211, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.00908095482736826, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.50783920288086, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.015941698104143143, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.958739280700684, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.03184812515974045, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.955282211303711, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.017262596637010574, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.140420436859131, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.015003564767539501, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.20164674520492554, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.001756508951075375, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.23409080505371, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.001128224190324545, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.85969352722168, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.026378802955150604, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.218425750732422, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.02535208687186241, "pnorm/_forward_module.model.norm.weight": 
30.689237594604492, "gnorm/_forward_module.model.norm.weight": 0.002233593724668026, "pnorm/_forward_module.lm_head.weight": 228.25726318359375, "gnorm/_forward_module.lm_head.weight": 0.03413098677992821} +{"step": 1405091840, "pnorm/_forward_module.model.embeddings.weight": 141.25833129882812, "gnorm/_forward_module.model.embeddings.weight": 0.05415948107838631, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.78402328491211, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0018000563140958548, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.329947471618652, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.006924462504684925, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.032781600952148, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.007897059433162212, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.84557056427002, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.07260427623987198, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.822630882263184, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.07157719880342484, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0806212425231934, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.004937555640935898, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3333693742752075, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.00046684255357831717, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.002613067626953, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0011545694433152676, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.00518035888672, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.03855122625827789, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.123626708984375, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.04388918727636337, 
"pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.248605728149414, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0012675904436036944, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.190898895263672, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.006737087853252888, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.991753578186035, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.0087998416274786, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.246384620666504, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.047184813767671585, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.331475257873535, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.044190097600221634, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.379140615463257, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.007752858567982912, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16276521980762482, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0008044333080761135, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.378419876098633, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0009112003026530147, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.70374870300293, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.0351591520011425, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.595726013183594, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.028599543496966362, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.67791175842285, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0014896116917952895, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.924257278442383, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.010679430328309536, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 
12.443900108337402, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.017026178538799286, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.697083473205566, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.04351349547505379, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.851632118225098, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.03164813295006752, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.001248836517334, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.008405469357967377, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19224528968334198, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0007650203187949955, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.5781307220459, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0010329480282962322, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.34536361694336, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.0332251638174057, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.070262908935547, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.027882663533091545, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.176557540893555, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.001140198903158307, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.34247875213623, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.0056751202791929245, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.725225448608398, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.008057800121605396, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.148874282836914, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.03741441294550896, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.455514907836914, 
"gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.031217413023114204, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.834176778793335, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.004951159469783306, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16167257726192474, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0005358237540349364, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.277551651000977, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0034964419901371002, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.876548767089844, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.05123686045408249, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.63565444946289, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.05274403840303421, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.096538543701172, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.00644539762288332, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.460235595703125, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.05279520899057388, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.631239891052246, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.07985293865203857, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.260187149047852, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.09103164076805115, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.850116729736328, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.023007025942206383, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.667814254760742, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.008620255626738071, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15166711807250977, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 
0.0014039167435839772, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.300125122070312, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0007771641830913723, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.363014221191406, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.02567717432975769, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.111433029174805, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.02178085222840309, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.60945701599121, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.001035600434988737, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.190305709838867, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.006778387818485498, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.518033981323242, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.011051727458834648, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.973413467407227, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.028227834030985832, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 12.978568077087402, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.016688702628016472, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.142080307006836, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.007087558973580599, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.2017400711774826, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.00045114484964869916, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.23851203918457, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0012396343518048525, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.871734619140625, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.02682872675359249, 
"pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.226953506469727, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.02556586265563965, "pnorm/_forward_module.model.norm.weight": 30.73866081237793, "gnorm/_forward_module.model.norm.weight": 0.0034814453683793545, "pnorm/_forward_module.lm_head.weight": 228.48666381835938, "gnorm/_forward_module.lm_head.weight": 0.0415467843413353} +{"step": 1426063360, "pnorm/_forward_module.model.embeddings.weight": 141.28012084960938, "gnorm/_forward_module.model.embeddings.weight": 0.06143143028020859, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.780841827392578, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.00229337764903903, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.343125343322754, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.006920484360307455, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.044458389282227, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.00822452176362276, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.838013648986816, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.08910354226827621, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.815057754516602, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.08457683026790619, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0814008712768555, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.005676338914781809, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3342207372188568, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0007715040119364858, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 22.00058364868164, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0014902764232829213, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.00417709350586, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 
0.0479637086391449, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.122636795043945, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.05628800764679909, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.24551773071289, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0018297253409400582, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.195685386657715, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.00785733386874199, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.994994163513184, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.010560862720012665, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.240538597106934, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.06468556076288223, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.32577896118164, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.05379108339548111, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.380969762802124, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.009467413648962975, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16301283240318298, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0009153638384304941, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.38018226623535, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.001118742162361741, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.707740783691406, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.042473506182432175, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.59857177734375, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.035390038043260574, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.67915916442871, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.001876703230664134, 
"pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.932242393493652, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.014147643931210041, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.449552536010742, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.020041435956954956, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.694643020629883, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.05835969001054764, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.849209785461426, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.036664001643657684, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.0027120113372803, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.008318657986819744, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19226545095443726, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0008854373008944094, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.57876968383789, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.001222226652316749, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.348007202148438, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.03879677876830101, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.073116302490234, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.03417186439037323, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.180660247802734, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0014247809303924441, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.357261657714844, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.006059303879737854, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.738140106201172, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.009050014428794384, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 
11.14871597290039, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.049621615558862686, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.456313133239746, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.03666876256465912, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.836935520172119, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.0067070359364151955, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16199980676174164, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0008755015442147851, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.275026321411133, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.005864987149834633, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.871463775634766, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.07914095371961594, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.633554458618164, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.07375045865774155, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.10091781616211, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.01053055003285408, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.473670959472656, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.09218183159828186, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.642518043518066, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.14585240185260773, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.26554012298584, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.0896771103143692, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.861485481262207, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.02703119069337845, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.668905258178711, 
"gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.010807356797158718, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15167659521102905, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0018861411372199655, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.299448013305664, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0009873913368210196, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.364133834838867, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.02944166585803032, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.114059448242188, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.02424027770757675, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.621122360229492, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.001310874824412167, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.202688217163086, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.008222578093409538, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.5269775390625, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.01423039473593235, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 11.9876070022583, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.03516367822885513, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.000852584838867, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.018927790224552155, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.145982265472412, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.006846986711025238, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.2021321952342987, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0007431220728904009, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.242679595947266, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 
0.0014447948196902871, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.882671356201172, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.030102215707302094, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.234933853149414, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.027023527771234512, "pnorm/_forward_module.model.norm.weight": 30.786340713500977, "gnorm/_forward_module.model.norm.weight": 0.0032523951958864927, "pnorm/_forward_module.lm_head.weight": 228.70083618164062, "gnorm/_forward_module.lm_head.weight": 0.04591897502541542} +{"step": 1447034880, "pnorm/_forward_module.model.embeddings.weight": 141.29800415039062, "gnorm/_forward_module.model.embeddings.weight": 0.05258964002132416, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.777780532836914, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.001824040780775249, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.356145858764648, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.006446984130889177, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.056172370910645, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.007319051772356033, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.830506324768066, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.07009615004062653, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.807587623596191, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.06967712938785553, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0821123123168945, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.0047168671153485775, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3351103663444519, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.00026414936291985214, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.998125076293945, 
"gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.001134722027927637, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.002609252929688, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.03770115599036217, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.12126350402832, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.03995811939239502, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.24334716796875, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0011333344737067819, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.20146656036377, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.006455204915255308, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 11.999241828918457, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.00849646795541048, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.235515594482422, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.046448204666376114, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.320708274841309, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.043121203780174255, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.382667064666748, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.00776921771466732, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1631447970867157, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0007328201318159699, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.38278579711914, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0008522021817043424, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.712427139282227, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.03434569388628006, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.601621627807617, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 
0.026990268379449844, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.68056297302246, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0011590613285079598, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.939781188964844, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.008263804949820042, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.45512866973877, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.011462535709142685, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.692529678344727, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.04051613062620163, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.847216606140137, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.03148635849356651, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.0049619674682617, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.008175786584615707, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19244834780693054, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0007070201099850237, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.57982063293457, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0009730973397381604, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.350919723510742, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.033133480697870255, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.076122283935547, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.02674063853919506, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.184579849243164, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0010417834855616093, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.371920585632324, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.005275707691907883, 
"pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.751214981079102, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.007814446464180946, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.148221015930176, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.03519723564386368, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.456670761108398, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.029805844649672508, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.8399524688720703, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.005025614984333515, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16223132610321045, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0005328648840077221, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.27263832092285, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0015895599499344826, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.866844177246094, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.03653561696410179, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.63156509399414, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.04529011249542236, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.105085372924805, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.003769832430407405, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.485916137695312, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.025978367775678635, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.652729988098145, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.03325425460934639, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.270330429077148, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.08899592608213425, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 
13.872037887573242, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.024067522957921028, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.669525623321533, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.005143071990460157, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15172874927520752, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0005798544734716415, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.29937744140625, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.000771659251768142, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.366060256958008, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.026611287146806717, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.116830825805664, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.022945253178477287, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.63125991821289, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0010296710534021258, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.213436126708984, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.009367029182612896, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.534589767456055, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.01571926847100258, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.00025463104248, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.029697958379983902, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.021187782287598, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.01723971962928772, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.1484856605529785, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.013487432152032852, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.20230819284915924, 
"gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0015228039119392633, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.246625900268555, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.001230333000421524, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.892702102661133, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.028739765286445618, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.242645263671875, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.02588404156267643, "pnorm/_forward_module.model.norm.weight": 30.831363677978516, "gnorm/_forward_module.model.norm.weight": 0.003400243818759918, "pnorm/_forward_module.lm_head.weight": 228.90109252929688, "gnorm/_forward_module.lm_head.weight": 0.047863125801086426} +{"step": 1468006400, "pnorm/_forward_module.model.embeddings.weight": 141.31236267089844, "gnorm/_forward_module.model.embeddings.weight": 0.05013102665543556, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.775081634521484, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0015525136841461062, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.368962287902832, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.006622446700930595, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.067627906799316, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.007500693667680025, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.82346248626709, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.0645046979188919, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.800561904907227, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.06562145799398422, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0828769207000732, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.005047605838626623, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 
0.3362793028354645, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.000645408290438354, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.995513916015625, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0010446823434904218, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 27.000473022460938, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.03636545315384865, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.119388580322266, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.041658833622932434, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.241077423095703, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0011502369306981564, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.206478118896484, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.0057373084127902985, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.003098487854004, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.007095671724528074, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.230626106262207, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.04263436049222946, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.315875053405762, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.04125181958079338, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.384270668029785, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.007089054677635431, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16333702206611633, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00047861470375210047, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.384702682495117, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0007941853837110102, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.715898513793945, 
"gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.03216007724404335, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.604129791259766, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.027870191261172295, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.682024002075195, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0014050680911168456, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.94752025604248, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.011101880110800266, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.460402488708496, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.016452597454190254, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.690164566040039, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.039375998079776764, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.845064163208008, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.03062829002737999, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.0067365169525146, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.005773080978542566, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19243018329143524, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.000580997730139643, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.58149528503418, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0010479808552190661, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.354467391967773, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.03235367685556412, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.07958984375, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.027739843353629112, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.18927764892578, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 
0.0009558442980051041, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.38760757446289, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.004932490177452564, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.764946937561035, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.006948210299015045, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.148770332336426, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.03728168085217476, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.457958221435547, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.03126651421189308, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.841994524002075, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.005382952746003866, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16223062574863434, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0006781523115932941, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.269845962524414, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.004907122813165188, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.861183166503906, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.06488759815692902, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.62860107421875, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.05747140571475029, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.109296798706055, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.008249299600720406, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.496137619018555, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.07453179359436035, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.661230087280273, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.11037327349185944, 
"pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.274636268615723, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.07080935686826706, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.882203102111816, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.021933073177933693, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.670177936553955, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.008650572970509529, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.1517312228679657, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0014350098790600896, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.298566818237305, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0008075315272435546, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.36655616760254, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.02492307871580124, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.118667602539062, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.020654192194342613, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.641489028930664, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0009462415473535657, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.223227500915527, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.006026186514645815, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.541764259338379, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.010831179097294807, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.012748718261719, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.02564798854291439, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.040681838989258, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.01516793854534626, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 
3.1528186798095703, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.0052392370998859406, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.2027999460697174, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0006737664807587862, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.249488830566406, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0009689174476079643, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.90048599243164, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.023535719141364098, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.248680114746094, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.021787654608488083, "pnorm/_forward_module.model.norm.weight": 30.873689651489258, "gnorm/_forward_module.model.norm.weight": 0.0035547856241464615, "pnorm/_forward_module.lm_head.weight": 229.08653259277344, "gnorm/_forward_module.lm_head.weight": 0.03724076971411705} +{"step": 1488977920, "pnorm/_forward_module.model.embeddings.weight": 141.3234100341797, "gnorm/_forward_module.model.embeddings.weight": 0.05253734812140465, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.772113800048828, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0016981024527922273, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.380290985107422, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.006542083341628313, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.077759742736816, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.007208861876279116, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.816399574279785, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.06592147797346115, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.79342269897461, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.06700903922319412, 
"pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0833911895751953, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.004816305357962847, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.33688825368881226, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0004066765250172466, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.99413299560547, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0009714727057144046, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.999711990356445, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.03584813326597214, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.118425369262695, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.03649679571390152, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.2392635345459, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0011028603184968233, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.212410926818848, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.00666717579588294, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.007695198059082, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.008508727885782719, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.226027488708496, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.044157445430755615, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.311334609985352, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.042394042015075684, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.385383367538452, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.007576615549623966, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1635119915008545, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0006312267505563796, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 
22.386943817138672, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0008775184978730977, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.719499588012695, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.033379681408405304, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.606647491455078, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.025792749598622322, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.68305778503418, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0011990396305918694, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.953951835632324, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.009497753344476223, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.465391159057617, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.013468507677316666, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.687762260437012, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.03991460055112839, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.842670440673828, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.030735958367586136, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.0088906288146973, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.00623240415006876, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19254277646541595, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0005940856062807143, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.583051681518555, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0009145922376774251, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.3575496673584, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.03182484582066536, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.08248519897461, 
"gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.025120846927165985, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.193265914916992, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0009410099009983242, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.400923728942871, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.0052547939121723175, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.776908874511719, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.007596385665237904, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.14890193939209, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.03446129336953163, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.458575248718262, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.029325764626264572, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.844266176223755, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.004569970536977053, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16237851977348328, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0005658991285599768, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.267431259155273, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0027748497668653727, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.856149673461914, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.044703081250190735, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.626022338867188, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.041939131915569305, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.112924575805664, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.004985740873962641, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.505659103393555, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 
0.04140370711684227, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.669149398803711, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.06537748128175735, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.278430938720703, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.07044295221567154, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.890669822692871, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.022273501381278038, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.671079397201538, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.007832365110516548, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15178120136260986, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0011537026148289442, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.299421310424805, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0007265189778991044, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.368831634521484, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.025830945000052452, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.121479034423828, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.022416841238737106, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.651479721069336, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0009857782861217856, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.233137130737305, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.008845590986311436, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.548938751220703, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.01551905833184719, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.024788856506348, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.026185384020209312, 
"pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.059568405151367, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.016703125089406967, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.1553757190704346, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.012951829470694065, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.2030186951160431, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0016150136943906546, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.252771377563477, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0012832039501518011, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.908504486083984, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.026429537683725357, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.254650115966797, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.024065641686320305, "pnorm/_forward_module.model.norm.weight": 30.913516998291016, "gnorm/_forward_module.model.norm.weight": 0.0032282168976962566, "pnorm/_forward_module.lm_head.weight": 229.25845336914062, "gnorm/_forward_module.lm_head.weight": 0.036953262984752655} +{"step": 1509949440, "pnorm/_forward_module.model.embeddings.weight": 141.33163452148438, "gnorm/_forward_module.model.embeddings.weight": 0.05193153768777847, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.769203186035156, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0016872099367901683, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.390849113464355, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.006966608576476574, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.087395668029785, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.008413177914917469, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.809537887573242, 
"gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.06804339587688446, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.786602020263672, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.06783075630664825, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0843405723571777, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.005484515335410833, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.33786919713020325, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.00037869837251491845, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.993038177490234, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0009445322211831808, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.998952865600586, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.03703344613313675, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.11757469177246, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.03965944051742554, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.237106323242188, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.001119338790886104, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.216598510742188, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.006491054780781269, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.010780334472656, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.008938536047935486, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.221681594848633, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.04585679993033409, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.306864738464355, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.04208604618906975, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3857004642486572, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 
0.007643653079867363, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1636420488357544, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0005517303943634033, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.388408660888672, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0008207714417949319, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.721933364868164, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.033649034798145294, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.608234405517578, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.02691021002829075, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.683944702148438, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.001429461408406496, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.959382057189941, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.010423910804092884, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.469282150268555, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.016386820003390312, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.685415267944336, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.041635576635599136, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.840385437011719, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.030728967860341072, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.0099310874938965, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.00526405917480588, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.1925516575574875, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0003077512083109468, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.58418846130371, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0009217620827257633, 
"pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.359636306762695, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.03200826793909073, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.08485984802246, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.0252953190356493, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.196287155151367, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.001020570402033627, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.41177749633789, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.005945454817265272, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.786412239074707, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.00785202719271183, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.148480415344238, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.0345115065574646, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.458809852600098, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.029642432928085327, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.846200466156006, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.005159120075404644, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16255632042884827, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0004712569061666727, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.26522445678711, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0023406515829265118, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.851303100585938, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.04127810522913933, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.623703002929688, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.04175635054707527, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 
24.116493225097656, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0048225694335997105, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.515142440795898, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.0391351543366909, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.677168846130371, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.054032985121011734, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.282176971435547, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.07753630727529526, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.899160385131836, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.022321123629808426, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6711299419403076, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.006508544087409973, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15173636376857758, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0008781488286331296, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.29921531677246, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.000713752000592649, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.369333267211914, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.025298111140727997, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.1231632232666, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.02126453071832657, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.66115379333496, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0011064456775784492, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.242242813110352, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.0065179080702364445, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.555590629577637, 
"gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.01093506533652544, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.036498069763184, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.025752823799848557, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.077733039855957, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.015995962545275688, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.1576952934265137, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.007540058810263872, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.20317935943603516, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0008925902075134218, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.255863189697266, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0011199692962691188, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.915464401245117, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.026959823444485664, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.260236740112305, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.025378167629241943, "pnorm/_forward_module.model.norm.weight": 30.951486587524414, "gnorm/_forward_module.model.norm.weight": 0.002356578130275011, "pnorm/_forward_module.lm_head.weight": 229.41815185546875, "gnorm/_forward_module.lm_head.weight": 0.035638391971588135} +{"step": 1530920960, "pnorm/_forward_module.model.embeddings.weight": 141.3372802734375, "gnorm/_forward_module.model.embeddings.weight": 0.04841391742229462, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.766447067260742, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0015646313549950719, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.400789260864258, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.0063665020279586315, 
"pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.096345901489258, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.007209485862404108, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.803022384643555, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.06304460018873215, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.780112266540527, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.06326435506343842, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0856075286865234, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.004498713184148073, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.33889591693878174, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0002727353130467236, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.991863250732422, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0009120333124883473, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.998043060302734, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.03502418473362923, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.116546630859375, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.03734087198972702, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.235403060913086, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0009609755361452699, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.221500396728516, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.005639285314828157, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.014350891113281, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.006754318252205849, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.217493057250977, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.04148973524570465, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 
10.302679061889648, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.03995806723833084, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.385917901992798, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.005392624996602535, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1636238396167755, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00035987759474664927, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.390199661254883, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0008135305834002793, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.724428176879883, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.031364914029836655, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.609888076782227, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.025339864194393158, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.685047149658203, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0012392301578074694, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.965089797973633, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.010961826890707016, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.473388671875, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.015894509851932526, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.683260917663574, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.036937471479177475, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.838356018066406, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.029114803299307823, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.0105559825897217, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.006906350143253803, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.1924651861190796, 
"gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0006614853045903146, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.58517074584961, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0007744840113446116, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.361331939697266, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.029749510809779167, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.08696937561035, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.023876355960965157, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.19970703125, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0009365943260490894, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.423176765441895, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.004792120773345232, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.79623794555664, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.006901762448251247, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.148130416870117, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.03285914659500122, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.459095001220703, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.02761179581284523, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.8482306003570557, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.0036545570474117994, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16264209151268005, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0002525387972127646, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.263275146484375, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.001378426793962717, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.847379684448242, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 
0.033951278775930405, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.622053146362305, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.03731426224112511, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.120166778564453, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0030745964031666517, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.524160385131836, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.02597479149699211, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.684815406799316, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.03409360349178314, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.285798072814941, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.07370274513959885, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.906930923461914, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.021297218278050423, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6716413497924805, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.0041878498159348965, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.1517602801322937, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.00024401528935413808, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.298364639282227, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0006895376718603075, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.368860244750977, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.02365107089281082, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.1243896484375, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.020164739340543747, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.670429229736328, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0007985431584529579, 
"pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.25068187713623, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.005592254921793938, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.561952590942383, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.008755877614021301, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.047650337219238, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.02348273992538452, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.09496021270752, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.015185757540166378, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.1592087745666504, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.003626700025051832, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.20327462255954742, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.00025187243591062725, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.258527755737305, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0011376891052350402, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.921674728393555, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.023752881214022636, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.26498031616211, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.023052280768752098, "pnorm/_forward_module.model.norm.weight": 30.98709487915039, "gnorm/_forward_module.model.norm.weight": 0.002909998642280698, "pnorm/_forward_module.lm_head.weight": 229.56576538085938, "gnorm/_forward_module.lm_head.weight": 0.03307252377271652} +{"step": 1551892480, "pnorm/_forward_module.model.embeddings.weight": 141.3407440185547, "gnorm/_forward_module.model.embeddings.weight": 0.04660690203309059, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.763931274414062, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 
0.0014899058733135462, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.410350799560547, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.00662748608738184, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.105124473571777, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.0076172687113285065, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.79688549041748, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.05842110142111778, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.7739839553833, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.059329014271497726, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0864734649658203, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.004970878828316927, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3398052752017975, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0002769292623270303, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.991344451904297, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0008916803635656834, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.997770309448242, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.033017098903656006, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.1159610748291, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.03345078229904175, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.233366012573242, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0009784854482859373, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.225470542907715, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.005888581275939941, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.017339706420898, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.0069593144580721855, 
"pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.212929725646973, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.03836136683821678, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.29824447631836, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.03672733157873154, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3865044116973877, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.006821371614933014, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16373586654663086, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0008537101675756276, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.39177131652832, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0006980585749261081, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.726455688476562, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.029308974742889404, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.611366271972656, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.02339565008878708, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.685989379882812, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.000863801222294569, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.969808578491211, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.006747142411768436, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.477059364318848, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.00835027452558279, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.681181907653809, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.03433101996779442, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.836250305175781, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.02725241892039776, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 
3.012617349624634, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.005009262822568417, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19260646402835846, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.00040722748963162303, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.586891174316406, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0007387946825474501, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.36378288269043, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.0288309957832098, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.089452743530273, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.023050149902701378, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.20259666442871, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0008316697203554213, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.433361053466797, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.004896924365311861, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.805166244506836, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.006447851657867432, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.147587776184082, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.029295992106199265, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.459115028381348, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.02663147635757923, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.8495113849639893, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.003984327428042889, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16269628703594208, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.00038963492261245847, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.261194229125977, 
"gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.001806875690817833, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.843069076538086, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.03476063534617424, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.62029457092285, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.03578706085681915, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.123754501342773, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.003543598810210824, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.532736778259277, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.02762742154300213, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.692151069641113, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.036372989416122437, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.28872299194336, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.06826578825712204, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.913771629333496, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.020621027797460556, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6727254390716553, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.005346538033336401, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15180858969688416, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0007277269032783806, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.29783058166504, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0006579891196452081, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.368385314941406, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.02362060360610485, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.12548828125, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 
0.020003899931907654, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.679641723632812, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0008624054025858641, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.25964641571045, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.00583116989582777, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.568631172180176, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.00994083285331726, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.058343887329102, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.02158304490149021, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.111202239990234, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.014513000845909119, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.1607871055603027, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.006419248413294554, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.20338362455368042, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0006835360545665026, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.261619567871094, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0008764219819568098, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.9283390045166, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.0221656933426857, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.270231246948242, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.020319724455475807, "pnorm/_forward_module.model.norm.weight": 31.02104377746582, "gnorm/_forward_module.model.norm.weight": 0.0033368293661624193, "pnorm/_forward_module.lm_head.weight": 229.7028045654297, "gnorm/_forward_module.lm_head.weight": 0.030175838619470596} +{"step": 1572864000, "pnorm/_forward_module.model.embeddings.weight": 141.34231567382812, 
"gnorm/_forward_module.model.embeddings.weight": 0.0511101670563221, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.761091232299805, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0014944735448807478, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.418560028076172, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.006235671695321798, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.112528800964355, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.007074739318341017, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.79068660736084, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.06437401473522186, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.767828941345215, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.06364325433969498, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.086941957473755, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.004765678197145462, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.34041154384613037, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0004845497605856508, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.990802764892578, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0008579209097661078, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.997217178344727, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.0346577912569046, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.11513900756836, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.039726149290800095, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.231517791748047, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0011176248081028461, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.228399276733398, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.005880988202989101, 
"pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.019572257995605, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.0074874055571854115, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.209059715270996, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.04307766631245613, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.29435920715332, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.040068164467811584, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.387221574783325, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.00714969402179122, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16385243833065033, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0005019159289076924, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.393770217895508, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0008677493315190077, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.728885650634766, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.03229229524731636, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.613039016723633, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.02695317566394806, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.686843872070312, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0012782858684659004, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.974830627441406, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.010295368731021881, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.480842590332031, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.01593465358018875, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.67902946472168, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.03943950682878494, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 
10.833976745605469, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.029816314578056335, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.0134360790252686, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.007052162662148476, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19259196519851685, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0008759571355767548, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.587890625, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0009144030627794564, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.365083694458008, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.031422242522239685, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.091167449951172, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.026540502905845642, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.205978393554688, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.000955247669480741, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.443727493286133, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.004960155580192804, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.814384460449219, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.006652272772043943, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.147491455078125, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.036955174058675766, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.4594144821167, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.03044072538614273, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.85109281539917, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.0038881185464560986, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16281409561634064, 
"gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.00039907669997774065, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.259735107421875, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.004425464663654566, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.839826583862305, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.059626128524541855, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.6192684173584, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.05310096591711044, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.12665367126465, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.007235957309603691, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.540616989135742, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.06439444422721863, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.69881820678711, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.09599091857671738, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.29157543182373, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.07334250211715698, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.920063972473145, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.02341967634856701, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.67322039604187, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.008087008260190487, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15176637470722198, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0013108761049807072, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.297298431396484, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0007821611943654716, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.367847442626953, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 
0.026625119149684906, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.126354217529297, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.021927157416939735, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.688087463378906, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0010033717844635248, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.267772674560547, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.007008403539657593, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.574613571166992, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.011401042342185974, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.06822681427002, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.027282273396849632, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.126143455505371, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.01634199731051922, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.161935567855835, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.00953090749680996, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.2034720778465271, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.001052219420671463, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.26384162902832, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0009182909270748496, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.933311462402344, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.02549002133309841, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.274389266967773, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.022595375776290894, "pnorm/_forward_module.model.norm.weight": 31.052589416503906, "gnorm/_forward_module.model.norm.weight": 0.003593753557652235, "pnorm/_forward_module.lm_head.weight": 229.82864379882812, 
"gnorm/_forward_module.lm_head.weight": 0.039079051464796066} +{"step": 1593835520, "pnorm/_forward_module.model.embeddings.weight": 141.34226989746094, "gnorm/_forward_module.model.embeddings.weight": 0.048920322209596634, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.758739471435547, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0015286377165466547, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.42687702178955, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.006441260222345591, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.120044708251953, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.007151174824684858, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.78491497039795, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.06079521030187607, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.762158393859863, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.061314500868320465, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.087322235107422, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.005053061060607433, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3410835564136505, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.00040872677345760167, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.989994049072266, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0008550931815989316, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.996349334716797, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.03364688530564308, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.114185333251953, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.0334794707596302, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.229536056518555, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0009760325192473829, 
"pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.231460571289062, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.00634295167401433, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.021903991699219, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.007837914861738682, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.205062866210938, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.040487680584192276, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.290358543395996, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.038616668432950974, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3877716064453125, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.007297574542462826, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1639394462108612, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0005739738699048758, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.395801544189453, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0007807416841387749, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.7311954498291, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.030722618103027344, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.614566802978516, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.023675622418522835, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.68805694580078, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0009266522829420865, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.979864120483398, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.007016435731202364, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.48468017578125, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.008664139546453953, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 
10.67728042602539, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.03528078645467758, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.832291603088379, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.028138725087046623, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.014346122741699, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.0049085356295108795, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19257384538650513, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0005124473827891052, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.589078903198242, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0007789076771587133, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.366561889648438, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.029513558372855186, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.092700958251953, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.022875957190990448, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.209251403808594, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0008457418298348784, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.453250885009766, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.00508505292236805, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.822916030883789, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.006801978684961796, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.147290229797363, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.030658286064863205, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.459641456604004, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.027094844728708267, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.8529772758483887, 
"gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.00418664887547493, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16282130777835846, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0005261188489384949, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.258058547973633, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0014438950456678867, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.835988998413086, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.03462684899568558, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.617464065551758, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.03335690498352051, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.129613876342773, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.002820787485688925, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.547597885131836, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.02168707363307476, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.704668998718262, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.02756560780107975, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.294089317321777, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.06976042687892914, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.9258394241333, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.022036785259842873, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.673563003540039, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.006335618905723095, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15179000794887543, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0006447040941566229, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.296855926513672, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 
0.0007523235399276018, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.367206573486328, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.025579005479812622, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.127103805541992, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.023026296868920326, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.695764541625977, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0011645004851743579, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.274066925048828, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.010883286595344543, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.579106330871582, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.02049865387380123, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.077529907226562, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.025907723233103752, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.140000343322754, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.015810057520866394, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.164200782775879, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.020050378516316414, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.20375747978687286, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0026022789534181356, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.266286849975586, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0009953962871804833, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.93834686279297, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.025253403931856155, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.278522491455078, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.022843917831778526, 
"pnorm/_forward_module.model.norm.weight": 31.081966400146484, "gnorm/_forward_module.model.norm.weight": 0.0028552794829010963, "pnorm/_forward_module.lm_head.weight": 229.94252014160156, "gnorm/_forward_module.lm_head.weight": 0.03862690553069115} +{"step": 1614807040, "pnorm/_forward_module.model.embeddings.weight": 141.34100341796875, "gnorm/_forward_module.model.embeddings.weight": 0.04720361530780792, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.75642967224121, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0014735623262822628, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.434249877929688, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.005979258567094803, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.126725196838379, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.0066452487371861935, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.77946662902832, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.060822900384664536, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.756768226623535, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.0608995258808136, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0875322818756104, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.0043783956207334995, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3416237533092499, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0004952540621161461, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.989561080932617, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.00099596893414855, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.99574089050293, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.034568481147289276, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.113462448120117, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 
0.038843732327222824, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.22785758972168, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0010164406849071383, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.234116554260254, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.00605815602466464, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.023839950561523, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.007085306104272604, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.201482772827148, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.04089934006333351, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.28678035736084, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.03818514198064804, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.388753890991211, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.006844379473477602, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16406431794166565, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00048476678784936666, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.397716522216797, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0008764974190853536, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.733348846435547, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.031428683549165726, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.615951538085938, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.02615945227444172, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.688640594482422, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0014776411699131131, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.983821868896484, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.011906755156815052, 
"pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.487586975097656, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.018466467037796974, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.675104141235352, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.03695788234472275, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.830079078674316, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.027732806280255318, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.01434326171875, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.0053502474911510944, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19251984357833862, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0004936269833706319, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.5899658203125, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0008425931446254253, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.367460250854492, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.029939115047454834, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.09379768371582, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.024154895916581154, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.211984634399414, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0008909463649615645, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.460700035095215, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.00480251619592309, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.829201698303223, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.006445455830544233, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.147040367126465, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.03174208477139473, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 
11.459756851196289, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.02700420841574669, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.8549816608428955, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.004829673562198877, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16297473013401031, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0006523000774905086, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.25634002685547, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.002242951886728406, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.83220672607422, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.03873131424188614, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.615859985351562, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.041986625641584396, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.132781982421875, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0043141795322299, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.554150581359863, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.03924781084060669, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.710217475891113, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.05586624518036842, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.29671573638916, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.07275590300559998, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.931479454040527, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.02056742087006569, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.674154281616211, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.00469741877168417, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15176358819007874, 
"gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0004519731446634978, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.2960147857666, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0006718530785292387, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.365936279296875, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.023274937644600868, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.127418518066406, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.01936795935034752, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.70330238342285, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.000806099153123796, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.28111457824707, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.004994237329810858, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.584203720092773, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.00769190676510334, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.08635139465332, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.02246537059545517, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.15308666229248, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.014324829913675785, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.165949583053589, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.004256530199199915, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.2038712352514267, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0003398848930373788, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.2685489654541, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.00081594631774351, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.942760467529297, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 
0.021546516567468643, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.282100677490234, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.01945408619940281, "pnorm/_forward_module.model.norm.weight": 31.109350204467773, "gnorm/_forward_module.model.norm.weight": 0.002632853575050831, "pnorm/_forward_module.lm_head.weight": 230.0476531982422, "gnorm/_forward_module.lm_head.weight": 0.031549569219350815} +{"step": 1635778560, "pnorm/_forward_module.model.embeddings.weight": 141.33865356445312, "gnorm/_forward_module.model.embeddings.weight": 0.04757488891482353, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.754228591918945, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.001384550821967423, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.441152572631836, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.006228774320334196, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.133137702941895, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.006767550017684698, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.774319648742676, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.057370979338884354, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.751606941223145, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.05874723196029663, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.08780837059021, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.004475228022783995, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3422457277774811, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.00033900741254910827, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.988998413085938, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0008607521303929389, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.994863510131836, 
"gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.03271064534783363, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.112613677978516, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.0334082767367363, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.22633934020996, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.001017098780721426, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.237645149230957, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.006288683973252773, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.026549339294434, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.007115233689546585, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.197785377502441, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.038260962814092636, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.283203125, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.036407604813575745, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3888845443725586, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.005560883786529303, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16406819224357605, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0005824992549605668, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.399370193481445, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0007806739886291325, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.735008239746094, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.03008354641497135, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.61709976196289, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.023673608899116516, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.68962860107422, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 
0.00101920694578439, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.988675117492676, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.009005491621792316, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.491043090820312, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.01150592789053917, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.67341423034668, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.033890292048454285, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.828376770019531, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.02702590823173523, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.0140671730041504, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.004927969072014093, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19240893423557281, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.00024775939527899027, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.59140396118164, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.000777918437961489, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.36893653869629, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.02873246744275093, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.095317840576172, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.02303978241980076, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.214540481567383, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0008132493239827454, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.468366622924805, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.004533540923148394, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.836150169372559, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.00603446876630187, 
"pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.146756172180176, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.02806384116411209, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.459814071655273, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.02595445141196251, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.8557450771331787, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.0035003244411200285, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16296668350696564, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0002932557254098356, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.254663467407227, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.00223099859431386, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.828426361083984, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.038576580584049225, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.614212036132812, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.0375615730881691, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.135725021362305, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.003881220007315278, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.56001091003418, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.03459172323346138, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.715176582336426, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.04954523220658302, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.298830032348633, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.06558841466903687, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.936346054077148, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.019765015691518784, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 
2.674858570098877, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.0044510760344564915, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15175460278987885, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0002696436131373048, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.296005249023438, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.000667211483232677, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.365476608276367, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.023003777489066124, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.128128051757812, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.020570436492562294, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.710079193115234, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0008972841314971447, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.287060737609863, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.009145347401499748, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.588678359985352, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.016169724985957146, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.094293594360352, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.021083880215883255, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.164840698242188, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.014314854517579079, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.1673355102539062, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.016458092257380486, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.20400190353393555, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0020926084835082293, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.270824432373047, 
"gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0008035209029912949, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.946929931640625, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.02206314168870449, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.285545349121094, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.019932396709918976, "pnorm/_forward_module.model.norm.weight": 31.13481330871582, "gnorm/_forward_module.model.norm.weight": 0.002822037087753415, "pnorm/_forward_module.lm_head.weight": 230.14434814453125, "gnorm/_forward_module.lm_head.weight": 0.02865600772202015} +{"step": 1656750080, "pnorm/_forward_module.model.embeddings.weight": 141.3353729248047, "gnorm/_forward_module.model.embeddings.weight": 0.045767735689878464, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.7523250579834, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0013880071928724647, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.447632789611816, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.006260499823838472, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.139086723327637, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.006955510936677456, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.76955509185791, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.05670694634318352, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.74685287475586, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.05645138770341873, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0883209705352783, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.004344481974840164, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.34299567341804504, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0003865035832859576, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 
21.988807678222656, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0008230588282458484, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.994258880615234, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.03145426884293556, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.111921310424805, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.03078995831310749, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.225000381469727, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0008509355247952044, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.24043083190918, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.0060376618057489395, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.02871036529541, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.007183439563959837, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.194581031799316, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.03519470617175102, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.280010223388672, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.03438444435596466, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3890485763549805, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.006133364047855139, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.1641572117805481, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00039407069562003016, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.40087127685547, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0007405579672195017, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.736330032348633, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.029081245884299278, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.618013381958008, 
"gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.02212236076593399, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.690513610839844, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0009631455759517848, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.992246627807617, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.0071508740074932575, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.49371337890625, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.00956418551504612, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.67161750793457, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.03050004318356514, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.826481819152832, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.025240924209356308, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.015068292617798, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.004594374913722277, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19245830178260803, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.00036125193582847714, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.59219741821289, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0007131033344194293, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.369598388671875, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.026935169473290443, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.096328735351562, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.021306097507476807, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.216447830200195, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0007444787188433111, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.474422454833984, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 
0.0047043669037520885, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.841529846191406, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.006039207335561514, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.146206855773926, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.02483394369482994, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.459602355957031, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.02398025430738926, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.8566036224365234, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.0034781296271830797, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16300326585769653, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.00031498042517341673, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.252973556518555, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0010222316486760974, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.824726104736328, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.029246358200907707, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.612455368041992, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.03061879612505436, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.138673782348633, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0022622286342084408, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.565945625305176, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.017217691987752914, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.720237731933594, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.02297990396618843, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.301126480102539, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.05887453258037567, 
"pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.9413480758667, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.01862250827252865, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6752803325653076, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.004864828195422888, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15177635848522186, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0003431830264162272, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.295379638671875, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0006039082072675228, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.36431884765625, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.021491989493370056, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.12837028503418, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.018885577097535133, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.716432571411133, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0007775098783895373, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.292031288146973, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.006435771472752094, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.592394828796387, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.011443408206105232, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.101969718933105, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.019292263314127922, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.175951957702637, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.013291585259139538, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.1689045429229736, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.008386734873056412, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 
0.20405954122543335, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0009920165175572038, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.27320098876953, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0007301043951883912, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.951282501220703, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.020319920033216476, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.28899383544922, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.018865354359149933, "pnorm/_forward_module.model.norm.weight": 31.158222198486328, "gnorm/_forward_module.model.norm.weight": 0.0028471893165260553, "pnorm/_forward_module.lm_head.weight": 230.23097229003906, "gnorm/_forward_module.lm_head.weight": 0.030748317018151283} +{"step": 1677721600, "pnorm/_forward_module.model.embeddings.weight": 141.3314971923828, "gnorm/_forward_module.model.embeddings.weight": 0.04457836225628853, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.750410079956055, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0013749272329732776, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.453429222106934, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.006441161967813969, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.144365310668945, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.007166095543652773, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.764918327331543, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.05552033334970474, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.742239952087402, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.05573180690407753, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.088689088821411, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.004349282942712307, 
"pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3435988128185272, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.00029756189906038344, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.988555908203125, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.000785376934800297, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.99349594116211, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.03101898916065693, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.111061096191406, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.031812816858291626, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.223703384399414, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0009128207457251847, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.24272632598877, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.005706528201699257, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.030427932739258, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.007073861546814442, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.191509246826172, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.03498519957065582, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.276944160461426, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.034492623060941696, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3895680904388428, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.006328893825411797, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16418664157390594, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.000504787138197571, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.402450561523438, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0007585044368170202, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 
27.73763656616211, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.028566794469952583, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.618927001953125, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.02255276031792164, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.691267013549805, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.00100757647305727, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.995866775512695, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.007994938641786575, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.496451377868652, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.011083870194852352, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.669782638549805, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.030517227947711945, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.82458209991455, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.025622094050049782, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.016174554824829, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.004658149555325508, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19240537285804749, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.00031011266401037574, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.593311309814453, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.000734008033759892, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.37053871154785, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.026930343359708786, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.097469329833984, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.021812088787555695, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.218647003173828, 
"gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0007071401923894882, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.480846405029297, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.0044704158790409565, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.84736156463623, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.005795625038444996, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.145715713500977, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.025376591831445694, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.459341049194336, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.024356767535209656, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.8579046726226807, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.003279224969446659, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.1630816012620926, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0002195754204876721, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.25159454345703, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0015127718215808272, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.8217716217041, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.03226598724722862, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.611326217651367, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.033500321209430695, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.141538619995117, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0032572660129517317, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.571484565734863, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.023481866344809532, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.72481918334961, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 
0.032515477389097214, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.30329704284668, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.06233000010251999, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.945817947387695, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.01930958963930607, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.675361156463623, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.004867882933467627, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15172478556632996, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.00037178758066147566, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.29483413696289, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0006033832323737442, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.363096237182617, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.0221747774630785, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.128450393676758, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.019841747358441353, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.72258186340332, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.00096935557667166, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.296998977661133, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.008550758473575115, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.596135139465332, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.014956512488424778, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.109275817871094, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.01983051560819149, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.186471939086914, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.013492926955223083, 
"pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.170198440551758, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.01723027601838112, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.20416401326656342, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.002247962635010481, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.275026321411133, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0008263947675004601, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.95429801940918, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.021269390359520912, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.291799545288086, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.01933368295431137, "pnorm/_forward_module.model.norm.weight": 31.179662704467773, "gnorm/_forward_module.model.norm.weight": 0.0026700040325522423, "pnorm/_forward_module.lm_head.weight": 230.30889892578125, "gnorm/_forward_module.lm_head.weight": 0.028524892404675484} +{"step": 1698693120, "pnorm/_forward_module.model.embeddings.weight": 141.32713317871094, "gnorm/_forward_module.model.embeddings.weight": 0.04646812379360199, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.748613357543945, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0012708855792880058, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.459004402160645, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.006520767230540514, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.149462699890137, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.007195473648607731, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.760513305664062, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.0557873360812664, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.737835884094238, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 
0.05659421160817146, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.088768243789673, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.004496078472584486, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.34414175152778625, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0002931281633209437, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.988605499267578, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0007938371854834259, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.993017196655273, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.0319770947098732, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.110374450683594, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.0311867818236351, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.222606658935547, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0009318734519183636, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.244863510131836, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.0059065246023237705, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.032025337219238, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.006881259847432375, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.188813209533691, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.03593457117676735, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.274312019348145, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.03453003242611885, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.389810800552368, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.0053470442071557045, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16418497264385223, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00025779896532185376, 
"pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.40403938293457, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0007064284291118383, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.738962173461914, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.028566382825374603, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.619709014892578, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.02226276323199272, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.69227409362793, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0008894866914488375, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 12.999446868896484, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.008246667683124542, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.49921703338623, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.010065573267638683, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.668205261230469, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.03021909110248089, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.82294750213623, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.025197885930538177, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.017127513885498, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.006021092180162668, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19240275025367737, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0004248698242008686, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.594213485717773, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0007393914856947958, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.371129989624023, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.027172435075044632, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 
20.09836196899414, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.0211710873991251, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.22066307067871, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0007337935385294259, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.48641300201416, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.0044356151483953, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.852331161499023, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.005689945537596941, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.145352363586426, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.02573395147919655, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.459196090698242, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.02443138137459755, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.8589794635772705, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.0029780438635498285, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16310212016105652, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.00014836432819720358, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.250167846679688, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0014407476410269737, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.818742752075195, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.03152673318982124, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.61000633239746, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.03071190044283867, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.14417266845703, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0025847710203379393, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.576818466186523, 
"gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.02043353207409382, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.729241371154785, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.02806456759572029, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.305159568786621, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.058703966438770294, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.949634552001953, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.019056010991334915, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.675274133682251, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.003863073419779539, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.1516776829957962, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.000314348260872066, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.29477882385254, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0006044832989573479, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.362510681152344, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.021860146895051003, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.12875747680664, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.018365688621997833, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.728137969970703, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0007394634885713458, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.301671028137207, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.005165171343833208, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.599649429321289, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.008052808232605457, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.115818977355957, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 
0.019191857427358627, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.195630073547363, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.013294970616698265, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.171499252319336, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.005738761741667986, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.2043353170156479, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0003008887288160622, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.276885986328125, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0008119405247271061, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.95726203918457, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.02144782431423664, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.294404983520508, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.01995152235031128, "pnorm/_forward_module.model.norm.weight": 31.199254989624023, "gnorm/_forward_module.model.norm.weight": 0.0031494402792304754, "pnorm/_forward_module.lm_head.weight": 230.37979125976562, "gnorm/_forward_module.lm_head.weight": 0.030216289684176445} +{"step": 1719664640, "pnorm/_forward_module.model.embeddings.weight": 141.3224639892578, "gnorm/_forward_module.model.embeddings.weight": 0.044098444283008575, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.747051239013672, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0012050755321979523, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.46363353729248, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.006373214069753885, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.153785705566406, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.007084324024617672, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.756646156311035, 
"gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.05301236733794212, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.734058380126953, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.05417168140411377, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.08933687210083, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.004998547490686178, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.34481239318847656, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0001902189542306587, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.98849105834961, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0007446189993061125, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.992311477661133, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.030483612790703773, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.109601974487305, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.02915780059993267, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.221437454223633, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0008856907952576876, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.246373176574707, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.005490125622600317, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.033134460449219, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.00665226299315691, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.186223983764648, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.03365691006183624, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.271739959716797, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.03310658410191536, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3902347087860107, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 
0.005848998203873634, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16427794098854065, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0003572382847778499, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.405744552612305, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.00073169672396034, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.7403564453125, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.027907395735383034, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.62066650390625, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.021249011158943176, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.692989349365234, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0008750006672926247, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 13.002433776855469, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.006893828045576811, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.501418113708496, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.009311062283813953, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.666671752929688, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.028285710141062737, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.82129192352295, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.024083560332655907, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.017808437347412, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.004369791597127914, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.1924019753932953, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.00036636408185586333, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.59502601623535, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0007044600788503885, 
"pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.371519088745117, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.02605457417666912, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.09892463684082, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.020455820485949516, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.222238540649414, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.000696919800247997, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.49091625213623, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.00445895828306675, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.856532096862793, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.005739685148000717, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.144901275634766, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.024114608764648438, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.45893383026123, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.022920668125152588, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.8596014976501465, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.00343241891823709, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16310960054397583, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0003306750732008368, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.24860191345215, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0014910208992660046, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.81548309326172, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.0316833071410656, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.608409881591797, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.029338061809539795, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 
24.146799087524414, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0025683606509119272, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.58143138885498, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.02432643249630928, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.733238220214844, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.03307809680700302, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.306782722473145, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.05167650058865547, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.953105926513672, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.017786890268325806, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.675560712814331, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.0036644453648477793, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.151668980717659, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0002986992767546326, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.294414520263672, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0005617919377982616, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.361539840698242, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.020790787413716316, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.12881851196289, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.018145909532904625, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.733657836914062, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0007155701168812811, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.305914878845215, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.006153097841888666, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.60292911529541, 
"gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.01006747130304575, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.122344970703125, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.01790950819849968, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.204632759094238, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.012694740667939186, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.172700881958008, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.006039721891283989, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.20442448556423187, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0006556878797709942, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.278366088867188, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0008072779164649546, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.959609985351562, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.021046575158834457, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.29648780822754, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.019490491598844528, "pnorm/_forward_module.model.norm.weight": 31.217206954956055, "gnorm/_forward_module.model.norm.weight": 0.0028552778530865908, "pnorm/_forward_module.lm_head.weight": 230.44320678710938, "gnorm/_forward_module.lm_head.weight": 0.027377966791391373} +{"step": 1740636160, "pnorm/_forward_module.model.embeddings.weight": 141.31765747070312, "gnorm/_forward_module.model.embeddings.weight": 0.04374431073665619, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.74544334411621, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0013479077024385333, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.46805477142334, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.005862601101398468, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 
15.157923698425293, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.006532947067171335, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.752814292907715, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.05464436486363411, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.730278968811035, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.05402659252285957, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0894007682800293, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.004084280226379633, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3452194631099701, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0003912253596354276, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.98843765258789, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0007420337060466409, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.991670608520508, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.030670249834656715, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.108970642089844, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.02975250594317913, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.220565795898438, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0008868594304658473, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.248115539550781, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.005454434081912041, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.03450870513916, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.006727287080138922, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.184011459350586, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.03511591628193855, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.269489288330078, 
"gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.03310489282011986, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3905839920043945, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.00554592814296484, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16425105929374695, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00040611528675071895, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.40719223022461, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0007507610716857016, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.741426467895508, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.02812054753303528, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.621349334716797, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.02127191238105297, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.69375228881836, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0010065066162496805, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 13.005029678344727, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.007367710582911968, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.503368377685547, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.010648677125573158, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.665535926818848, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.029349099844694138, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.820107460021973, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.024054497480392456, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.0178894996643066, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.004566058050841093, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19230496883392334, 
"gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.000343720632372424, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.59613609313965, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0007002403144724667, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.37226676940918, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.02574189007282257, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.099742889404297, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.02016708068549633, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.223827362060547, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.000676943629514426, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.495512962341309, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.004287391435354948, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.860774040222168, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.005470627918839455, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.14437198638916, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.024206699803471565, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.45868968963623, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.02324538119137287, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.860445737838745, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.003133704187348485, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16309083998203278, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0003186201793141663, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.247282028198242, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.001129628042690456, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.81267547607422, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 
0.028515703976154327, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.60700225830078, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.028810465708374977, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.149188995361328, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0022156876511871815, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.585615158081055, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.017374495044350624, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.736748695373535, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.021080663427710533, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.308297157287598, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.0555841401219368, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.956136703491211, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.018375184386968613, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6758663654327393, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.0045777298510074615, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.1516610085964203, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.00037659023655578494, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.294111251831055, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0005856971838511527, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.360551834106445, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.02170778624713421, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.12883186340332, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.019352290779352188, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.738540649414062, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0007952895830385387, 
"pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.310104370117188, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.008078871294856071, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.606133460998535, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.014348835684359074, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.127949714660645, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.01940801925957203, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.212409973144531, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.013155367225408554, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.173522710800171, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.013458347879350185, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.20451031625270844, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0018207915127277374, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.279876708984375, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0007451485143974423, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.961753845214844, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.020600268617272377, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.298538208007812, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.018785132095217705, "pnorm/_forward_module.model.norm.weight": 31.233287811279297, "gnorm/_forward_module.model.norm.weight": 0.002860505599528551, "pnorm/_forward_module.lm_head.weight": 230.4984130859375, "gnorm/_forward_module.lm_head.weight": 0.02886488474905491} +{"step": 1761607680, "pnorm/_forward_module.model.embeddings.weight": 141.3128662109375, "gnorm/_forward_module.model.embeddings.weight": 0.04242623597383499, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.74394989013672, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 
0.001239114673808217, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.47174072265625, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.005935885943472385, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.16130256652832, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.006417996250092983, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.749356269836426, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.050880927592515945, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.726871490478516, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.052323974668979645, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0895023345947266, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.003955993801355362, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.34565597772598267, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.00024125447089318186, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.9882869720459, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0007192182238213718, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.99089241027832, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.028875108808279037, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.1082763671875, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.026964064687490463, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.219635009765625, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0007898774347268045, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.249699592590332, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.005372361745685339, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.035743713378906, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.00637369928881526, 
"pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.18175220489502, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.032032690942287445, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.267245292663574, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.03125397861003876, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.390927791595459, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.005169576033949852, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16434097290039062, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0002584047324489802, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.408485412597656, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.00069802301004529, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.74228286743164, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.02700229361653328, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.62183380126953, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.020014960318803787, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.69439697265625, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0008538188994862139, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 13.007386207580566, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.006125689018517733, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.50515365600586, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.008358290418982506, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.664336204528809, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.02842821180820465, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.818830490112305, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.022983407601714134, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 
3.018373966217041, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.005251115653663874, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19233326613903046, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.00039881363045424223, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.59699821472168, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0006587339448742568, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.372703552246094, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.024479229003190994, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.100383758544922, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.01867910660803318, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.225322723388672, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.000741486088372767, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.499319076538086, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.004223366267979145, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.864100456237793, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.005441361106932163, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.144010543823242, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.02334924228489399, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.458531379699707, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.02196965366601944, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.8614375591278076, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.003326319856569171, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16316543519496918, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0003399289562366903, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.246122360229492, 
"gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0013873933348804712, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.81016731262207, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.029685668647289276, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.605783462524414, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.024066831916570663, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.151409149169922, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0023149061016738415, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.589319229125977, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.02264171838760376, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.739851951599121, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.030083751305937767, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.309725761413574, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.03906789422035217, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.958964347839355, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.01659349910914898, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.676025390625, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.00397244468331337, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.1516624093055725, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0002968880580738187, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.29405403137207, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0005444154376164079, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.35979652404785, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.019778765738010406, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.128908157348633, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 
0.016906000673770905, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.742839813232422, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0007065861136652529, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.3133544921875, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.006025639362633228, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.608555793762207, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.009512268006801605, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.133191108703613, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.01563129760324955, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.219612121582031, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.011841128580272198, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.174015522003174, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.007425522431731224, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.20457735657691956, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0008805630495771766, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.28134536743164, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0006317324587143958, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.963899612426758, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.018571754917502403, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.30035972595215, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.016866568475961685, "pnorm/_forward_module.model.norm.weight": 31.247705459594727, "gnorm/_forward_module.model.norm.weight": 0.0028041834011673927, "pnorm/_forward_module.lm_head.weight": 230.54808044433594, "gnorm/_forward_module.lm_head.weight": 0.024733761325478554} +{"step": 1782579200, "pnorm/_forward_module.model.embeddings.weight": 141.30825805664062, 
"gnorm/_forward_module.model.embeddings.weight": 0.04376252368092537, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.742685317993164, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0012898995773866773, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.475171089172363, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.006154044531285763, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.164497375488281, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.00683760829269886, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.746288299560547, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.05268286541104317, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.723827362060547, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.05299519747495651, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0894336700439453, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.004461096134036779, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3459966778755188, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.00040403963066637516, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.988378524780273, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0007294489769265056, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.990398406982422, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.03046494722366333, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.107681274414062, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.029264362528920174, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.21891212463379, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0008650976233184338, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.2510986328125, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.005510939750820398, 
"pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.036839485168457, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.006578164640814066, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.179903030395508, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.03303442522883415, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.265384674072266, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.0322522334754467, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3908510208129883, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.004886684473603964, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16423611342906952, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00032275827834382653, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.409650802612305, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.00073054718086496, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.743005752563477, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.02738231234252453, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.622270584106445, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.020985959097743034, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.694520950317383, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0007787040085531771, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 13.00904369354248, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.007099831011146307, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.50649642944336, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.008473278023302555, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.662870407104492, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.028002368286252022, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 
10.81733512878418, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.024197222664952278, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.01823353767395, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.0048216478899121284, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19219885766506195, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0004495200701057911, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.597875595092773, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.000701993121765554, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.37311553955078, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.025777896866202354, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.100955963134766, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.0201650932431221, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.226829528808594, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0006988884997554123, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.503414154052734, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.0042933207005262375, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.867803573608398, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.005476228892803192, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.143648147583008, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.023684924468398094, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.458301544189453, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.02343611977994442, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.8619906902313232, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.00287159183062613, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.1631055772304535, 
"gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.00021930279035586864, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.24517059326172, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0013145327102392912, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.808008193969727, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.030892981216311455, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.604907989501953, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.028328049927949905, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.153366088867188, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0021365832071751356, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.59267807006836, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.01854988932609558, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.742753028869629, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.023898480460047722, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.310995101928711, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.05037117749452591, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.96133804321289, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.017826072871685028, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6760201454162598, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.004757072776556015, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.151624396443367, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0002623242326080799, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.293542861938477, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0005538974655792117, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.35841178894043, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 
0.020045241340994835, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.128637313842773, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.017198530957102776, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.746788024902344, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.000712825043592602, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.31606388092041, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.004861199297010899, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.61056137084961, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.006968318950384855, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.13782787322998, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.01784580387175083, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.225886344909668, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.012429581955075264, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.1749846935272217, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.004118995275348425, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.2047228366136551, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0002863032859750092, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.282434463500977, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0006591529818251729, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.96518898010254, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.019008895382285118, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.301733016967773, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.017291050404310226, "pnorm/_forward_module.model.norm.weight": 31.260488510131836, "gnorm/_forward_module.model.norm.weight": 0.0027645945083349943, "pnorm/_forward_module.lm_head.weight": 230.59144592285156, 
"gnorm/_forward_module.lm_head.weight": 0.026357533410191536} +{"step": 1803550720, "pnorm/_forward_module.model.embeddings.weight": 141.3038330078125, "gnorm/_forward_module.model.embeddings.weight": 0.0414767861366272, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.741273880004883, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.001209333073347807, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.47747802734375, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.005521667655557394, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.166640281677246, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.006047818344086409, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.74338436126709, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.05135146528482437, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.720975875854492, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.052284158766269684, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0898168087005615, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.0038229150231927633, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.34645915031433105, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.00041592129855416715, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.988435745239258, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0007449144031852484, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.989892959594727, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.02912786602973938, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.107118606567383, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.02923515997827053, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.2181339263916, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0007790939998812973, 
"pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.25199031829834, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.005341562442481518, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.037497520446777, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.0063261790201067924, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.178116798400879, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.032515715807676315, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.26358699798584, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.03166607394814491, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.391305446624756, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.005807955749332905, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16431716084480286, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0004574889608193189, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.410547256469727, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0007178762461990118, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.743457794189453, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.027170995250344276, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.622526168823242, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.020978420972824097, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.695068359375, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.000827554555144161, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 13.010787010192871, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.00621257396414876, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.507866859436035, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.008370630443096161, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 
10.662008285522461, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.028891421854496002, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.816421508789062, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.022995654493570328, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.0182981491088867, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.004362782929092646, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19215358793735504, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0002743955119512975, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.598468780517578, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0007299768622033298, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.373193740844727, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.025586159899830818, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.101261138916016, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.02031632326543331, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.228086471557617, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.00069015211192891, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.506431579589844, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.004302767105400562, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.870383262634277, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.005728777032345533, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.14344310760498, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.023542020469903946, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.458202362060547, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.02252981811761856, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.86248779296875, 
"gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.004104613326489925, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16313791275024414, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.00043242922401987016, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.244272232055664, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.002046521520242095, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.805944442749023, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.03385389596223831, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.60397720336914, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.032552123069763184, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.155214309692383, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.003344587981700897, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.59568977355957, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.029748469591140747, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.7452974319458, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.04264324530959129, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.31204605102539, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.05294265225529671, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.963422775268555, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.017350468784570694, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.676112651824951, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.004702881909906864, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15162453055381775, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0005959529662504792, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.29327964782715, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 
0.000562163710128516, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.357379913330078, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.020424989983439445, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.128437042236328, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.017379822209477425, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.750246047973633, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0006959867314435542, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.318262100219727, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.006178280338644981, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.612203598022461, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.009032190777361393, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.142003059387207, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.017475394532084465, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.231471061706543, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.012112557888031006, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.1755242347717285, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.011636406183242798, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.20474447309970856, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0010570964077487588, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.283708572387695, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0008194710244424641, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.966854095458984, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.019648918882012367, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.303136825561523, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.01824212074279785, 
"pnorm/_forward_module.model.norm.weight": 31.27166748046875, "gnorm/_forward_module.model.norm.weight": 0.0029304521158337593, "pnorm/_forward_module.lm_head.weight": 230.62892150878906, "gnorm/_forward_module.lm_head.weight": 0.02676154486835003} +{"step": 1824522240, "pnorm/_forward_module.model.embeddings.weight": 141.2996826171875, "gnorm/_forward_module.model.embeddings.weight": 0.03983566537499428, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.740142822265625, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.001100412686355412, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.479997634887695, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.005553606431931257, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.169002532958984, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.00610389607027173, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.740812301635742, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.04693936929106712, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.718417167663574, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.04896404966711998, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.089797258377075, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.003791254013776779, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3467184007167816, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0003196002508047968, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.98870849609375, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0006725366110913455, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.9896240234375, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.027348315343260765, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.106740951538086, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 
0.02541610784828663, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.217464447021484, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0007439829641953111, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.25283432006836, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.004918637219816446, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.038154602050781, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.005676165223121643, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.176529884338379, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.029481803998351097, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.261994361877441, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.029859626665711403, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3915863037109375, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.00465731043368578, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16434034705162048, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00025714622461237013, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.41158103942871, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0006501044845208526, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.744081497192383, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.025628097355365753, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.622915267944336, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.019386712461709976, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.695505142211914, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0008713708375580609, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 13.012391090393066, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.007118835113942623, 
"pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.509132385253906, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.010006715543568134, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.661165237426758, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.025744199752807617, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.815516471862793, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.022235898301005363, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.0186009407043457, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.004147120285779238, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19214430451393127, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.000302297092275694, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.599018096923828, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0006662093219347298, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.373287200927734, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.024263620376586914, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.10149383544922, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.018454954028129578, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.22920799255371, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0006150074768811464, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.509422302246094, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.004191779065877199, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.873003005981445, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.005198404658585787, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.143115043640137, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.02076118439435959, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 
11.457987785339355, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.0214578527957201, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.86264967918396, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.003476841142401099, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16308249533176422, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.00029747968073934317, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.243576049804688, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0016574787441641092, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.80425453186035, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.03159962221980095, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.603221893310547, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.025302501395344734, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.15671157836914, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0026982370764017105, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.59824275970459, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.027324222028255463, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.747446060180664, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.03339790180325508, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.312973022460938, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.03763935714960098, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.96514892578125, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.016039595007896423, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6761817932128906, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.005279638338834047, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15164099633693695, 
"gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.000579073210246861, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.29310417175293, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0005118696135468781, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.356529235839844, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.019656851887702942, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.12828826904297, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.017070874571800232, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.753463745117188, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0006981016485951841, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.320535659790039, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.005933473352342844, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.613935470581055, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.01071224082261324, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.145620346069336, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.015939878299832344, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.236308097839355, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.011309727095067501, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.1758480072021484, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.007823674939572811, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.20476634800434113, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0010428635869175196, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.284658432006836, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0006302759284153581, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.967975616455078, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 
0.018343886360526085, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.30419921875, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.01688198558986187, "pnorm/_forward_module.model.norm.weight": 31.281511306762695, "gnorm/_forward_module.model.norm.weight": 0.003244763473048806, "pnorm/_forward_module.lm_head.weight": 230.66163635253906, "gnorm/_forward_module.lm_head.weight": 0.024462036788463593} +{"step": 1845493760, "pnorm/_forward_module.model.embeddings.weight": 141.29591369628906, "gnorm/_forward_module.model.embeddings.weight": 0.04256848618388176, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.739164352416992, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0012569489190354943, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.482110977172852, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.005833090748637915, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.17095947265625, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.006384486798197031, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.738602638244629, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.0498042032122612, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.716216087341309, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.051699042320251465, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0897762775421143, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.00398486852645874, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.346973180770874, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.000329759088344872, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.988679885864258, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0007758596329949796, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.989097595214844, 
"gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.029185757040977478, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.10626792907715, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.02868049591779709, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.216999053955078, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0008192642708308995, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.25362777709961, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.005452392622828484, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.038776397705078, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.006461069453507662, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.175283432006836, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.030755365267395973, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.26074504852295, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.030430717393755913, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3916091918945312, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.0062323445454239845, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16433653235435486, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0003037193964701146, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.412553787231445, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0007618989911861718, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.744661331176758, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.027392804622650146, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.62322998046875, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.02108249068260193, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.695934295654297, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 
0.0013919677585363388, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 13.013832092285156, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.011270029470324516, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.51021957397461, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.01747334562242031, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.66041374206543, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.029461365193128586, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.814711570739746, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.021980686113238335, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.018744468688965, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.0041197980754077435, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19211970269680023, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0003610026615206152, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.59978485107422, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0006461621378548443, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.37360954284668, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.02411271259188652, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.101869583129883, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.018238037824630737, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.23008155822754, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0006399019621312618, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.51136302947998, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.004264742136001587, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.874751091003418, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.005513256415724754, 
"pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.142865180969238, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.020797323435544968, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.457807540893555, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.02090618759393692, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.863379955291748, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.0032838196493685246, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.1631428599357605, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0003411423822399229, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.242813110351562, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.001079728128388524, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.802478790283203, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.027443913742899895, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.60231590270996, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.02244594134390354, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.158018112182617, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0018748992588371038, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.600379943847656, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.016183866187930107, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.749288558959961, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.020664149895310402, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.313770294189453, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.03889453783631325, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.966736793518066, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.015846993774175644, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 
2.676079273223877, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.003405660390853882, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15160751342773438, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0002514254301786423, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.292890548706055, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0005343262455426157, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.355710983276367, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.019120758399367332, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.128087997436523, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.016291480511426926, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.756385803222656, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0005700993933714926, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.32247543334961, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.00509980320930481, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.615431785583496, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.007201238069683313, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.148877143859863, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.014445850625634193, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.24058723449707, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.011308695189654827, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.1764941215515137, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.005021743942052126, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.20482541620731354, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0003753769560717046, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.285297393798828, 
"gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0005924890865571797, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.968677520751953, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.01789184845983982, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.304946899414062, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.016436470672488213, "pnorm/_forward_module.model.norm.weight": 31.289995193481445, "gnorm/_forward_module.model.norm.weight": 0.0026825186796486378, "pnorm/_forward_module.lm_head.weight": 230.68954467773438, "gnorm/_forward_module.lm_head.weight": 0.024597734212875366} +{"step": 1866465280, "pnorm/_forward_module.model.embeddings.weight": 141.29249572753906, "gnorm/_forward_module.model.embeddings.weight": 0.04143253341317177, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.73828125, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0011583642335608602, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.48376750946045, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.005803941283375025, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.172517776489258, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.0064353663474321365, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.736664772033691, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.04923977330327034, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.714289665222168, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.05024658143520355, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.08979868888855, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.004187328740954399, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3471689224243164, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0003325626312289387, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 
21.98887825012207, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0007046294049359858, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.988815307617188, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.028525685891509056, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.10591697692871, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.02677803859114647, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.216543197631836, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0007926668040454388, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.254283905029297, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.005194379482418299, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.039275169372559, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.006216716021299362, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.17410945892334, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.03069375827908516, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.259580612182617, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.029683172702789307, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.391710042953491, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.0051818182691931725, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16435779631137848, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0003182909858878702, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.413387298583984, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0006940275197848678, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.745153427124023, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.02619919367134571, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.62350845336914, 
"gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.01955167017877102, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.69618797302246, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0007661737618036568, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 13.014897346496582, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.00594059843569994, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.511032104492188, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.007651661057025194, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.659655570983887, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.025954559445381165, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.813899040222168, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.021725405007600784, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.018852949142456, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.004460211843252182, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19212377071380615, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.000406297214794904, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.600383758544922, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0007003470091149211, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.373781204223633, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.023987844586372375, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.102121353149414, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.018457001075148582, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.230846405029297, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0006039486033841968, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.5131254196167, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 
0.004225557669997215, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.87633991241455, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.005305594764649868, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.142632484436035, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.022110039368271828, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.457642555236816, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.021136952564120293, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.863755226135254, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.003206930821761489, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16314299404621124, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.00018158108287025243, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.24213409423828, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0015581885818392038, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.800901412963867, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.02986026741564274, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.60154914855957, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.025984356179833412, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.159168243408203, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0024949251674115658, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.602022171020508, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.024622179567813873, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.750696182250977, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.033959612250328064, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.31443977355957, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.04188300669193268, 
"pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.968058586120605, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.016020121052861214, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6758313179016113, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.0033554525580257177, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15156374871730804, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.00019093253649771214, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.292654037475586, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0004909314448013902, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.35492706298828, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.01900440640747547, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.12788963317871, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.016400251537561417, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.758869171142578, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0006179916090331972, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.32396125793457, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.00536680594086647, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.616583824157715, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.009223243221640587, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.151694297790527, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.015306190587580204, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.24427318572998, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.011264491826295853, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.17695689201355, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.007859851233661175, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 
0.2048693150281906, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0009771387558430433, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.286069869995117, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0006112185074016452, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.969511032104492, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.017907897010445595, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.30577850341797, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.01636006310582161, "pnorm/_forward_module.model.norm.weight": 31.297231674194336, "gnorm/_forward_module.model.norm.weight": 0.0025962721556425095, "pnorm/_forward_module.lm_head.weight": 230.71287536621094, "gnorm/_forward_module.lm_head.weight": 0.024461310356855392} +{"step": 1887436800, "pnorm/_forward_module.model.embeddings.weight": 141.2894744873047, "gnorm/_forward_module.model.embeddings.weight": 0.038410086184740067, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.737607955932617, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0011010556481778622, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.485292434692383, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.005399423651397228, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.173933029174805, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.005866445135325193, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.735048294067383, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.04606407880783081, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.712690353393555, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.04670298099517822, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.089796543121338, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.0035340816248208284, 
"pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.34733110666275024, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0001416487357346341, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.988910675048828, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0006837777909822762, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.988420486450195, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.0268191359937191, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.105558395385742, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.023748766630887985, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.21605110168457, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0006628381670452654, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.254873275756836, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.004872964695096016, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.039706230163574, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.005515686701983213, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.17297649383545, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.02758113667368889, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.258466720581055, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.02822374925017357, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3916966915130615, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.00439113425090909, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16435880959033966, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0003131573321297765, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.41415786743164, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0006192057044245303, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 
27.745594024658203, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.024236712604761124, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.623741149902344, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.018130887299776077, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.696517944335938, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0006528875092044473, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 13.015909194946289, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.005523327738046646, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.51181411743164, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.006866606418043375, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.659058570861816, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.02358577772974968, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.813279151916504, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.02105342596769333, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.018808126449585, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.0038811315316706896, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19207394123077393, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0002908289898186922, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.60086441040039, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0005657877190969884, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.373884201049805, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.022515997290611267, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.102298736572266, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.0172084029763937, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.231571197509766, 
"gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0005581005825661123, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.514734268188477, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.003920732066035271, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.877795219421387, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.004956468939781189, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.142410278320312, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.019370459020137787, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.457494735717773, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.020080238580703735, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.8643200397491455, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.00269776931963861, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16318635642528534, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0002130301872966811, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.24159049987793, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0009428830235265195, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.79960823059082, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.025329465046525, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.600929260253906, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.02013438008725643, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.160198211669922, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.001545641804113984, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.603516578674316, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.01484636589884758, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.751952171325684, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 
0.018429063260555267, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.315053939819336, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.03303760290145874, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.969220161437988, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.015426358208060265, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6757097244262695, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.004543732386082411, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15154647827148438, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0005338139017112553, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.292417526245117, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.00047669338528066874, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.354211807250977, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.018476102501153946, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.127685546875, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.015696680173277855, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.760955810546875, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0005286791711114347, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.325220108032227, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.004479008261114359, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.61754322052002, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.006850536447018385, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.154020309448242, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.014215657487511635, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.247294425964355, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.010903509333729744, 
"pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.177316904067993, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.005301160272210836, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.2049180567264557, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0006440123543143272, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.286828994750977, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0005753975710831583, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.970354080200195, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.01720154844224453, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.306499481201172, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.015896346420049667, "pnorm/_forward_module.model.norm.weight": 31.303287506103516, "gnorm/_forward_module.model.norm.weight": 0.0029082116670906544, "pnorm/_forward_module.lm_head.weight": 230.7323455810547, "gnorm/_forward_module.lm_head.weight": 0.024354036897420883} +{"step": 1908408320, "pnorm/_forward_module.model.embeddings.weight": 141.28689575195312, "gnorm/_forward_module.model.embeddings.weight": 0.03953370824456215, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.736919403076172, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0010687484173104167, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.486412048339844, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.005557571072131395, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.174970626831055, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.006145953666418791, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.733595848083496, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.046261388808488846, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.71125602722168, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 
0.047859352082014084, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0898783206939697, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.003532474162057042, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.34751713275909424, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.00020237077842466533, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.98918914794922, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0006750530446879566, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.988357543945312, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.02735157310962677, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.105392456054688, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.025260422378778458, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.215755462646484, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0007280920981429517, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.255487442016602, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.005033534485846758, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.040183067321777, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.005853482987731695, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.172103881835938, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.028949473053216934, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.257623672485352, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.028736654669046402, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3917250633239746, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.005348841194063425, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.164349764585495, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0005423775292001665, 
"pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.41475486755371, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0006507952348329127, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.745882034301758, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.024886304512619972, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.623926162719727, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.019078295677900314, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.696744918823242, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0006928078946657479, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 13.016664505004883, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.005864372942596674, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.512378692626953, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.007441548630595207, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.658556938171387, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.025273337960243225, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.812748908996582, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.021473677828907967, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.018864154815674, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.003971713595092297, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19201518595218658, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0002042171108769253, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.601215362548828, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0006280054803937674, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.3738956451416, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.02355886809527874, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 
20.102399826049805, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.01826070249080658, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.232011795043945, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0005944207077845931, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.515812873840332, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.004173987545073032, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.878778457641602, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.005175002850592136, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.14216136932373, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.02065034955739975, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.45731258392334, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.020719099789857864, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.8645925521850586, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.002876298502087593, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.1632053554058075, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.00019709061598405242, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.24113655090332, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.001564873498864472, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.798513412475586, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.02952931821346283, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.600412368774414, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.024951167404651642, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.161096572875977, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.002608047565445304, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.604578971862793, 
"gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.023041626438498497, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.752826690673828, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.03262857347726822, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.315567016601562, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.039842505007982254, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.970245361328125, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.01586170867085457, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6755521297454834, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.0037819352000951767, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.1514965146780014, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.00025390394148416817, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.292238235473633, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0005727115203626454, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.353572845458984, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.018960289657115936, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.127498626708984, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.01617366075515747, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.762779235839844, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0006487798527814448, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.32638168334961, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.005244255065917969, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.61844253540039, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.007965208031237125, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.156042098999023, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 
0.014728873036801815, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.249842643737793, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.01116686686873436, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.177598714828491, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.005455356556922197, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.2049495428800583, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.0005766593967564404, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.28728485107422, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0006553585990332067, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.970773696899414, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.01754908263683319, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.30698013305664, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.015997515991330147, "pnorm/_forward_module.model.norm.weight": 31.308271408081055, "gnorm/_forward_module.model.norm.weight": 0.002568097785115242, "pnorm/_forward_module.lm_head.weight": 230.74826049804688, "gnorm/_forward_module.lm_head.weight": 0.023320289328694344} +{"step": 1929379840, "pnorm/_forward_module.model.embeddings.weight": 141.28472900390625, "gnorm/_forward_module.model.embeddings.weight": 0.037811875343322754, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.736412048339844, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0010695364326238632, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.487442016601562, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.005584117956459522, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.175923347473145, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.006035377737134695, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.732439041137695, 
"gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.043960414826869965, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.710112571716309, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.04526115953922272, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.089857816696167, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.0034811983350664377, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.34762677550315857, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.00034065634827129543, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.989423751831055, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.000641494058072567, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.9882869720459, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.02616344392299652, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.105236053466797, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.024222970008850098, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.215423583984375, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0007281634025275707, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.255863189697266, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.0048576341941952705, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.040472984313965, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.005774990655481815, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.171311378479004, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.02652198076248169, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.256855964660645, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.027268853038549423, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3918092250823975, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 
0.004387391731142998, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16434498131275177, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00016440726176369935, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.415210723876953, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0005874583730474114, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.746061325073242, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.023594066500663757, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.624046325683594, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.018327362835407257, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.696901321411133, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.000649038702249527, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 13.017169952392578, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.005972040351480246, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.512776374816895, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.007127071265131235, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.658122062683105, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.021895375102758408, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.812298774719238, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.020614417269825935, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.0190632343292236, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.0037377369590103626, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19200140237808228, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.00025559987989254296, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.601449966430664, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0005497109959833324, 
"pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.373836517333984, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.022584570571780205, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.102441787719727, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.017512718215584755, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.23256492614746, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0005541777354665101, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.517050743103027, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.0041975416243076324, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.879876136779785, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.005072102416306734, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.142014503479004, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.019333001226186752, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.457207679748535, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.02001214772462845, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.8648784160614014, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.003437439911067486, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16320912539958954, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0003130416735075414, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.240787506103516, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0012160739861428738, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.79766845703125, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.02752852626144886, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.600019454956055, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.022973135113716125, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 
24.161802291870117, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0021456927061080933, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.605635643005371, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.019093429669737816, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.753713607788086, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.024967225268483162, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.31593132019043, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.0371691957116127, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.970969200134277, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.015183072537183762, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6753432750701904, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.0033712005242705345, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15146468579769135, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.00035670027136802673, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.29214096069336, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0005273482529446483, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.353111267089844, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.018417170271277428, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.127351760864258, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.015371647663414478, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.764217376708984, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0005956166423857212, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.32723617553711, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.004296624101698399, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.619091987609863, 
"gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.006064938846975565, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.157685279846191, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.013773739337921143, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.251908302307129, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.010714484378695488, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.1777138710021973, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.0029773779679089785, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.2049621194601059, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.000202669674763456, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.28781509399414, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0005478968378156424, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.97138786315918, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.016816968098282814, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.307498931884766, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.015443393960595131, "pnorm/_forward_module.model.norm.weight": 31.3122501373291, "gnorm/_forward_module.model.norm.weight": 0.002716219983994961, "pnorm/_forward_module.lm_head.weight": 230.76097106933594, "gnorm/_forward_module.lm_head.weight": 0.022944768890738487} +{"step": 1950351360, "pnorm/_forward_module.model.embeddings.weight": 141.28298950195312, "gnorm/_forward_module.model.embeddings.weight": 0.03689458593726158, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.735950469970703, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0010095755569636822, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.488175392150879, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.005546994041651487, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 
15.176591873168945, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.006071293260902166, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.731477737426758, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.04286693409085274, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.709162712097168, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.04412868618965149, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.08988881111145, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.0038663099985569715, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.347726970911026, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.00024316352210007608, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.989521026611328, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0006276473286561668, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.988122940063477, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.025196384638547897, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.10506248474121, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.023078493773937225, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.215198516845703, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0006384507869370282, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.256060600280762, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.0046850242651999, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.04061222076416, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.00541264284402132, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.170764923095703, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.025776326656341553, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.256327629089355, 
"gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.026669232174754143, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3919599056243896, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.004272155463695526, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16438321769237518, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0002818430948536843, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.415552139282227, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0005980193964205682, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.74616813659668, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.023367585614323616, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.624099731445312, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.017882652580738068, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.69710922241211, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0006666852859780192, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 13.017690658569336, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.006162821315228939, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.51318645477295, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.007843746803700924, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.657795906066895, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.021849652752280235, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.81196403503418, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.019917141646146774, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.019361972808838, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.0032481651287525892, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19201330840587616, 
"gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.00016115013568196446, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.601716995239258, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0005583156598731875, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.37386703491211, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.02190527319908142, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.1025333404541, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.0167732834815979, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.232892990112305, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0005361203802749515, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.517765998840332, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.0038794102147221565, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.880512237548828, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.004696629010140896, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.141852378845215, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.018421823158860207, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.457071304321289, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.019099151715636253, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.8651885986328125, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.0028199946973472834, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16323734819889069, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0002422324614599347, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.24048614501953, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0014422446256503463, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.796964645385742, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 
0.02793426252901554, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.59967041015625, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.021238919347524643, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.162437438964844, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0022198562510311604, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.606456756591797, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.02255948632955551, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.75438404083252, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.0318065844476223, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.3162841796875, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.026596615090966225, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.97163200378418, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.014549219980835915, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.675316095352173, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.00385885126888752, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15145714581012726, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.00024153859703801572, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.292057037353516, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.000486671895487234, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.35272216796875, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.017901292070746422, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.12723159790039, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.015091931447386742, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.76534652709961, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0006068425718694925, 
"pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.327873229980469, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.004559563938528299, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.61958122253418, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.006666353903710842, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.158982276916504, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.013149024918675423, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.253510475158691, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.01026394497603178, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.177889347076416, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.003234145464375615, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.2049812525510788, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.00021502295567188412, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.28818130493164, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0005247893859632313, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.971681594848633, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.016296198591589928, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.307863235473633, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.014790337532758713, "pnorm/_forward_module.model.norm.weight": 31.315364837646484, "gnorm/_forward_module.model.norm.weight": 0.0026170366909354925, "pnorm/_forward_module.lm_head.weight": 230.7705535888672, "gnorm/_forward_module.lm_head.weight": 0.022315412759780884} +{"step": 1971322880, "pnorm/_forward_module.model.embeddings.weight": 141.28163146972656, "gnorm/_forward_module.model.embeddings.weight": 0.03714624419808388, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.73560333251953, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 
0.0009769797325134277, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.488700866699219, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.005454860161989927, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.177069664001465, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.005985437426716089, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.73074722290039, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.04250827431678772, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.708444595336914, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.043414708226919174, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.08992075920105, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.003588038496673107, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3478126525878906, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.00031518060131929815, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.98958969116211, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0006114484858699143, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.987985610961914, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.025437239557504654, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.104928970336914, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.021480852738022804, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.215068817138672, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0006723285769112408, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.256372451782227, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.004718279466032982, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.040858268737793, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.005478980485349894, 
"pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.170361518859863, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.024720942601561546, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.255938529968262, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.0254165381193161, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.391916513442993, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.004966350272297859, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16435495018959045, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0002777695772238076, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.415870666503906, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0005437562358565629, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.74631690979004, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.02275421842932701, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.624197006225586, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.01697017252445221, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.697269439697266, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0005996290710754693, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 13.018145561218262, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.00539540546014905, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.51353645324707, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.006449875887483358, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.657549858093262, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.020065657794475555, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.81170654296875, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.019062740728259087, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 
3.019453763961792, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.004369224887341261, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19201131165027618, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.0002644858614075929, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.601911544799805, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0005706973606720567, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.373882293701172, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.02140578255057335, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.102596282958984, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.016052158549427986, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.233156204223633, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0005129198543727398, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.518380165100098, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.0039009128231555223, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.881067276000977, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.0048246318474411964, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.141744613647461, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.017115961760282516, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.456989288330078, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.01855389028787613, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.8652546405792236, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.0027182616759091616, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16322094202041626, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.00014290498802438378, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.240278244018555, 
"gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0008843920659273863, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.796438217163086, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.024233359843492508, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.599416732788086, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.017298690974712372, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.162927627563477, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0013226321898400784, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.607057571411133, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.013760835863649845, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.754888534545898, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.01792994514107704, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.316516876220703, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.025602849200367928, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.972087860107422, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.014185556210577488, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6753926277160645, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.0030774488113820553, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15146328508853912, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.00019492160936351866, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.291988372802734, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0004489283310249448, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.352413177490234, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.017554273828864098, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.127134323120117, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 
0.01489537674933672, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.76619529724121, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0005386365228332579, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.328340530395508, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.004195576999336481, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.619928359985352, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.006041112821549177, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.15998649597168, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.012838100083172321, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.25474739074707, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.010215546935796738, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.1779093742370605, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.002948185196146369, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.20497655868530273, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.00018144310161005706, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.28835105895996, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0005168091156519949, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.971784591674805, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.01634378731250763, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.308048248291016, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.015429461374878883, "pnorm/_forward_module.model.norm.weight": 31.31772232055664, "gnorm/_forward_module.model.norm.weight": 0.0031253325287252665, "pnorm/_forward_module.lm_head.weight": 230.77780151367188, "gnorm/_forward_module.lm_head.weight": 0.02285950817167759} +{"step": 1992294400, "pnorm/_forward_module.model.embeddings.weight": 141.28062438964844, 
"gnorm/_forward_module.model.embeddings.weight": 0.03691533952951431, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.735363006591797, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0010218878742307425, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.489118576049805, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.005533520597964525, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.177452087402344, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.005981964059174061, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.730216979980469, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.04156123846769333, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.707921981811523, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.0429183728992939, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.08988094329834, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.003625147510319948, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3478550612926483, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0002337155310669914, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.9896183013916, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0005946115124970675, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.98785400390625, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.02489541657269001, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.10481071472168, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.021443499252200127, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.21493911743164, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0005932141211815178, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.256562232971191, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.004671309143304825, 
"pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.041007995605469, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.005307964980602264, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.170026779174805, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.024400318041443825, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.255615234375, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.024956567212939262, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3919053077697754, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.004353972151875496, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16434577107429504, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00038962741382420063, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.416120529174805, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0006012835074216127, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.746429443359375, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.022670894861221313, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.624267578125, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.016824040561914444, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.697389602661133, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0006034219986759126, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 13.01845932006836, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.005383907351642847, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.513777732849121, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.006342597771435976, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.657366752624512, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.019772473722696304, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 
10.811516761779785, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.01903282292187214, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.019543170928955, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.003483912907540798, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19200877845287323, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.00020665193733293563, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.60200309753418, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0005335774621926248, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.37383270263672, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.02130788564682007, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.102598190307617, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.016015876084566116, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.233381271362305, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0005566891049966216, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.518847465515137, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.00392846018075943, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.881484985351562, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.004835184197872877, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.14169979095459, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.016986915841698647, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.456954002380371, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.018747584894299507, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.865299701690674, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.0028085445519536734, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16320861876010895, 
"gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.0002012563491007313, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.240087509155273, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0008585589239373803, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.796003341674805, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.024222593754529953, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.599185943603516, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.01762247644364834, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.163305282592773, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.001365387230180204, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.607503890991211, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.013698899187147617, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.755261421203613, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.01787969283759594, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.316720962524414, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.026759367436170578, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.972454071044922, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.014111301861703396, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6753835678100586, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.0032405098900198936, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15145444869995117, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.00018720829393714666, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.29191780090332, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.00046834253589622676, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.352161407470703, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 
0.01743607223033905, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.127056121826172, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.014688883908092976, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.766857147216797, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.00047966104466468096, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.328662872314453, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.004282352514564991, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.620160102844238, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.006181922275573015, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.160760879516602, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.012311853468418121, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.255694389343262, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.010091355070471764, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.177948236465454, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.0033693797886371613, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.2049739509820938, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.00028901922632940114, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.28852653503418, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0005188643117435277, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.971920013427734, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.01627880148589611, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.308177947998047, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.015148650854825974, "pnorm/_forward_module.model.norm.weight": 31.319425582885742, "gnorm/_forward_module.model.norm.weight": 0.002887467620894313, "pnorm/_forward_module.lm_head.weight": 230.78306579589844, 
"gnorm/_forward_module.lm_head.weight": 0.02226090244948864} +{"step": 2013265920, "pnorm/_forward_module.model.embeddings.weight": 141.27993774414062, "gnorm/_forward_module.model.embeddings.weight": 0.03747823089361191, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.735191345214844, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0010185347637161613, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.489347457885742, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.005549980327486992, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.17765998840332, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.005943977274000645, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.72985553741455, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.042497336864471436, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.707566261291504, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.04435905069112778, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0899100303649902, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.0036047901958227158, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.34790655970573425, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.00010910534911090508, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.989688873291016, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0006407328182831407, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.987812042236328, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.025843758136034012, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.104753494262695, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.0230922419577837, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.21483039855957, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0006017693085595965, 
"pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.256625175476074, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.00464570801705122, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.041040420532227, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.005252313334494829, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.169785499572754, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.02503127232193947, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.255382537841797, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.026073075830936432, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.391916513442993, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.004365789238363504, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16436320543289185, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.0002985032624565065, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.416290283203125, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0006468048668466508, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.74650764465332, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.02373587153851986, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.624311447143555, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.01805637590587139, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.697460174560547, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.000792437931522727, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 13.018672943115234, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.007164910435676575, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.513936996459961, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.009789429605007172, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 
10.657228469848633, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.021507184952497482, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.811369895935059, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.019680479541420937, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.0195679664611816, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.0035784225910902023, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.192015141248703, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.00021380592079367489, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.602100372314453, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0005845123087055981, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.373828887939453, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.022169306874275208, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.102611541748047, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.01663539744913578, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.23349952697754, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0005546119064092636, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.519099235534668, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.004032221622765064, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.881718635559082, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.004873660393059254, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.141646385192871, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.017235038802027702, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.4569091796875, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.019076529890298843, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.8653509616851807, 
"gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.002921423641964793, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.1632089465856552, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.00024164613569155335, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.239944458007812, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.000977479387074709, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.795679092407227, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.026567259803414345, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.59900665283203, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.017953520640730858, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.163557052612305, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0014619502471759915, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.60777473449707, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.014479709789156914, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.75548267364502, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.020168529823422432, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.316859245300293, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.026882417500019073, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.972685813903809, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.014439310878515244, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.675394296646118, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.0035303891636431217, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.1514485776424408, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.0002780020877253264, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.291885375976562, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 
0.000471546285552904, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.35200309753418, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.017672430723905563, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.127010345458984, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.014885174110531807, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.767290115356445, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0005472481134347618, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.328874588012695, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.0042878263629972935, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.620318412780762, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.00602493342012167, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.161267280578613, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.012803500518202782, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.256308555603027, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.010325102135539055, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.1780014038085938, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.0032446940895169973, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.20497965812683105, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.00023066364519763738, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.288665771484375, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0005459982203319669, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.972036361694336, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.016636217013001442, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.30829620361328, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.015072420239448547, 
"pnorm/_forward_module.model.norm.weight": 31.32056999206543, "gnorm/_forward_module.model.norm.weight": 0.00280367280356586, "pnorm/_forward_module.lm_head.weight": 230.78652954101562, "gnorm/_forward_module.lm_head.weight": 0.02283090353012085} +{"step": 2034237440, "pnorm/_forward_module.model.embeddings.weight": 141.27951049804688, "gnorm/_forward_module.model.embeddings.weight": 0.035891737788915634, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.73508644104004, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0009865817846730351, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.489511489868164, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.005259076599031687, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.177809715270996, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.005680103786289692, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.729622840881348, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.041164278984069824, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.7073392868042, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.04215937480330467, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.0899078845977783, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.0034073118586093187, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3479275107383728, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.00028215604834258556, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.98969841003418, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0006485178018920124, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.987756729125977, "gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.024631304666399956, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.104698181152344, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 
0.021275196224451065, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.214780807495117, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0006411009235307574, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.256712913513184, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.004607220180332661, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.041107177734375, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.005223445128649473, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.169647216796875, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.024506401270627975, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.255249977111816, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.024652449414134026, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3919036388397217, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.004020797088742256, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16436688601970673, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00019955966854467988, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.4163818359375, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0005308366962708533, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.746540069580078, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.022243579849600792, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.624326705932617, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.0165592972189188, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.697532653808594, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 0.0006082403124310076, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 13.01883316040039, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.005449639167636633, 
"pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.514059066772461, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.0068230582401156425, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.657176971435547, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.020336279645562172, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.811315536499023, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.018596498295664787, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.0195512771606445, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.003400309942662716, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19201135635375977, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.00015773254563100636, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.60215950012207, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.0005381385562941432, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.37382698059082, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.02075265906751156, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.10262107849121, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.01563592255115509, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.23358154296875, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.0005324099329300225, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.519268035888672, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.0038481152150779963, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.881865501403809, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.004637409467250109, "pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.141616821289062, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.016603710129857063, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 
11.456888198852539, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.017923688516020775, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.865384817123413, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.003037144662812352, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.16320641338825226, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.00030970873194746673, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.239877700805664, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0009621047647669911, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.795513153076172, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.023897208273410797, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.598926544189453, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.017309844493865967, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.163728713989258, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0015076800482347608, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.607951164245605, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.014133074320852757, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.75562858581543, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.020300615578889847, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.3169584274292, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.024503719061613083, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.972844123840332, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.013736321590840816, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 2.6754074096679688, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.0033214977011084557, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.1514497548341751, 
"gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.00027573955594561994, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.29184913635254, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.0004525565600488335, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.351886749267578, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.017183246091008186, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.126968383789062, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.0145339360460639, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.767576217651367, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.0004677878168877214, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.329041481018066, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.00419237045571208, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.620448112487793, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.006332274992018938, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.16159439086914, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.011935897171497345, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.256695747375488, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.009929482825100422, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.178053617477417, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.0032545761205255985, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.20498211681842804, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.00024216128804255277, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.288719177246094, "gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.0005303456564433873, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.972063064575195, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 
0.01603449136018753, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.3083438873291, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.014619146473705769, "pnorm/_forward_module.model.norm.weight": 31.321271896362305, "gnorm/_forward_module.model.norm.weight": 0.0029856753535568714, "pnorm/_forward_module.lm_head.weight": 230.78866577148438, "gnorm/_forward_module.lm_head.weight": 0.022005567327141762} +{"step": 2055208960, "pnorm/_forward_module.model.embeddings.weight": 141.27926635742188, "gnorm/_forward_module.model.embeddings.weight": 0.035828955471515656, "pnorm/_forward_module.model.layers.0.attn_norm.weight": 22.735036849975586, "gnorm/_forward_module.model.layers.0.attn_norm.weight": 0.0009216683683916926, "pnorm/_forward_module.model.layers.0.attn.q_proj.weight": 15.489602088928223, "gnorm/_forward_module.model.layers.0.attn.q_proj.weight": 0.005290591157972813, "pnorm/_forward_module.model.layers.0.attn.k_proj.weight": 15.177895545959473, "gnorm/_forward_module.model.layers.0.attn.k_proj.weight": 0.005758250132203102, "pnorm/_forward_module.model.layers.0.attn.v_proj.weight": 10.729506492614746, "gnorm/_forward_module.model.layers.0.attn.v_proj.weight": 0.04101638123393059, "pnorm/_forward_module.model.layers.0.attn.o_proj.weight": 10.707225799560547, "gnorm/_forward_module.model.layers.0.attn.o_proj.weight": 0.041912343353033066, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 2.089909553527832, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.weight": 0.0034380306024104357, "pnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.3479435443878174, "gnorm/_forward_module.model.layers.0.attn.fgate_proj.bias": 0.0001478599151596427, "pnorm/_forward_module.model.layers.0.mlp_norm.weight": 21.989715576171875, "gnorm/_forward_module.model.layers.0.mlp_norm.weight": 0.0006354718352667987, "pnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 26.987735748291016, 
"gnorm/_forward_module.model.layers.0.mlp.gate_proj.weight": 0.024493178352713585, "pnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 19.10467529296875, "gnorm/_forward_module.model.layers.0.mlp.down_proj.weight": 0.02171187847852707, "pnorm/_forward_module.model.layers.1.attn_norm.weight": 22.214744567871094, "gnorm/_forward_module.model.layers.1.attn_norm.weight": 0.0006052698590792716, "pnorm/_forward_module.model.layers.1.attn.q_proj.weight": 12.25674057006836, "gnorm/_forward_module.model.layers.1.attn.q_proj.weight": 0.004515502601861954, "pnorm/_forward_module.model.layers.1.attn.k_proj.weight": 12.041125297546387, "gnorm/_forward_module.model.layers.1.attn.k_proj.weight": 0.0051620532758533955, "pnorm/_forward_module.model.layers.1.attn.v_proj.weight": 10.169564247131348, "gnorm/_forward_module.model.layers.1.attn.v_proj.weight": 0.02396375499665737, "pnorm/_forward_module.model.layers.1.attn.o_proj.weight": 10.255170822143555, "gnorm/_forward_module.model.layers.1.attn.o_proj.weight": 0.024468552321195602, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 2.3919155597686768, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.weight": 0.004134598653763533, "pnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.16437029838562012, "gnorm/_forward_module.model.layers.1.attn.fgate_proj.bias": 0.00021656013268511742, "pnorm/_forward_module.model.layers.1.mlp_norm.weight": 22.416425704956055, "gnorm/_forward_module.model.layers.1.mlp_norm.weight": 0.0005786814726889133, "pnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 27.746551513671875, "gnorm/_forward_module.model.layers.1.mlp.gate_proj.weight": 0.022026963531970978, "pnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 19.624332427978516, "gnorm/_forward_module.model.layers.1.mlp.down_proj.weight": 0.016732778400182724, "pnorm/_forward_module.model.layers.2.attn_norm.weight": 22.697553634643555, "gnorm/_forward_module.model.layers.2.attn_norm.weight": 
0.0005977341788820922, "pnorm/_forward_module.model.layers.2.attn.q_proj.weight": 13.018892288208008, "gnorm/_forward_module.model.layers.2.attn.q_proj.weight": 0.005284285172820091, "pnorm/_forward_module.model.layers.2.attn.k_proj.weight": 12.514102935791016, "gnorm/_forward_module.model.layers.2.attn.k_proj.weight": 0.006200190167874098, "pnorm/_forward_module.model.layers.2.attn.v_proj.weight": 10.657135963439941, "gnorm/_forward_module.model.layers.2.attn.v_proj.weight": 0.02030697837471962, "pnorm/_forward_module.model.layers.2.attn.o_proj.weight": 10.811271667480469, "gnorm/_forward_module.model.layers.2.attn.o_proj.weight": 0.018755359575152397, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 3.019561529159546, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.weight": 0.003117887070402503, "pnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.19201229512691498, "gnorm/_forward_module.model.layers.2.attn.fgate_proj.bias": 0.00017809169366955757, "pnorm/_forward_module.model.layers.2.mlp_norm.weight": 22.6021728515625, "gnorm/_forward_module.model.layers.2.mlp_norm.weight": 0.000598133250605315, "pnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 28.373804092407227, "gnorm/_forward_module.model.layers.2.mlp.gate_proj.weight": 0.02100536786019802, "pnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 20.10261344909668, "gnorm/_forward_module.model.layers.2.mlp.down_proj.weight": 0.015991073101758957, "pnorm/_forward_module.model.layers.3.attn_norm.weight": 23.23363494873047, "gnorm/_forward_module.model.layers.3.attn_norm.weight": 0.00048346578842028975, "pnorm/_forward_module.model.layers.3.attn.q_proj.weight": 14.519370079040527, "gnorm/_forward_module.model.layers.3.attn.q_proj.weight": 0.003839048556983471, "pnorm/_forward_module.model.layers.3.attn.k_proj.weight": 13.881957054138184, "gnorm/_forward_module.model.layers.3.attn.k_proj.weight": 0.00462738424539566, 
"pnorm/_forward_module.model.layers.3.attn.v_proj.weight": 11.141602516174316, "gnorm/_forward_module.model.layers.3.attn.v_proj.weight": 0.017052920535206795, "pnorm/_forward_module.model.layers.3.attn.o_proj.weight": 11.456875801086426, "gnorm/_forward_module.model.layers.3.attn.o_proj.weight": 0.018348300829529762, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 2.865417242050171, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.weight": 0.0024783355183899403, "pnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.163208469748497, "gnorm/_forward_module.model.layers.3.attn.fgate_proj.bias": 0.00015571669791825116, "pnorm/_forward_module.model.layers.3.mlp_norm.weight": 22.239831924438477, "gnorm/_forward_module.model.layers.3.mlp_norm.weight": 0.0013289490016177297, "pnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 27.7954158782959, "gnorm/_forward_module.model.layers.3.mlp.gate_proj.weight": 0.02637249417603016, "pnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 19.598878860473633, "gnorm/_forward_module.model.layers.3.mlp.down_proj.weight": 0.019196845591068268, "pnorm/_forward_module.model.layers.4.attn_norm.weight": 24.1638240814209, "gnorm/_forward_module.model.layers.4.attn_norm.weight": 0.0019720392301678658, "pnorm/_forward_module.model.layers.4.attn.q_proj.weight": 14.608060836791992, "gnorm/_forward_module.model.layers.4.attn.q_proj.weight": 0.018622737377882004, "pnorm/_forward_module.model.layers.4.attn.k_proj.weight": 13.755719184875488, "gnorm/_forward_module.model.layers.4.attn.k_proj.weight": 0.027003159746527672, "pnorm/_forward_module.model.layers.4.attn.v_proj.weight": 12.317009925842285, "gnorm/_forward_module.model.layers.4.attn.v_proj.weight": 0.024990776553750038, "pnorm/_forward_module.model.layers.4.attn.o_proj.weight": 13.972935676574707, "gnorm/_forward_module.model.layers.4.attn.o_proj.weight": 0.014027920551598072, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 
2.6753878593444824, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.weight": 0.0034933581482619047, "pnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.15144643187522888, "gnorm/_forward_module.model.layers.4.attn.fgate_proj.bias": 0.00029174931114539504, "pnorm/_forward_module.model.layers.4.mlp_norm.weight": 23.291828155517578, "gnorm/_forward_module.model.layers.4.mlp_norm.weight": 0.00044949696166440845, "pnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 30.35182762145996, "gnorm/_forward_module.model.layers.4.mlp.gate_proj.weight": 0.01763206347823143, "pnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 21.1269474029541, "gnorm/_forward_module.model.layers.4.mlp.down_proj.weight": 0.014681817963719368, "pnorm/_forward_module.model.layers.5.attn_norm.weight": 23.767732620239258, "gnorm/_forward_module.model.layers.5.attn_norm.weight": 0.00047209148760885, "pnorm/_forward_module.model.layers.5.attn.q_proj.weight": 14.329133033752441, "gnorm/_forward_module.model.layers.5.attn.q_proj.weight": 0.004112453665584326, "pnorm/_forward_module.model.layers.5.attn.k_proj.weight": 13.62052059173584, "gnorm/_forward_module.model.layers.5.attn.k_proj.weight": 0.005898597184568644, "pnorm/_forward_module.model.layers.5.attn.v_proj.weight": 12.161765098571777, "gnorm/_forward_module.model.layers.5.attn.v_proj.weight": 0.012562797404825687, "pnorm/_forward_module.model.layers.5.attn.o_proj.weight": 13.256896018981934, "gnorm/_forward_module.model.layers.5.attn.o_proj.weight": 0.009957646019756794, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 3.178103446960449, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.weight": 0.0027153410483151674, "pnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.20498774945735931, "gnorm/_forward_module.model.layers.5.attn.fgate_proj.bias": 0.00013712180953007191, "pnorm/_forward_module.model.layers.5.mlp_norm.weight": 23.28875732421875, 
"gnorm/_forward_module.model.layers.5.mlp_norm.weight": 0.000552800833247602, "pnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 30.972087860107422, "gnorm/_forward_module.model.layers.5.mlp.gate_proj.weight": 0.01612411066889763, "pnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 21.30837631225586, "gnorm/_forward_module.model.layers.5.mlp.down_proj.weight": 0.014723340049386024, "pnorm/_forward_module.model.norm.weight": 31.321638107299805, "gnorm/_forward_module.model.norm.weight": 0.0027449321933090687, "pnorm/_forward_module.lm_head.weight": 230.78976440429688, "gnorm/_forward_module.lm_head.weight": 0.022977465763688087} diff --git a/metrics/jsonlines/resume.jsonl b/metrics/jsonlines/resume.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..92733e5e5f3807c0d022c001b1110b1f10275c5c --- /dev/null +++ b/metrics/jsonlines/resume.jsonl @@ -0,0 +1 @@ +{"step": 0, "resume/resume_step": 0} diff --git a/metrics/jsonlines/throughput.jsonl b/metrics/jsonlines/throughput.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d45954ee7649d7277f08f7d702bad57bf84c0a5f --- /dev/null +++ b/metrics/jsonlines/throughput.jsonl @@ -0,0 +1,98 @@ +{"step": 20971520, "throughput/token_count": 20971520, "throughput/batch_count": 10, "throughput/flop_count": 0, "throughput/total_time": 97.20068372006062, "throughput/update_time": 96.98365108307917, "throughput/token_count_per_second_total_recent": 224397.03654632583, "throughput/token_count_per_second_total_cum": 215754.86094725714, "throughput/token_count_per_second_update_recent": 224813.24140552053, "throughput/token_count_per_second_update_cum": 216237.68300942963, "throughput/batch_count_per_second_total_recent": 0.1070008452159528, "throughput/batch_count_per_second_total_cum": 0.10287993476260049, "throughput/batch_count_per_second_update_recent": 0.10719930715824152, "throughput/batch_count_per_second_update_cum": 0.10311016226264459, 
"throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 41943040, "throughput/token_count": 41943040, "throughput/batch_count": 20, "throughput/flop_count": 0, "throughput/total_time": 190.58031271304935, "throughput/update_time": 190.21444558817893, "throughput/token_count_per_second_total_recent": 224495.1145840076, "throughput/token_count_per_second_total_cum": 220080.6547271873, "throughput/token_count_per_second_update_recent": 224880.9829341608, "throughput/token_count_per_second_update_cum": 220503.9678784869, "throughput/batch_count_per_second_total_recent": 0.1070476124687231, "throughput/batch_count_per_second_total_cum": 0.10494263397559514, "throughput/batch_count_per_second_update_recent": 0.10723160883625069, "throughput/batch_count_per_second_update_cum": 0.10514448541569085, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 62914560, "throughput/token_count": 62914560, "throughput/batch_count": 30, "throughput/flop_count": 0, "throughput/total_time": 454.2337091190275, "throughput/update_time": 283.44298765144777, "throughput/token_count_per_second_total_recent": 137862.77863624124, "throughput/token_count_per_second_total_cum": 138507.02564990363, "throughput/token_count_per_second_update_recent": 224903.88780605304, "throughput/token_count_per_second_update_cum": 221965.4842100612, "throughput/batch_count_per_second_total_recent": 
0.06573809558689177, "throughput/batch_count_per_second_total_cum": 0.06604529650206739, "throughput/batch_count_per_second_update_recent": 0.10724253073027279, "throughput/batch_count_per_second_update_cum": 0.1058413907099062, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 83886080, "throughput/token_count": 83886080, "throughput/batch_count": 40, "throughput/flop_count": 0, "throughput/total_time": 547.6119900090853, "throughput/update_time": 376.67628215253353, "throughput/token_count_per_second_total_recent": 153012.98525494087, "throughput/token_count_per_second_total_cum": 153185.25074406838, "throughput/token_count_per_second_update_recent": 224912.10891677192, "throughput/token_count_per_second_update_cum": 222700.72201156183, "throughput/batch_count_per_second_total_recent": 0.07296227705714267, "throughput/batch_count_per_second_total_cum": 0.07304441964343471, "throughput/batch_count_per_second_update_recent": 0.10724645086134525, "throughput/batch_count_per_second_update_cum": 0.10619197941377727, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 104857600, "throughput/token_count": 104857600, "throughput/batch_count": 50, "throughput/flop_count": 0, "throughput/total_time": 811.2910252050497, "throughput/update_time": 469.82893142174, "throughput/token_count_per_second_total_recent": 128739.93089858611, 
"throughput/token_count_per_second_total_cum": 129247.8244456085, "throughput/token_count_per_second_update_recent": 224956.68237346516, "throughput/token_count_per_second_update_cum": 223182.50960555475, "throughput/batch_count_per_second_total_recent": 0.06138798279694849, "throughput/batch_count_per_second_total_cum": 0.061630165312580344, "throughput/batch_count_per_second_update_recent": 0.10726770514176615, "throughput/batch_count_per_second_update_cum": 0.10642171364095437, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 125829120, "throughput/token_count": 125829120, "throughput/batch_count": 60, "throughput/flop_count": 0, "throughput/total_time": 904.5979733880376, "throughput/update_time": 562.9825718456414, "throughput/token_count_per_second_total_recent": 138789.40005912748, "throughput/token_count_per_second_total_cum": 139099.49358909758, "throughput/token_count_per_second_update_recent": 224985.75040324812, "throughput/token_count_per_second_update_cum": 223504.46762053558, "throughput/batch_count_per_second_total_recent": 0.06617994311291098, "throughput/batch_count_per_second_total_cum": 0.06632780723051909, "throughput/batch_count_per_second_update_recent": 0.10728156585848242, "throughput/batch_count_per_second_update_cum": 0.10657523518587855, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 146800640, "throughput/token_count": 
146800640, "throughput/batch_count": 70, "throughput/flop_count": 0, "throughput/total_time": 1167.0270900100004, "throughput/update_time": 656.2661910566967, "throughput/token_count_per_second_total_recent": 125399.71958407477, "throughput/token_count_per_second_total_cum": 125790.25907508458, "throughput/token_count_per_second_update_recent": 224960.93056840677, "throughput/token_count_per_second_update_cum": 223690.69441719493, "throughput/batch_count_per_second_total_recent": 0.059795245925938975, "throughput/batch_count_per_second_total_cum": 0.05998146966699819, "throughput/batch_count_per_second_update_recent": 0.1072697308389696, "throughput/batch_count_per_second_update_cum": 0.10666403504237887, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 167772160, "throughput/token_count": 167772160, "throughput/batch_count": 80, "throughput/flop_count": 0, "throughput/total_time": 1260.3716635780875, "throughput/update_time": 749.4701656188117, "throughput/token_count_per_second_total_recent": 132828.77988922756, "throughput/token_count_per_second_total_cum": 133113.24337752024, "throughput/token_count_per_second_update_recent": 224966.72484440423, "throughput/token_count_per_second_update_cum": 223854.35431105693, "throughput/batch_count_per_second_total_recent": 0.06333769792996767, "throughput/batch_count_per_second_total_cum": 0.0634733406913377, "throughput/batch_count_per_second_update_recent": 0.10727249376507007, "throughput/batch_count_per_second_update_cum": 0.10674207416107985, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 
0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 188743680, "throughput/token_count": 188743680, "throughput/batch_count": 90, "throughput/flop_count": 0, "throughput/total_time": 1522.4740127840778, "throughput/update_time": 842.772364483797, "throughput/token_count_per_second_total_recent": 123657.3511580901, "throughput/token_count_per_second_total_cum": 123971.69240009107, "throughput/token_count_per_second_update_recent": 224944.58537872275, "throughput/token_count_per_second_update_cum": 223955.70613614816, "throughput/batch_count_per_second_total_recent": 0.05896441991714959, "throughput/batch_count_per_second_total_cum": 0.05911430950169137, "throughput/batch_count_per_second_update_recent": 0.10726193684517038, "throughput/batch_count_per_second_update_cum": 0.10679040247733505, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 209715200, "throughput/token_count": 209715200, "throughput/batch_count": 100, "throughput/flop_count": 0, "throughput/total_time": 1615.8070381759899, "throughput/update_time": 935.9601778858341, "throughput/token_count_per_second_total_recent": 129541.23481597217, "throughput/token_count_per_second_total_cum": 129789.75524004266, "throughput/token_count_per_second_update_recent": 224954.7984268493, "throughput/token_count_per_second_update_cum": 224064.23366612557, "throughput/batch_count_per_second_total_recent": 0.06177007427977189, "throughput/batch_count_per_second_total_cum": 0.061888578052541096, "throughput/batch_count_per_second_update_recent": 0.10726680680601564, 
"throughput/batch_count_per_second_update_cum": 0.10684215243631628, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 230686720, "throughput/token_count": 230686720, "throughput/batch_count": 110, "throughput/flop_count": 0, "throughput/total_time": 1880.450380234979, "throughput/update_time": 1029.0689218619373, "throughput/token_count_per_second_total_recent": 117039.83517592124, "throughput/token_count_per_second_total_cum": 122676.3133048869, "throughput/token_count_per_second_update_recent": 224994.82550826654, "throughput/token_count_per_second_update_cum": 224170.33018800034, "throughput/batch_count_per_second_total_recent": 0.05580894240184843, "throughput/batch_count_per_second_total_cum": 0.058496624615138486, "throughput/batch_count_per_second_update_recent": 0.1072858932057698, "throughput/batch_count_per_second_update_cum": 0.10689274320030229, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 251658240, "throughput/token_count": 251658240, "throughput/batch_count": 120, "throughput/flop_count": 0, "throughput/total_time": 1973.9993880910333, "throughput/update_time": 1122.4686564019648, "throughput/token_count_per_second_total_recent": 129451.19744271546, "throughput/token_count_per_second_total_cum": 127486.48328780256, "throughput/token_count_per_second_update_recent": 224947.15994560035, "throughput/token_count_per_second_update_cum": 
224200.68352436845, "throughput/batch_count_per_second_total_recent": 0.061727141114576084, "throughput/batch_count_per_second_total_cum": 0.06079029240026596, "throughput/batch_count_per_second_update_recent": 0.1072631644943239, "throughput/batch_count_per_second_update_cum": 0.10690721679895804, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 272629760, "throughput/token_count": 272629760, "throughput/batch_count": 130, "throughput/flop_count": 0, "throughput/total_time": 2237.285542534082, "throughput/update_time": 1215.6973058142466, "throughput/token_count_per_second_total_recent": 117053.42280720641, "throughput/token_count_per_second_total_cum": 121857.38244712536, "throughput/token_count_per_second_update_recent": 224955.3087570142, "throughput/token_count_per_second_update_cum": 224257.92892367952, "throughput/batch_count_per_second_total_recent": 0.055815421489337165, "throughput/batch_count_per_second_total_cum": 0.05810612795215862, "throughput/batch_count_per_second_update_recent": 0.10726705015040121, "throughput/batch_count_per_second_update_cum": 0.10693451353248573, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 293601280, "throughput/token_count": 293601280, "throughput/batch_count": 140, "throughput/flop_count": 0, "throughput/total_time": 2330.883187837084, "throughput/update_time": 1309.140196379507, 
"throughput/token_count_per_second_total_recent": 129470.82467013586, "throughput/token_count_per_second_total_cum": 125961.38731106638, "throughput/token_count_per_second_update_recent": 224895.49327464707, "throughput/token_count_per_second_update_cum": 224270.3117755983, "throughput/batch_count_per_second_total_recent": 0.06173650010592263, "throughput/batch_count_per_second_total_cum": 0.060063069968732063, "throughput/batch_count_per_second_update_recent": 0.10723852790577272, "throughput/batch_count_per_second_update_cum": 0.10694041813640513, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 314572800, "throughput/token_count": 314572800, "throughput/batch_count": 150, "throughput/flop_count": 0, "throughput/total_time": 2593.152709970018, "throughput/update_time": 1402.5526540756691, "throughput/token_count_per_second_total_recent": 117131.1587658747, "throughput/token_count_per_second_total_cum": 121309.01461782292, "throughput/token_count_per_second_update_recent": 224836.79785557796, "throughput/token_count_per_second_update_cum": 224285.91118193304, "throughput/batch_count_per_second_total_recent": 0.055852488882958747, "throughput/batch_count_per_second_total_cum": 0.05784464579478403, "throughput/batch_count_per_second_update_recent": 0.10721053974894426, "throughput/batch_count_per_second_update_cum": 0.10694785651299145, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, 
"throughput/mfu_update_cum": 0.0} +{"step": 335544320, "throughput/token_count": 335544320, "throughput/batch_count": 160, "throughput/flop_count": 0, "throughput/total_time": 2686.8860215520253, "throughput/update_time": 1496.1389014086453, "throughput/token_count_per_second_total_recent": 129440.33024567351, "throughput/token_count_per_second_total_cum": 124882.2306969983, "throughput/token_count_per_second_update_recent": 224728.4823117886, "throughput/token_count_per_second_update_cum": 224273.50808409444, "throughput/batch_count_per_second_total_recent": 0.061721959231220966, "throughput/batch_count_per_second_total_cum": 0.05954848799562373, "throughput/batch_count_per_second_update_recent": 0.10715889087285452, "throughput/batch_count_per_second_update_cum": 0.10694194225506518, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 356515840, "throughput/token_count": 356515840, "throughput/batch_count": 170, "throughput/flop_count": 0, "throughput/total_time": 2949.368387905997, "throughput/update_time": 1589.4967352375388, "throughput/token_count_per_second_total_recent": 117100.9105873088, "throughput/token_count_per_second_total_cum": 120878.70795045726, "throughput/token_count_per_second_update_recent": 224720.04403586246, "throughput/token_count_per_second_update_cum": 224294.79224235166, "throughput/batch_count_per_second_total_recent": 0.05583806542745056, "throughput/batch_count_per_second_total_cum": 0.05763945958636153, "throughput/batch_count_per_second_update_recent": 0.10715486718934177, "throughput/batch_count_per_second_update_cum": 0.10695209133260329, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 
0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 377487360, "throughput/token_count": 377487360, "throughput/batch_count": 180, "throughput/flop_count": 0, "throughput/total_time": 3042.959401597036, "throughput/update_time": 1682.9467192575103, "throughput/token_count_per_second_total_recent": 129387.89310020683, "throughput/token_count_per_second_total_cum": 124052.70993818824, "throughput/token_count_per_second_update_recent": 224649.77345095368, "throughput/token_count_per_second_update_cum": 224301.43252933255, "throughput/batch_count_per_second_total_recent": 0.06169695525179235, "throughput/batch_count_per_second_total_cum": 0.059152941674322244, "throughput/batch_count_per_second_update_recent": 0.10712135956332859, "throughput/batch_count_per_second_update_cum": 0.10695525766817691, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 398458880, "throughput/token_count": 398458880, "throughput/batch_count": 190, "throughput/flop_count": 0, "throughput/total_time": 3305.444267996005, "throughput/update_time": 1776.3336466513574, "throughput/token_count_per_second_total_recent": 117061.56849259966, "throughput/token_count_per_second_total_cum": 120546.24059402886, "throughput/token_count_per_second_update_recent": 224647.5927754134, "throughput/token_count_per_second_update_cum": 224315.33667740398, "throughput/batch_count_per_second_total_recent": 0.0558193056548117, "throughput/batch_count_per_second_total_cum": 
0.05748092679692691, "throughput/batch_count_per_second_update_recent": 0.107120319736201, "throughput/batch_count_per_second_update_cum": 0.10696188768263053, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 419430400, "throughput/token_count": 419430400, "throughput/batch_count": 200, "throughput/flop_count": 0, "throughput/total_time": 3399.0298356669955, "throughput/update_time": 1869.7792980262311, "throughput/token_count_per_second_total_recent": 129558.04602019044, "throughput/token_count_per_second_total_cum": 123397.09278182746, "throughput/token_count_per_second_update_recent": 224566.92432034877, "throughput/token_count_per_second_update_cum": 224320.80644103687, "throughput/batch_count_per_second_total_recent": 0.061778090486617296, "throughput/batch_count_per_second_total_cum": 0.05884031905261396, "throughput/batch_count_per_second_update_recent": 0.10708185401933135, "throughput/batch_count_per_second_update_cum": 0.10696449586917728, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 440401920, "throughput/token_count": 440401920, "throughput/batch_count": 210, "throughput/flop_count": 0, "throughput/total_time": 3663.3731433160137, "throughput/update_time": 1963.0694206270855, "throughput/token_count_per_second_total_recent": 117061.89949692052, "throughput/token_count_per_second_total_cum": 120217.5980362614, 
"throughput/token_count_per_second_update_recent": 224530.3488931657, "throughput/token_count_per_second_update_cum": 224343.52823820026, "throughput/batch_count_per_second_total_recent": 0.05581946348997141, "throughput/batch_count_per_second_total_cum": 0.05732421781361647, "throughput/batch_count_per_second_update_recent": 0.10706441349657331, "throughput/batch_count_per_second_update_cum": 0.10697533046636594, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 461373440, "throughput/token_count": 461373440, "throughput/batch_count": 220, "throughput/flop_count": 0, "throughput/total_time": 3756.8895379800815, "throughput/update_time": 2056.446317301248, "throughput/token_count_per_second_total_recent": 129465.56797445728, "throughput/token_count_per_second_total_cum": 122807.29452803149, "throughput/token_count_per_second_update_recent": 224528.49933593356, "throughput/token_count_per_second_update_cum": 224354.72111203842, "throughput/batch_count_per_second_total_recent": 0.06173399351809372, "throughput/batch_count_per_second_total_cum": 0.05855908132936072, "throughput/batch_count_per_second_update_recent": 0.1070635315589588, "throughput/batch_count_per_second_update_cum": 0.10698066764451905, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 482344960, "throughput/token_count": 482344960, "throughput/batch_count": 230, "throughput/flop_count": 
0, "throughput/total_time": 4019.5361906640464, "throughput/update_time": 2149.77923363226, "throughput/token_count_per_second_total_recent": 117110.98626028879, "throughput/token_count_per_second_total_cum": 120000.15353022964, "throughput/token_count_per_second_update_recent": 224527.31335335318, "throughput/token_count_per_second_update_cum": 224369.5317425834, "throughput/batch_count_per_second_total_recent": 0.05584286988272132, "throughput/batch_count_per_second_total_cum": 0.0572205321932934, "throughput/batch_count_per_second_update_recent": 0.10706296603839549, "throughput/batch_count_per_second_update_cum": 0.10698772990349932, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 503316480, "throughput/token_count": 503316480, "throughput/batch_count": 240, "throughput/flop_count": 0, "throughput/total_time": 4112.964794773026, "throughput/update_time": 2243.0702600192744, "throughput/token_count_per_second_total_recent": 129434.95689752694, "throughput/token_count_per_second_total_cum": 122373.15540352819, "throughput/token_count_per_second_update_recent": 224543.0238551471, "throughput/token_count_per_second_update_cum": 224387.30028709624, "throughput/batch_count_per_second_total_recent": 0.06171939701916072, "throughput/batch_count_per_second_total_cum": 0.05835206766296777, "throughput/batch_count_per_second_update_recent": 0.10707045738942485, "throughput/batch_count_per_second_update_cum": 0.10699620260577022, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, 
"throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 524288000, "throughput/token_count": 524288000, "throughput/batch_count": 250, "throughput/flop_count": 0, "throughput/total_time": 4376.393655605032, "throughput/update_time": 2336.412492537289, "throughput/token_count_per_second_total_recent": 117042.73880114176, "throughput/token_count_per_second_total_cum": 119799.09515875526, "throughput/token_count_per_second_update_recent": 224571.00929536848, "throughput/token_count_per_second_update_cum": 224398.73167714302, "throughput/batch_count_per_second_total_recent": 0.05581032695824707, "throughput/batch_count_per_second_total_cum": 0.057124660090806606, "throughput/batch_count_per_second_update_recent": 0.10708380188721107, "throughput/batch_count_per_second_update_cum": 0.10700165351731444, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 545259520, "throughput/token_count": 545259520, "throughput/batch_count": 260, "throughput/flop_count": 0, "throughput/total_time": 4470.06449480704, "throughput/update_time": 2429.936827432248, "throughput/token_count_per_second_total_recent": 129367.53301350547, "throughput/token_count_per_second_total_cum": 121980.23554994306, "throughput/token_count_per_second_update_recent": 224575.31418295638, "throughput/token_count_per_second_update_cum": 224392.46726269188, "throughput/batch_count_per_second_total_recent": 0.0616872468059089, "throughput/batch_count_per_second_total_cum": 0.05816470887658265, "throughput/batch_count_per_second_update_recent": 0.10708585461757487, "throughput/batch_count_per_second_update_cum": 0.10699866641172975, 
"throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 566231040, "throughput/token_count": 566231040, "throughput/batch_count": 270, "throughput/flop_count": 0, "throughput/total_time": 4734.132506006979, "throughput/update_time": 2523.346581262187, "throughput/token_count_per_second_total_recent": 116945.6700409605, "throughput/token_count_per_second_total_cum": 119606.08184953182, "throughput/token_count_per_second_update_recent": 224585.56252498896, "throughput/token_count_per_second_update_cum": 224396.85622447042, "throughput/batch_count_per_second_total_recent": 0.055764040966491936, "throughput/batch_count_per_second_total_cum": 0.057032624172941124, "throughput/batch_count_per_second_update_recent": 0.10709074140786598, "throughput/batch_count_per_second_update_cum": 0.1070007592317917, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 587202560, "throughput/token_count": 587202560, "throughput/batch_count": 280, "throughput/flop_count": 0, "throughput/total_time": 4827.830736390082, "throughput/update_time": 2616.8884120163275, "throughput/token_count_per_second_total_recent": 129229.88098536289, "throughput/token_count_per_second_total_cum": 121628.65519992722, "throughput/token_count_per_second_update_recent": 224539.47186052342, "throughput/token_count_per_second_update_cum": 224389.60610764334, "throughput/batch_count_per_second_total_recent": 
0.0616216092039885, "throughput/batch_count_per_second_total_cum": 0.05799706230160104, "throughput/batch_count_per_second_update_recent": 0.10706876366640254, "throughput/batch_count_per_second_update_cum": 0.10699730210668723, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 608174080, "throughput/token_count": 608174080, "throughput/batch_count": 290, "throughput/flop_count": 0, "throughput/total_time": 5090.937317650998, "throughput/update_time": 2710.3318849493517, "throughput/token_count_per_second_total_recent": 116893.6925539102, "throughput/token_count_per_second_total_cum": 119462.10335204376, "throughput/token_count_per_second_update_recent": 224535.43426215395, "throughput/token_count_per_second_update_cum": 224390.9992636806, "throughput/batch_count_per_second_total_recent": 0.05573925616927633, "throughput/batch_count_per_second_total_cum": 0.05696396987535656, "throughput/batch_count_per_second_update_recent": 0.10706683838947008, "throughput/batch_count_per_second_update_cum": 0.10699796641525297, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 629145600, "throughput/token_count": 629145600, "throughput/batch_count": 300, "throughput/flop_count": 0, "throughput/total_time": 5184.687426269054, "throughput/update_time": 2803.9371257049497, "throughput/token_count_per_second_total_recent": 129324.00418320857, 
"throughput/token_count_per_second_total_cum": 121346.87171541575, "throughput/token_count_per_second_update_recent": 224487.9932214861, "throughput/token_count_per_second_update_cum": 224379.353671072, "throughput/batch_count_per_second_total_recent": 0.061666490642170224, "throughput/batch_count_per_second_total_cum": 0.057862697465618014, "throughput/batch_count_per_second_update_recent": 0.10704421673845582, "throughput/batch_count_per_second_update_cum": 0.10699241336396789, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 650117120, "throughput/token_count": 650117120, "throughput/batch_count": 310, "throughput/flop_count": 0, "throughput/total_time": 5450.616348118056, "throughput/update_time": 2897.275741666206, "throughput/token_count_per_second_total_recent": 116777.55360394037, "throughput/token_count_per_second_total_cum": 119274.05608440358, "throughput/token_count_per_second_update_recent": 224480.87281584664, "throughput/token_count_per_second_update_cum": 224389.10824073703, "throughput/batch_count_per_second_total_recent": 0.05568387680241602, "throughput/batch_count_per_second_total_cum": 0.05687430195064715, "throughput/batch_count_per_second_update_recent": 0.10704082146446545, "throughput/batch_count_per_second_update_cum": 0.10699706470524646, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 671088640, "throughput/token_count": 
671088640, "throughput/batch_count": 320, "throughput/flop_count": 0, "throughput/total_time": 5543.975930526038, "throughput/update_time": 2990.483508925303, "throughput/token_count_per_second_total_recent": 129069.44233971256, "throughput/token_count_per_second_total_cum": 121048.2600230777, "throughput/token_count_per_second_update_recent": 224517.79547697058, "throughput/token_count_per_second_update_cum": 224408.0724729262, "throughput/batch_count_per_second_total_recent": 0.06154510609613064, "throughput/batch_count_per_second_total_cum": 0.057720308314837314, "throughput/batch_count_per_second_update_recent": 0.10705842756126908, "throughput/batch_count_per_second_update_cum": 0.10700610755583105, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 692060160, "throughput/token_count": 692060160, "throughput/batch_count": 330, "throughput/flop_count": 0, "throughput/total_time": 5806.160792221082, "throughput/update_time": 3083.611473838333, "throughput/token_count_per_second_total_recent": 116818.42219183678, "throughput/token_count_per_second_total_cum": 119194.10859706145, "throughput/token_count_per_second_update_recent": 224572.69191659385, "throughput/token_count_per_second_update_cum": 224431.69830943597, "throughput/batch_count_per_second_total_recent": 0.055703364463728325, "throughput/batch_count_per_second_total_cum": 0.05683618001797745, "throughput/batch_count_per_second_update_recent": 0.10708460422353451, "throughput/batch_count_per_second_update_cum": 0.10701737323257254, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 
0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 713031680, "throughput/token_count": 713031680, "throughput/batch_count": 340, "throughput/flop_count": 0, "throughput/total_time": 5899.68232584605, "throughput/update_time": 3176.9836701630848, "throughput/token_count_per_second_total_recent": 129163.98801957304, "throughput/token_count_per_second_total_cum": 120859.33455709362, "throughput/token_count_per_second_update_recent": 224559.44231278292, "throughput/token_count_per_second_update_cum": 224436.68398314362, "throughput/batch_count_per_second_total_recent": 0.06159018898943569, "throughput/batch_count_per_second_total_cum": 0.057630221632525264, "throughput/batch_count_per_second_update_recent": 0.10707828632010599, "throughput/batch_count_per_second_update_cum": 0.10701975058705503, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 734003200, "throughput/token_count": 734003200, "throughput/batch_count": 350, "throughput/flop_count": 0, "throughput/total_time": 6162.256411690032, "throughput/update_time": 3270.438694642042, "throughput/token_count_per_second_total_recent": 116870.24330697971, "throughput/token_count_per_second_total_cum": 119112.7325710706, "throughput/token_count_per_second_update_recent": 224531.0935238453, "throughput/token_count_per_second_update_cum": 224435.70069132227, "throughput/batch_count_per_second_total_recent": 0.055728074697007995, "throughput/batch_count_per_second_total_cum": 0.056797376904998115, "throughput/batch_count_per_second_update_recent": 0.10706476856415048, 
"throughput/batch_count_per_second_update_cum": 0.10701928171697725, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 754974720, "throughput/token_count": 754974720, "throughput/batch_count": 360, "throughput/flop_count": 0, "throughput/total_time": 6255.94410856301, "throughput/update_time": 3363.9935755479382, "throughput/token_count_per_second_total_recent": 129273.40659186934, "throughput/token_count_per_second_total_cum": 120681.18047388017, "throughput/token_count_per_second_update_recent": 224512.85140422042, "throughput/token_count_per_second_update_cum": 224428.10993687087, "throughput/batch_count_per_second_total_recent": 0.0616423638305041, "throughput/batch_count_per_second_total_cum": 0.05754527114576348, "throughput/batch_count_per_second_update_recent": 0.10705607004366895, "throughput/batch_count_per_second_update_cum": 0.10701566216319602, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 775946240, "throughput/token_count": 775946240, "throughput/batch_count": 370, "throughput/flop_count": 0, "throughput/total_time": 6518.670226996997, "throughput/update_time": 3457.366311661084, "throughput/token_count_per_second_total_recent": 116957.66513630486, "throughput/token_count_per_second_total_cum": 119034.43692954855, "throughput/token_count_per_second_update_recent": 224532.01650175478, "throughput/token_count_per_second_update_cum": 
224432.75315747445, "throughput/batch_count_per_second_total_recent": 0.05576976067366832, "throughput/batch_count_per_second_total_cum": 0.05676004263379505, "throughput/batch_count_per_second_update_recent": 0.10706520867431392, "throughput/batch_count_per_second_update_cum": 0.10701787622331355, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 796917760, "throughput/token_count": 796917760, "throughput/batch_count": 380, "throughput/flop_count": 0, "throughput/total_time": 6612.111398222041, "throughput/update_time": 3550.66807099001, "throughput/token_count_per_second_total_recent": 129322.96562362855, "throughput/token_count_per_second_total_cum": 120523.94643778788, "throughput/token_count_per_second_update_recent": 224580.55246496102, "throughput/token_count_per_second_update_cum": 224441.6386062808, "throughput/batch_count_per_second_total_recent": 0.061665995418371464, "throughput/batch_count_per_second_total_cum": 0.05747029611482042, "throughput/batch_count_per_second_update_recent": 0.10708835242507983, "throughput/batch_count_per_second_update_cum": 0.10702211313547172, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 817889280, "throughput/token_count": 817889280, "throughput/batch_count": 390, "throughput/flop_count": 0, "throughput/total_time": 6874.636505312985, "throughput/update_time": 3643.886076678871, 
"throughput/token_count_per_second_total_recent": 117012.52120646667, "throughput/token_count_per_second_total_cum": 118972.00373691082, "throughput/token_count_per_second_update_recent": 224645.14476146828, "throughput/token_count_per_second_update_cum": 224455.22795966352, "throughput/batch_count_per_second_total_recent": 0.05579591808627447, "throughput/batch_count_per_second_total_cum": 0.05673027216764012, "throughput/batch_count_per_second_update_recent": 0.10711915243218817, "throughput/batch_count_per_second_update_cum": 0.10702859304412056, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 838860800, "throughput/token_count": 838860800, "throughput/batch_count": 400, "throughput/flop_count": 0, "throughput/total_time": 6968.171832425054, "throughput/update_time": 3737.2908036899753, "throughput/token_count_per_second_total_recent": 129623.68668942692, "throughput/token_count_per_second_total_cum": 120384.63174752978, "throughput/token_count_per_second_update_recent": 224685.75314142517, "throughput/token_count_per_second_update_cum": 224456.92456464976, "throughput/batch_count_per_second_total_recent": 0.06180939039679857, "throughput/batch_count_per_second_total_cum": 0.057403865693821804, "throughput/batch_count_per_second_update_recent": 0.10713851601668604, "throughput/batch_count_per_second_update_cum": 0.10702940204842079, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, 
"throughput/mfu_update_cum": 0.0} +{"step": 859832320, "throughput/token_count": 859832320, "throughput/batch_count": 410, "throughput/flop_count": 0, "throughput/total_time": 7233.323964387993, "throughput/update_time": 3830.6057550550904, "throughput/token_count_per_second_total_recent": 117076.34737691491, "throughput/token_count_per_second_total_cum": 118870.9816169211, "throughput/token_count_per_second_update_recent": 224693.8896924627, "throughput/token_count_per_second_update_cum": 224463.79893449365, "throughput/batch_count_per_second_total_recent": 0.055826352776009995, "throughput/batch_count_per_second_total_cum": 0.05668210106702857, "throughput/batch_count_per_second_update_recent": 0.10714239582656035, "throughput/batch_count_per_second_update_cum": 0.10703268000340159, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 880803840, "throughput/token_count": 880803840, "throughput/batch_count": 420, "throughput/flop_count": 0, "throughput/total_time": 7326.922928820015, "throughput/update_time": 3924.0719714582665, "throughput/token_count_per_second_total_recent": 129380.86066728763, "throughput/token_count_per_second_total_cum": 120214.69975279944, "throughput/token_count_per_second_update_recent": 224624.37543521074, "throughput/token_count_per_second_update_cum": 224461.69346702247, "throughput/batch_count_per_second_total_recent": 0.061693601926463906, "throughput/batch_count_per_second_total_cum": 0.0573228358043668, "throughput/batch_count_per_second_update_recent": 0.10710924884567773, "throughput/batch_count_per_second_update_cum": 0.10703167603827594, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 
0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 901775360, "throughput/token_count": 901775360, "throughput/batch_count": 430, "throughput/flop_count": 0, "throughput/total_time": 7591.2802984340815, "throughput/update_time": 4017.3633328623837, "throughput/token_count_per_second_total_recent": 116917.42884406337, "throughput/token_count_per_second_total_cum": 118790.94494587652, "throughput/token_count_per_second_update_recent": 224591.63878955206, "throughput/token_count_per_second_update_cum": 224469.45553154196, "throughput/batch_count_per_second_total_recent": 0.055750574514419254, "throughput/batch_count_per_second_total_cum": 0.05664393660825563, "throughput/batch_count_per_second_update_recent": 0.10709363879659274, "throughput/batch_count_per_second_update_cum": 0.1070353772790632, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 922746880, "throughput/token_count": 922746880, "throughput/batch_count": 440, "throughput/flop_count": 0, "throughput/total_time": 7684.845341357053, "throughput/update_time": 4110.783207958448, "throughput/token_count_per_second_total_recent": 129213.52560337495, "throughput/token_count_per_second_total_cum": 120073.57845370167, "throughput/token_count_per_second_update_recent": 224589.70739768277, "throughput/token_count_per_second_update_cum": 224469.84754962716, "throughput/batch_count_per_second_total_recent": 0.06161381035012004, "throughput/batch_count_per_second_total_cum": 
0.057255543925143086, "throughput/batch_count_per_second_update_recent": 0.10709271783718241, "throughput/batch_count_per_second_update_cum": 0.10703556420785292, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 943718400, "throughput/token_count": 943718400, "throughput/batch_count": 450, "throughput/flop_count": 0, "throughput/total_time": 7948.907636292046, "throughput/update_time": 4204.296549194376, "throughput/token_count_per_second_total_recent": 116818.08410102178, "throughput/token_count_per_second_total_cum": 118723.030028843, "throughput/token_count_per_second_update_recent": 224571.1785686059, "throughput/token_count_per_second_update_cum": 224465.23192585798, "throughput/batch_count_per_second_total_recent": 0.055703203249464886, "throughput/batch_count_per_second_total_cum": 0.05661155225221777, "throughput/batch_count_per_second_update_recent": 0.10708388260298056, "throughput/batch_count_per_second_update_cum": 0.10703336330693149, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 964689920, "throughput/token_count": 964689920, "throughput/batch_count": 460, "throughput/flop_count": 0, "throughput/total_time": 8042.599143097992, "throughput/update_time": 4297.83159168635, "throughput/token_count_per_second_total_recent": 129107.34071039618, "throughput/token_count_per_second_total_cum": 119947.53223873887, 
"throughput/token_count_per_second_update_recent": 224566.5535687632, "throughput/token_count_per_second_update_cum": 224459.68377776345, "throughput/batch_count_per_second_total_recent": 0.061563177447507944, "throughput/batch_count_per_second_total_cum": 0.057195440406197964, "throughput/batch_count_per_second_update_recent": 0.10708167723119888, "throughput/batch_count_per_second_update_cum": 0.1070307177437608, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 985661440, "throughput/token_count": 985661440, "throughput/batch_count": 470, "throughput/flop_count": 0, "throughput/total_time": 8305.275031784084, "throughput/update_time": 4391.248921588529, "throughput/token_count_per_second_total_recent": 116819.36783111202, "throughput/token_count_per_second_total_cum": 118678.96442055174, "throughput/token_count_per_second_update_recent": 224559.12343167048, "throughput/token_count_per_second_update_cum": 224460.38874139666, "throughput/batch_count_per_second_total_recent": 0.05570381537967301, "throughput/batch_count_per_second_total_cum": 0.05659054013278567, "throughput/batch_count_per_second_update_recent": 0.10707813426574253, "throughput/batch_count_per_second_update_cum": 0.1070310538966163, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1006632960, "throughput/token_count": 1006632960, "throughput/batch_count": 480, "throughput/flop_count": 
0, "throughput/total_time": 8398.81305668701, "throughput/update_time": 4484.626386589487, "throughput/token_count_per_second_total_recent": 129099.14227288493, "throughput/token_count_per_second_total_cum": 119854.19287294814, "throughput/token_count_per_second_update_recent": 224535.74415480194, "throughput/token_count_per_second_update_cum": 224463.05962301893, "throughput/batch_count_per_second_total_recent": 0.06155926812786337, "throughput/batch_count_per_second_total_cum": 0.05715093272826583, "throughput/batch_count_per_second_update_recent": 0.10706698615779969, "throughput/batch_count_per_second_update_cum": 0.10703232747221895, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1027604480, "throughput/token_count": 1027604480, "throughput/batch_count": 490, "throughput/flop_count": 0, "throughput/total_time": 8662.347046800074, "throughput/update_time": 4577.979232876445, "throughput/token_count_per_second_total_recent": 116751.37245590844, "throughput/token_count_per_second_total_cum": 118628.87442031124, "throughput/token_count_per_second_update_recent": 224525.1462226793, "throughput/token_count_per_second_update_cum": 224466.82864359205, "throughput/batch_count_per_second_total_recent": 0.05567139265818998, "throughput/batch_count_per_second_total_cum": 0.0565666553594166, "throughput/batch_count_per_second_update_recent": 0.1070619326699635, "throughput/batch_count_per_second_update_cum": 0.10703412468127825, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, 
"throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1048576000, "throughput/token_count": 1048576000, "throughput/batch_count": 500, "throughput/flop_count": 0, "throughput/total_time": 8755.968361919047, "throughput/update_time": 4671.454424570431, "throughput/token_count_per_second_total_recent": 129215.69322892043, "throughput/token_count_per_second_total_cum": 119755.57204619494, "throughput/token_count_per_second_update_recent": 224487.94504038664, "throughput/token_count_per_second_update_cum": 224464.5681406649, "throughput/batch_count_per_second_total_recent": 0.0616148439545252, "throughput/batch_count_per_second_total_cum": 0.05710390665349719, "throughput/batch_count_per_second_update_recent": 0.10704419376391727, "throughput/batch_count_per_second_update_cum": 0.10703304678948636, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1069547520, "throughput/token_count": 1069547520, "throughput/batch_count": 510, "throughput/flop_count": 0, "throughput/total_time": 9021.309424316045, "throughput/update_time": 4764.801301084226, "throughput/token_count_per_second_total_recent": 116730.67923711608, "throughput/token_count_per_second_total_cum": 118557.90215078321, "throughput/token_count_per_second_update_recent": 224490.77415512942, "throughput/token_count_per_second_update_cum": 224468.44105684437, "throughput/batch_count_per_second_total_recent": 0.05566152536254696, "throughput/batch_count_per_second_total_cum": 0.056532813144103626, "throughput/batch_count_per_second_update_recent": 0.10704554279095145, "throughput/batch_count_per_second_update_cum": 
0.10703489353983134, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1090519040, "throughput/token_count": 1090519040, "throughput/batch_count": 520, "throughput/flop_count": 0, "throughput/total_time": 9114.625705051003, "throughput/update_time": 4857.963408218231, "throughput/token_count_per_second_total_recent": 129159.94322405629, "throughput/token_count_per_second_total_cum": 119644.96132799759, "throughput/token_count_per_second_update_recent": 224553.88055704377, "throughput/token_count_per_second_update_cum": 224480.70278898472, "throughput/batch_count_per_second_total_recent": 0.06158826028063597, "throughput/batch_count_per_second_total_cum": 0.05705116335296516, "throughput/batch_count_per_second_update_recent": 0.10707563426830471, "throughput/batch_count_per_second_update_cum": 0.10704074038933979, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1111490560, "throughput/token_count": 1111490560, "throughput/batch_count": 530, "throughput/flop_count": 0, "throughput/total_time": 9398.054264329025, "throughput/update_time": 4970.6049944133265, "throughput/token_count_per_second_total_recent": 115509.3150142174, "throughput/token_count_per_second_total_cum": 118268.15729493504, "throughput/token_count_per_second_update_recent": 219956.33110559382, "throughput/token_count_per_second_update_cum": 223612.73149832894, 
"throughput/batch_count_per_second_total_recent": 0.05507913351736898, "throughput/batch_count_per_second_total_cum": 0.05639465203043701, "throughput/batch_count_per_second_update_recent": 0.10488335185317699, "throughput/batch_count_per_second_update_cum": 0.10662685942570159, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1132462080, "throughput/token_count": 1132462080, "throughput/batch_count": 540, "throughput/flop_count": 0, "throughput/total_time": 9606.105255214032, "throughput/update_time": 5178.118606777163, "throughput/token_count_per_second_total_recent": 119216.26091216251, "throughput/token_count_per_second_total_cum": 117889.82630450762, "throughput/token_count_per_second_update_recent": 196232.45082067943, "throughput/token_count_per_second_update_cum": 218701.4562620919, "throughput/batch_count_per_second_total_recent": 0.056846743064957864, "throughput/batch_count_per_second_total_cum": 0.056214249756101425, "throughput/batch_count_per_second_update_recent": 0.09357092419656726, "throughput/batch_count_per_second_update_cum": 0.10428498089890094, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1153433600, "throughput/token_count": 1153433600, "throughput/batch_count": 550, "throughput/flop_count": 0, "throughput/total_time": 10003.42233235808, "throughput/update_time": 5362.020449607167, 
"throughput/token_count_per_second_total_recent": 101518.23119249807, "throughput/token_count_per_second_total_cum": 115303.89917348456, "throughput/token_count_per_second_update_recent": 180794.0299722606, "throughput/token_count_per_second_update_cum": 215111.74954293636, "throughput/batch_count_per_second_total_recent": 0.048407664867638624, "throughput/batch_count_per_second_total_cum": 0.05498118361162403, "throughput/batch_count_per_second_update_recent": 0.08620931147206336, "throughput/batch_count_per_second_update_cum": 0.10257327534815615, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1174405120, "throughput/token_count": 1174405120, "throughput/batch_count": 560, "throughput/flop_count": 0, "throughput/total_time": 10150.524502051994, "throughput/update_time": 5508.941767138196, "throughput/token_count_per_second_total_recent": 107603.4727303254, "throughput/token_count_per_second_total_cum": 115698.95917817709, "throughput/token_count_per_second_update_recent": 172756.31406939292, "throughput/token_count_per_second_update_cum": 213181.61811140072, "throughput/batch_count_per_second_total_recent": 0.05130933414951582, "throughput/batch_count_per_second_total_cum": 0.05516956290158133, "throughput/batch_count_per_second_update_recent": 0.08237662986249586, "throughput/batch_count_per_second_update_cum": 0.10165291696138416, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, 
"throughput/mfu_update_cum": 0.0} +{"step": 1195376640, "throughput/token_count": 1195376640, "throughput/batch_count": 570, "throughput/flop_count": 0, "throughput/total_time": 10566.198982721078, "throughput/update_time": 5678.196010601241, "throughput/token_count_per_second_total_recent": 92210.5532831218, "throughput/token_count_per_second_total_cum": 113132.1340772402, "throughput/token_count_per_second_update_recent": 162504.396500761, "throughput/token_count_per_second_update_cum": 210520.49590542866, "throughput/batch_count_per_second_total_recent": 0.04396941818386164, "throughput/batch_count_per_second_total_cum": 0.053945605314846136, "throughput/batch_count_per_second_update_recent": 0.07748813462293673, "throughput/batch_count_per_second_update_cum": 0.10038399501105721, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1216348160, "throughput/token_count": 1216348160, "throughput/batch_count": 580, "throughput/flop_count": 0, "throughput/total_time": 10760.646769667044, "throughput/update_time": 5872.135236821254, "throughput/token_count_per_second_total_recent": 95132.42119636018, "throughput/token_count_per_second_total_cum": 113036.71480312296, "throughput/token_count_per_second_update_recent": 150652.10213752172, "throughput/token_count_per_second_update_cum": 207138.98964262314, "throughput/batch_count_per_second_total_recent": 0.045362673376255123, "throughput/batch_count_per_second_total_cum": 0.053900105859338264, "throughput/batch_count_per_second_update_recent": 0.07183652026058279, "throughput/batch_count_per_second_update_cum": 0.09877156717425496, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 
0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1237319680, "throughput/token_count": 1237319680, "throughput/batch_count": 590, "throughput/flop_count": 0, "throughput/total_time": 11156.295638620039, "throughput/update_time": 6039.766134388163, "throughput/token_count_per_second_total_recent": 83562.53331182865, "throughput/token_count_per_second_total_cum": 110907.75290292042, "throughput/token_count_per_second_update_recent": 142943.50515945963, "throughput/token_count_per_second_update_cum": 204862.18381125154, "throughput/batch_count_per_second_total_recent": 0.03984572091666634, "throughput/batch_count_per_second_total_cum": 0.05288493771692296, "throughput/batch_count_per_second_update_recent": 0.06816077478383047, "throughput/batch_count_per_second_update_cum": 0.09768590155184342, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1258291200, "throughput/token_count": 1258291200, "throughput/batch_count": 600, "throughput/flop_count": 0, "throughput/total_time": 11310.365665178979, "throughput/update_time": 6193.65441437799, "throughput/token_count_per_second_total_recent": 87483.72753738798, "throughput/token_count_per_second_total_cum": 111251.15113420945, "throughput/token_count_per_second_update_recent": 137232.68247600523, "throughput/token_count_per_second_update_cum": 203158.12213852204, "throughput/batch_count_per_second_total_recent": 0.041715492027944556, "throughput/batch_count_per_second_total_cum": 
0.0530486827536628, "throughput/batch_count_per_second_update_recent": 0.06543764232445012, "throughput/batch_count_per_second_update_cum": 0.09687334162641623, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1279262720, "throughput/token_count": 1279262720, "throughput/batch_count": 610, "throughput/flop_count": 0, "throughput/total_time": 11736.789658116992, "throughput/update_time": 6379.89381142694, "throughput/token_count_per_second_total_recent": 76720.99175235892, "throughput/token_count_per_second_total_cum": 108995.9654440327, "throughput/token_count_per_second_update_recent": 129294.0809366229, "throughput/token_count_per_second_update_cum": 200514.73548176148, "throughput/batch_count_per_second_total_recent": 0.03658341968172022, "throughput/batch_count_per_second_total_cum": 0.051973326417938566, "throughput/batch_count_per_second_update_recent": 0.0616522221263041, "throughput/batch_count_per_second_update_cum": 0.09561287664497446, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1300234240, "throughput/token_count": 1300234240, "throughput/batch_count": 620, "throughput/flop_count": 0, "throughput/total_time": 11914.080722624087, "throughput/update_time": 6556.690899457899, "throughput/token_count_per_second_total_recent": 79259.06095347612, "throughput/token_count_per_second_total_cum": 109134.24797692845, 
"throughput/token_count_per_second_update_recent": 122893.13366319254, "throughput/token_count_per_second_update_cum": 198306.47195943035, "throughput/batch_count_per_second_total_recent": 0.03779366538690382, "throughput/batch_count_per_second_total_cum": 0.05203926466795371, "throughput/batch_count_per_second_update_recent": 0.058600012618633526, "throughput/batch_count_per_second_update_cum": 0.09455989454242246, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1321205760, "throughput/token_count": 1321205760, "throughput/batch_count": 630, "throughput/flop_count": 0, "throughput/total_time": 12308.019568020012, "throughput/update_time": 6703.663481310825, "throughput/token_count_per_second_total_recent": 71931.54807924517, "throughput/token_count_per_second_total_cum": 107345.11370398659, "throughput/token_count_per_second_update_recent": 121429.83909176495, "throughput/token_count_per_second_update_cum": 197087.12462721253, "throughput/batch_count_per_second_total_recent": 0.03429963497125872, "throughput/batch_count_per_second_total_cum": 0.05118613896560029, "throughput/batch_count_per_second_update_recent": 0.05790225939357994, "throughput/batch_count_per_second_update_cum": 0.09397846442566515, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1342177280, "throughput/token_count": 1342177280, "throughput/batch_count": 640, 
"throughput/flop_count": 0, "throughput/total_time": 12483.778021455975, "throughput/update_time": 6879.228759005899, "throughput/token_count_per_second_total_recent": 78608.23739856138, "throughput/token_count_per_second_total_cum": 107513.70920671518, "throughput/token_count_per_second_update_recent": 123738.33769575204, "throughput/token_count_per_second_update_cum": 195105.77813579715, "throughput/batch_count_per_second_total_recent": 0.03748332853248662, "throughput/batch_count_per_second_total_cum": 0.051266531566007224, "throughput/batch_count_per_second_update_recent": 0.05900303730762102, "throughput/batch_count_per_second_update_cum": 0.09303368479528291, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1363148800, "throughput/token_count": 1363148800, "throughput/batch_count": 650, "throughput/flop_count": 0, "throughput/total_time": 12905.641336273053, "throughput/update_time": 7080.2444989720825, "throughput/token_count_per_second_total_recent": 71902.99004326043, "throughput/token_count_per_second_total_cum": 105624.25876261458, "throughput/token_count_per_second_update_recent": 121874.93834073495, "throughput/token_count_per_second_update_cum": 192528.49251150907, "throughput/batch_count_per_second_total_recent": 0.0342860174385359, "throughput/batch_count_per_second_total_cum": 0.05036557138567666, "throughput/batch_count_per_second_update_recent": 0.05811449925457714, "throughput/batch_count_per_second_update_cum": 0.09180473924231962, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, 
"throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1384120320, "throughput/token_count": 1384120320, "throughput/batch_count": 660, "throughput/flop_count": 0, "throughput/total_time": 13068.73947558098, "throughput/update_time": 7242.851195934112, "throughput/token_count_per_second_total_recent": 78132.66016599408, "throughput/token_count_per_second_total_cum": 105910.77453079827, "throughput/token_count_per_second_update_recent": 120762.36436210027, "throughput/token_count_per_second_update_cum": 191101.58176064663, "throughput/batch_count_per_second_total_recent": 0.03725655563640312, "throughput/batch_count_per_second_total_cum": 0.050502192750357756, "throughput/batch_count_per_second_update_recent": 0.05758398264031423, "throughput/batch_count_per_second_update_cum": 0.09112433517486888, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1405091840, "throughput/token_count": 1405091840, "throughput/batch_count": 670, "throughput/flop_count": 0, "throughput/total_time": 13461.124407541007, "throughput/update_time": 7389.911005128175, "throughput/token_count_per_second_total_recent": 72309.30064031856, "throughput/token_count_per_second_total_cum": 104381.46156741993, "throughput/token_count_per_second_update_recent": 122967.68270484045, "throughput/token_count_per_second_update_cum": 190136.50354178104, "throughput/batch_count_per_second_total_recent": 0.03447976142898491, "throughput/batch_count_per_second_total_cum": 0.04977295950289723, "throughput/batch_count_per_second_update_recent": 0.058635560371799685, 
"throughput/batch_count_per_second_update_cum": 0.09066415001954128, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1426063360, "throughput/token_count": 1426063360, "throughput/batch_count": 680, "throughput/flop_count": 0, "throughput/total_time": 13651.783316303976, "throughput/update_time": 7580.372131183161, "throughput/token_count_per_second_total_recent": 78647.12341375464, "throughput/token_count_per_second_total_cum": 104459.85897658432, "throughput/token_count_per_second_update_recent": 123226.4636210262, "throughput/token_count_per_second_update_cum": 188125.7721020903, "throughput/batch_count_per_second_total_recent": 0.03750187082946522, "throughput/batch_count_per_second_total_cum": 0.049810342300693665, "throughput/batch_count_per_second_update_recent": 0.058758956728470896, "throughput/batch_count_per_second_update_cum": 0.08970535855392947, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1447034880, "throughput/token_count": 1447034880, "throughput/batch_count": 690, "throughput/flop_count": 0, "throughput/total_time": 14071.216295699007, "throughput/update_time": 7793.8012771270005, "throughput/token_count_per_second_total_recent": 71588.55897690839, "throughput/token_count_per_second_total_cum": 102836.51744037928, "throughput/token_count_per_second_update_recent": 119367.07593783569, 
"throughput/token_count_per_second_update_cum": 185664.84165393232, "throughput/batch_count_per_second_total_recent": 0.03413608502240581, "throughput/batch_count_per_second_total_cum": 0.04903627273577656, "throughput/batch_count_per_second_update_recent": 0.05691865727321419, "throughput/batch_count_per_second_update_cum": 0.08853189547249428, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1468006400, "throughput/token_count": 1468006400, "throughput/batch_count": 700, "throughput/flop_count": 0, "throughput/total_time": 14221.494554364006, "throughput/update_time": 7943.600337238051, "throughput/token_count_per_second_total_recent": 78156.6384454736, "throughput/token_count_per_second_total_cum": 103224.48139246572, "throughput/token_count_per_second_update_recent": 119644.76141885245, "throughput/token_count_per_second_update_cum": 184803.6579985365, "throughput/batch_count_per_second_total_recent": 0.037267989371048735, "throughput/batch_count_per_second_total_cum": 0.04922126836417471, "throughput/batch_count_per_second_update_recent": 0.05705106802885649, "throughput/batch_count_per_second_update_cum": 0.08812125110556436, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1488977920, "throughput/token_count": 1488977920, "throughput/batch_count": 710, "throughput/flop_count": 0, "throughput/total_time": 14616.199775510002, 
"throughput/update_time": 8090.416964606033, "throughput/token_count_per_second_total_recent": 72699.85025739913, "throughput/token_count_per_second_total_cum": 101871.75482472805, "throughput/token_count_per_second_update_recent": 123047.42345948138, "throughput/token_count_per_second_update_cum": 184042.17316783333, "throughput/batch_count_per_second_total_recent": 0.034665989998530926, "throughput/batch_count_per_second_total_cum": 0.04857623807178881, "throughput/batch_count_per_second_update_recent": 0.058673583726635635, "throughput/batch_count_per_second_update_cum": 0.08775814684287707, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1509949440, "throughput/token_count": 1509949440, "throughput/batch_count": 720, "throughput/flop_count": 0, "throughput/total_time": 14821.623726570979, "throughput/update_time": 8295.638736715424, "throughput/token_count_per_second_total_recent": 78465.25569253047, "throughput/token_count_per_second_total_cum": 101874.76540057402, "throughput/token_count_per_second_update_recent": 120411.52851180467, "throughput/token_count_per_second_update_cum": 182017.26086710588, "throughput/batch_count_per_second_total_recent": 0.037415149542107805, "throughput/batch_count_per_second_total_cum": 0.04857767362621976, "throughput/batch_count_per_second_update_recent": 0.0574166910704635, "throughput/batch_count_per_second_update_cum": 0.08679259341578764, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, 
"throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1530920960, "throughput/token_count": 1530920960, "throughput/batch_count": 730, "throughput/flop_count": 0, "throughput/total_time": 15232.941850902047, "throughput/update_time": 8512.26620015048, "throughput/token_count_per_second_total_recent": 71341.74912418645, "throughput/token_count_per_second_total_cum": 100500.67642773439, "throughput/token_count_per_second_update_recent": 115734.04821611205, "throughput/token_count_per_second_update_cum": 179848.8115859131, "throughput/batch_count_per_second_total_recent": 0.03401839691361735, "throughput/batch_count_per_second_total_cum": 0.04792245694529266, "throughput/batch_count_per_second_update_recent": 0.055186294658714316, "throughput/batch_count_per_second_update_cum": 0.08575859622283606, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1551892480, "throughput/token_count": 1551892480, "throughput/batch_count": 740, "throughput/flop_count": 0, "throughput/total_time": 15380.238736488041, "throughput/update_time": 8659.387788099353, "throughput/token_count_per_second_total_recent": 78020.63925665422, "throughput/token_count_per_second_total_cum": 100901.715934896, "throughput/token_count_per_second_update_recent": 117600.87935099858, "throughput/token_count_per_second_update_cum": 179215.0343622184, "throughput/batch_count_per_second_total_recent": 0.03720313990433417, "throughput/batch_count_per_second_total_cum": 0.04811368748421478, "throughput/batch_count_per_second_update_recent": 0.05607646911191873, "throughput/batch_count_per_second_update_cum": 0.08545638769255562, 
"throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1572864000, "throughput/token_count": 1572864000, "throughput/batch_count": 750, "throughput/flop_count": 0, "throughput/total_time": 15773.814038521028, "throughput/update_time": 8806.402406750247, "throughput/token_count_per_second_total_recent": 72991.06663849614, "throughput/token_count_per_second_total_cum": 99713.6137245519, "throughput/token_count_per_second_update_recent": 121929.10356330319, "throughput/token_count_per_second_update_cum": 178604.60235094128, "throughput/batch_count_per_second_total_recent": 0.03480485279011542, "throughput/batch_count_per_second_total_cum": 0.04754715620257945, "throughput/batch_count_per_second_update_recent": 0.058140327245380014, "throughput/batch_count_per_second_update_cum": 0.08516531102702202, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1593835520, "throughput/token_count": 1593835520, "throughput/batch_count": 760, "throughput/flop_count": 0, "throughput/total_time": 15988.252661015023, "throughput/update_time": 9020.618778260308, "throughput/token_count_per_second_total_recent": 78063.02847106171, "throughput/token_count_per_second_total_cum": 99687.91173073783, "throughput/token_count_per_second_update_recent": 117759.86866995473, "throughput/token_count_per_second_update_cum": 176688.04759171774, "throughput/batch_count_per_second_total_recent": 
0.037223352656870705, "throughput/batch_count_per_second_total_cum": 0.047534900536888997, "throughput/batch_count_per_second_update_recent": 0.056152281126954424, "throughput/batch_count_per_second_update_cum": 0.08425142650209319, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1614807040, "throughput/token_count": 1614807040, "throughput/batch_count": 770, "throughput/flop_count": 0, "throughput/total_time": 16392.140212937025, "throughput/update_time": 9237.03149983345, "throughput/token_count_per_second_total_recent": 71192.91558751644, "throughput/token_count_per_second_total_cum": 98511.05584892203, "throughput/token_count_per_second_update_recent": 113302.19701691475, "throughput/token_count_per_second_update_cum": 174818.83005694154, "throughput/batch_count_per_second_total_recent": 0.0339474275529463, "throughput/batch_count_per_second_total_cum": 0.046973731922589315, "throughput/batch_count_per_second_update_recent": 0.05402669764371622, "throughput/batch_count_per_second_update_cum": 0.08336011412474706, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1635778560, "throughput/token_count": 1635778560, "throughput/batch_count": 780, "throughput/flop_count": 0, "throughput/total_time": 16539.33981846203, "throughput/update_time": 9384.053210339276, "throughput/token_count_per_second_total_recent": 77841.96383942213, 
"throughput/token_count_per_second_total_cum": 98902.2886012695, "throughput/token_count_per_second_update_recent": 116043.41011144903, "throughput/token_count_per_second_update_cum": 174314.71490354638, "throughput/batch_count_per_second_total_recent": 0.037117940826140464, "throughput/batch_count_per_second_total_cum": 0.04716028623641467, "throughput/batch_count_per_second_update_recent": 0.05533380990574314, "throughput/batch_count_per_second_update_cum": 0.08311973328759498, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1656750080, "throughput/token_count": 1656750080, "throughput/batch_count": 790, "throughput/flop_count": 0, "throughput/total_time": 16932.73472051299, "throughput/update_time": 9530.820382265258, "throughput/token_count_per_second_total_recent": 73013.63554982176, "throughput/token_count_per_second_total_cum": 97843.0305172706, "throughput/token_count_per_second_update_recent": 120749.5623616512, "throughput/token_count_per_second_update_cum": 173830.7945749187, "throughput/batch_count_per_second_total_recent": 0.03481561448565567, "throughput/batch_count_per_second_total_cum": 0.04665519262183695, "throughput/batch_count_per_second_update_recent": 0.05757787817080078, "throughput/batch_count_per_second_update_cum": 0.08288898209329543, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1677721600, "throughput/token_count": 
1677721600, "throughput/batch_count": 800, "throughput/flop_count": 0, "throughput/total_time": 17149.437690322986, "throughput/update_time": 9747.28814456216, "throughput/token_count_per_second_total_recent": 77890.61070606952, "throughput/token_count_per_second_total_cum": 97829.53997067192, "throughput/token_count_per_second_update_recent": 116051.18183575623, "throughput/token_count_per_second_update_cum": 172121.8840684392, "throughput/batch_count_per_second_total_recent": 0.037141137459788094, "throughput/batch_count_per_second_total_cum": 0.046648759827934226, "throughput/batch_count_per_second_update_recent": 0.05533751575267612, "throughput/batch_count_per_second_update_cum": 0.08207411006376228, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1698693120, "throughput/token_count": 1698693120, "throughput/batch_count": 810, "throughput/flop_count": 0, "throughput/total_time": 17549.297566386987, "throughput/update_time": 9954.213539150194, "throughput/token_count_per_second_total_recent": 71141.40830327584, "throughput/token_count_per_second_total_cum": 96795.50498098502, "throughput/token_count_per_second_update_recent": 112278.35825361656, "throughput/token_count_per_second_update_cum": 170650.6609807991, "throughput/batch_count_per_second_total_recent": 0.03392286696590225, "throughput/batch_count_per_second_total_cum": 0.046155693521969327, "throughput/batch_count_per_second_update_recent": 0.05353849327736691, "throughput/batch_count_per_second_update_cum": 0.08137257622756915, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 
0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1719664640, "throughput/token_count": 1719664640, "throughput/batch_count": 820, "throughput/flop_count": 0, "throughput/total_time": 17696.358517175075, "throughput/update_time": 10101.101109507843, "throughput/token_count_per_second_total_recent": 77883.41207897212, "throughput/token_count_per_second_total_cum": 97176.18674661184, "throughput/token_count_per_second_update_recent": 115922.41139822273, "throughput/token_count_per_second_update_cum": 170245.26547718, "throughput/batch_count_per_second_total_recent": 0.037137704886900005, "throughput/batch_count_per_second_total_cum": 0.04633721673327057, "throughput/batch_count_per_second_update_recent": 0.05527611322318207, "throughput/batch_count_per_second_update_cum": 0.08117926858767509, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1740636160, "throughput/token_count": 1740636160, "throughput/batch_count": 830, "throughput/flop_count": 0, "throughput/total_time": 18089.859054057975, "throughput/update_time": 10248.098395384499, "throughput/token_count_per_second_total_recent": 73049.40806125935, "throughput/token_count_per_second_total_cum": 96221.65406587481, "throughput/token_count_per_second_update_recent": 120629.0513034346, "throughput/token_count_per_second_update_cum": 169849.67287042653, "throughput/batch_count_per_second_total_recent": 0.03483267214835136, "throughput/batch_count_per_second_total_cum": 0.045882060082375914, "throughput/batch_count_per_second_update_recent": 0.05752041402026872, 
"throughput/batch_count_per_second_update_cum": 0.0809906353332646, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1761607680, "throughput/token_count": 1761607680, "throughput/batch_count": 840, "throughput/flop_count": 0, "throughput/total_time": 18306.998820198, "throughput/update_time": 10465.017247352516, "throughput/token_count_per_second_total_recent": 77887.98132338678, "throughput/token_count_per_second_total_cum": 96225.91323141557, "throughput/token_count_per_second_update_recent": 115929.63905089132, "throughput/token_count_per_second_update_cum": 168332.99347362842, "throughput/batch_count_per_second_total_recent": 0.037139883672421825, "throughput/batch_count_per_second_total_cum": 0.045884091010768686, "throughput/batch_count_per_second_update_recent": 0.05527955963654104, "throughput/batch_count_per_second_update_cum": 0.08026742623979016, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1782579200, "throughput/token_count": 1782579200, "throughput/batch_count": 850, "throughput/flop_count": 0, "throughput/total_time": 18706.20748156903, "throughput/update_time": 10662.364285671501, "throughput/token_count_per_second_total_recent": 71159.4793552539, "throughput/token_count_per_second_total_cum": 95293.45816122007, "throughput/token_count_per_second_update_recent": 112758.28728588998, "throughput/token_count_per_second_update_cum": 
167184.23346269448, "throughput/batch_count_per_second_total_recent": 0.033931483914973214, "throughput/batch_count_per_second_total_cum": 0.04543946178494457, "throughput/batch_count_per_second_update_recent": 0.05376734127325534, "throughput/batch_count_per_second_update_cum": 0.07971965478071903, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1803550720, "throughput/token_count": 1803550720, "throughput/batch_count": 860, "throughput/flop_count": 0, "throughput/total_time": 18853.344159234082, "throughput/update_time": 10809.319413250545, "throughput/token_count_per_second_total_recent": 78216.84499152997, "throughput/token_count_per_second_total_cum": 95662.11197161264, "throughput/token_count_per_second_update_recent": 117610.12978451849, "throughput/token_count_per_second_update_cum": 166851.45947201145, "throughput/batch_count_per_second_total_recent": 0.03729669808937548, "throughput/batch_count_per_second_total_cum": 0.045615249620252915, "throughput/batch_count_per_second_update_recent": 0.056080880062350506, "throughput/batch_count_per_second_update_cum": 0.07956097577667783, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1824522240, "throughput/token_count": 1824522240, "throughput/batch_count": 870, "throughput/flop_count": 0, "throughput/total_time": 19254.14253187005, "throughput/update_time": 10965.228872205596, 
"throughput/token_count_per_second_total_recent": 72916.28641481613, "throughput/token_count_per_second_total_cum": 94759.98409069605, "throughput/token_count_per_second_update_recent": 121159.43611932639, "throughput/token_count_per_second_update_cum": 166391.62403848732, "throughput/batch_count_per_second_total_recent": 0.034769194800766054, "throughput/batch_count_per_second_total_cum": 0.04518508152518084, "throughput/batch_count_per_second_update_recent": 0.057773321208632655, "throughput/batch_count_per_second_update_cum": 0.07934170915531508, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1845493760, "throughput/token_count": 1845493760, "throughput/batch_count": 880, "throughput/flop_count": 0, "throughput/total_time": 19462.212121990975, "throughput/update_time": 11172.7729124584, "throughput/token_count_per_second_total_recent": 78002.02596170326, "throughput/token_count_per_second_total_cum": 94824.46026342081, "throughput/token_count_per_second_update_recent": 117030.7644810427, "throughput/token_count_per_second_update_cum": 165177.7740816829, "throughput/batch_count_per_second_total_recent": 0.03719426439366496, "throughput/batch_count_per_second_total_cum": 0.04521582616015473, "throughput/batch_count_per_second_update_recent": 0.05580461715747962, "throughput/batch_count_per_second_update_cum": 0.07876290039142747, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, 
"throughput/mfu_update_cum": 0.0} +{"step": 1866465280, "throughput/token_count": 1866465280, "throughput/batch_count": 890, "throughput/flop_count": 0, "throughput/total_time": 19858.031892596046, "throughput/update_time": 11356.353174015298, "throughput/token_count_per_second_total_recent": 71332.52801662663, "throughput/token_count_per_second_total_cum": 93990.44628868287, "throughput/token_count_per_second_update_recent": 114651.87284699024, "throughput/token_count_per_second_update_cum": 164354.2826997224, "throughput/batch_count_per_second_total_recent": 0.03401399994689304, "throughput/batch_count_per_second_total_cum": 0.044818137306538997, "throughput/batch_count_per_second_update_recent": 0.05467027323102486, "throughput/batch_count_per_second_update_cum": 0.07837022910104866, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1887436800, "throughput/token_count": 1887436800, "throughput/batch_count": 900, "throughput/flop_count": 0, "throughput/total_time": 20004.831071814988, "throughput/update_time": 11502.97223422851, "throughput/token_count_per_second_total_recent": 78663.11417493952, "throughput/token_count_per_second_total_cum": 94349.04964827366, "throughput/token_count_per_second_update_recent": 119844.72524997064, "throughput/token_count_per_second_update_cum": 164082.53115518263, "throughput/batch_count_per_second_total_recent": 0.03750949581858612, "throughput/batch_count_per_second_total_cum": 0.044989132713448364, "throughput/batch_count_per_second_update_recent": 0.0571464182138303, "throughput/batch_count_per_second_update_cum": 0.078240647866813, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 
0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1908408320, "throughput/token_count": 1908408320, "throughput/batch_count": 910, "throughput/flop_count": 0, "throughput/total_time": 20424.197218824993, "throughput/update_time": 11674.226087162388, "throughput/token_count_per_second_total_recent": 72590.71272661419, "throughput/token_count_per_second_total_cum": 93438.59636456208, "throughput/token_count_per_second_update_recent": 121749.7153621837, "throughput/token_count_per_second_update_cum": 163471.93430651384, "throughput/batch_count_per_second_total_recent": 0.03461394916849813, "throughput/batch_count_per_second_total_cum": 0.04455499475696663, "throughput/batch_count_per_second_update_recent": 0.05805478828534303, "throughput/batch_count_per_second_update_cum": 0.077949492600686, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1929379840, "throughput/token_count": 1929379840, "throughput/batch_count": 920, "throughput/flop_count": 0, "throughput/total_time": 20616.578735099058, "throughput/update_time": 11866.318235529237, "throughput/token_count_per_second_total_recent": 78076.89524095095, "throughput/token_count_per_second_total_cum": 93583.89986963711, "throughput/token_count_per_second_update_recent": 118602.85753906275, "throughput/token_count_per_second_update_cum": 162592.96284699292, "throughput/batch_count_per_second_total_recent": 0.037229964848018146, "throughput/batch_count_per_second_total_cum": 
0.04462428086740356, "throughput/batch_count_per_second_update_recent": 0.05655424954369676, "throughput/batch_count_per_second_update_cum": 0.07753036634778639, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1950351360, "throughput/token_count": 1950351360, "throughput/batch_count": 930, "throughput/flop_count": 0, "throughput/total_time": 21010.326182788005, "throughput/update_time": 12031.375137096155, "throughput/token_count_per_second_total_recent": 71631.16431120914, "throughput/token_count_per_second_total_cum": 92828.22851164296, "throughput/token_count_per_second_update_recent": 117878.05106559339, "throughput/token_count_per_second_update_cum": 162105.43996641843, "throughput/batch_count_per_second_total_recent": 0.034156400828938074, "throughput/batch_count_per_second_total_cum": 0.04426394868452213, "throughput/batch_count_per_second_update_recent": 0.0562086348846404, "throughput/batch_count_per_second_update_cum": 0.07729789732285425, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1971322880, "throughput/token_count": 1971322880, "throughput/batch_count": 940, "throughput/flop_count": 0, "throughput/total_time": 21166.986645585042, "throughput/update_time": 12187.849889185163, "throughput/token_count_per_second_total_recent": 78793.19948096533, "throughput/token_count_per_second_total_cum": 93131.95652302136, 
"throughput/token_count_per_second_update_recent": 122161.03285477479, "throughput/token_count_per_second_update_cum": 161744.92612919732, "throughput/batch_count_per_second_total_recent": 0.03757152532623545, "throughput/batch_count_per_second_total_cum": 0.0444087774863345, "throughput/batch_count_per_second_update_recent": 0.0582509197496294, "throughput/batch_count_per_second_update_cum": 0.07712599092922083, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1992294400, "throughput/token_count": 1992294400, "throughput/batch_count": 950, "throughput/flop_count": 0, "throughput/total_time": 21589.57625599904, "throughput/update_time": 12373.759245545021, "throughput/token_count_per_second_total_recent": 72375.85836159, "throughput/token_count_per_second_total_cum": 92280.38458820638, "throughput/token_count_per_second_update_recent": 122366.1832409418, "throughput/token_count_per_second_update_cum": 161009.63017502497, "throughput/batch_count_per_second_total_recent": 0.03451149862365246, "throughput/batch_count_per_second_total_cum": 0.04400271634493178, "throughput/batch_count_per_second_update_recent": 0.05834874307677355, "throughput/batch_count_per_second_update_cum": 0.07677537449599503, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 2013265920, "throughput/token_count": 2013265920, "throughput/batch_count": 960, "throughput/flop_count": 0, 
"throughput/total_time": 21767.68956994405, "throughput/update_time": 12551.347988351132, "throughput/token_count_per_second_total_recent": 78202.22616020792, "throughput/token_count_per_second_total_cum": 92488.72800813167, "throughput/token_count_per_second_update_recent": 120195.3869001219, "throughput/token_count_per_second_update_cum": 160402.3664923087, "throughput/batch_count_per_second_total_recent": 0.037289727287391626, "throughput/batch_count_per_second_total_cum": 0.04410206222921928, "throughput/batch_count_per_second_update_recent": 0.057313626718579244, "throughput/batch_count_per_second_update_cum": 0.07648580860724864, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 2034237440, "throughput/token_count": 2034237440, "throughput/batch_count": 970, "throughput/flop_count": 0, "throughput/total_time": 22159.499484215048, "throughput/update_time": 12698.275234204368, "throughput/token_count_per_second_total_recent": 72048.6373016609, "throughput/token_count_per_second_total_cum": 91799.79184318018, "throughput/token_count_per_second_update_recent": 121436.47794114538, "throughput/token_count_per_second_update_cum": 160197.93259170593, "throughput/batch_count_per_second_total_recent": 0.03435546746333165, "throughput/batch_count_per_second_total_cum": 0.04377355186614045, "throughput/batch_count_per_second_update_recent": 0.05790542504365224, "throughput/batch_count_per_second_update_cum": 0.07638832692704484, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, 
"throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 2055208960, "throughput/token_count": 2055208960, "throughput/batch_count": 980, "throughput/flop_count": 0, "throughput/total_time": 22335.74923630804, "throughput/update_time": 12874.329680579598, "throughput/token_count_per_second_total_recent": 78695.2394983399, "throughput/token_count_per_second_total_cum": 92014.32816317352, "throughput/token_count_per_second_update_recent": 123703.74057817468, "throughput/token_count_per_second_update_cum": 159636.19163024845, "throughput/batch_count_per_second_total_recent": 0.037524814366502714, "throughput/batch_count_per_second_total_cum": 0.04387585075529743, "throughput/batch_count_per_second_update_recent": 0.05898654011639341, "throughput/batch_count_per_second_update_cum": 0.07612046796333716, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} diff --git a/metrics/jsonlines/train.jsonl b/metrics/jsonlines/train.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..464acf61776880e602c6366780f1ce358edd8db0 --- /dev/null +++ b/metrics/jsonlines/train.jsonl @@ -0,0 +1,98 @@ +{"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 97.20068372006062, "train/update_time": 96.98365108307917, "train/lr": 0.0009000000000000001, "train/loss": 8.76564884185791, "train/global_grad_norm": 1.4130078554153442} +{"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 190.58031271304935, "train/update_time": 190.21444558817893, "train/lr": 0.0009997960964140947, 
"train/loss": 7.474191188812256, "train/global_grad_norm": 0.8227528929710388} +{"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 454.2337091190275, "train/update_time": 283.44298765144777, "train/lr": 0.0009990914580222257, "train/loss": 7.089620590209961, "train/global_grad_norm": 0.43369758129119873} +{"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 547.6119900090853, "train/update_time": 376.67628215253353, "train/lr": 0.0009978842768382998, "train/loss": 6.745384216308594, "train/global_grad_norm": 0.30941468477249146} +{"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 811.2910252050497, "train/update_time": 469.82893142174, "train/lr": 0.0009961757683914405, "train/loss": 6.485077857971191, "train/global_grad_norm": 0.3194890022277832} +{"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 904.5979733880376, "train/update_time": 562.9825718456414, "train/lr": 0.00099396765300483, "train/loss": 6.212056636810303, "train/global_grad_norm": 0.42947375774383545} +{"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 1167.0270900100004, "train/update_time": 656.2661910566967, "train/lr": 0.0009912621540634887, "train/loss": 6.0160932540893555, "train/global_grad_norm": 0.40116411447525024} +{"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 1260.3716635780875, "train/update_time": 749.4701656188117, "train/lr": 0.000988061995775515, "train/loss": 5.953179359436035, "train/global_grad_norm": 1.0869561433792114} +{"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 1522.4740127840778, 
"train/update_time": 842.772364483797, "train/lr": 0.0009843704004290394, "train/loss": 5.676019191741943, "train/global_grad_norm": 0.280754029750824} +{"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 1615.8070381759899, "train/update_time": 935.9601778858341, "train/lr": 0.0009801910851476522, "train/loss": 5.544191837310791, "train/global_grad_norm": 0.46917492151260376} +{"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 1880.450380234979, "train/update_time": 1029.0689218619373, "train/lr": 0.0009755282581475768, "train/loss": 5.429840087890625, "train/global_grad_norm": 0.7274503111839294} +{"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 1973.9993880910333, "train/update_time": 1122.4686564019648, "train/lr": 0.0009703866145003512, "train/loss": 5.2810444831848145, "train/global_grad_norm": 0.2969799339771271} +{"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 2237.285542534082, "train/update_time": 1215.6973058142466, "train/lr": 0.0009647713314052896, "train/loss": 5.193550109863281, "train/global_grad_norm": 0.3795139491558075} +{"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 2330.883187837084, "train/update_time": 1309.140196379507, "train/lr": 0.0009586880629764817, "train/loss": 5.119898319244385, "train/global_grad_norm": 0.43129095435142517} +{"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 2593.152709970018, "train/update_time": 1402.5526540756691, "train/lr": 0.0009521429345495787, "train/loss": 4.974904537200928, "train/global_grad_norm": 0.3256843686103821} +{"step": 335544320, "train/token_count": 335544320, 
"train/batch_count": 160, "train/flop_count": 0, "train/total_time": 2686.8860215520253, "train/update_time": 1496.1389014086453, "train/lr": 0.0009451425365140996, "train/loss": 4.9173712730407715, "train/global_grad_norm": 0.39894500374794006} +{"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 2949.368387905997, "train/update_time": 1589.4967352375388, "train/lr": 0.000937693917677468, "train/loss": 4.806858539581299, "train/global_grad_norm": 0.3854577839374542} +{"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 3042.959401597036, "train/update_time": 1682.9467192575103, "train/lr": 0.0009298045781674596, "train/loss": 4.758651256561279, "train/global_grad_norm": 0.3088115155696869} +{"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 3305.444267996005, "train/update_time": 1776.3336466513574, "train/lr": 0.0009214824618802108, "train/loss": 4.72238826751709, "train/global_grad_norm": 0.37908124923706055} +{"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 3399.0298356669955, "train/update_time": 1869.7792980262311, "train/lr": 0.000912735948481387, "train/loss": 4.6122002601623535, "train/global_grad_norm": 0.4956414997577667} +{"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 3663.3731433160137, "train/update_time": 1963.0694206270855, "train/lr": 0.0009035738449685707, "train/loss": 4.518575191497803, "train/global_grad_norm": 0.33904722332954407} +{"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 3756.8895379800815, "train/update_time": 2056.446317301248, "train/lr": 0.0008940053768033609, "train/loss": 4.449559211730957, 
"train/global_grad_norm": 0.5069319605827332} +{"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 4019.5361906640464, "train/update_time": 2149.77923363226, "train/lr": 0.0008840401786221159, "train/loss": 4.342103004455566, "train/global_grad_norm": 0.29293933510780334} +{"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 4112.964794773026, "train/update_time": 2243.0702600192744, "train/lr": 0.0008736882845346905, "train/loss": 4.284090518951416, "train/global_grad_norm": 0.36110353469848633} +{"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 4376.393655605032, "train/update_time": 2336.412492537289, "train/lr": 0.0008629601180209381, "train/loss": 4.201655387878418, "train/global_grad_norm": 0.3496641218662262} +{"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 4470.06449480704, "train/update_time": 2429.936827432248, "train/lr": 0.0008518664814351503, "train/loss": 4.142204761505127, "train/global_grad_norm": 0.39031898975372314} +{"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 4734.132506006979, "train/update_time": 2523.346581262187, "train/lr": 0.0008404185451290017, "train/loss": 4.099599838256836, "train/global_grad_norm": 0.37471985816955566} +{"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 4827.830736390082, "train/update_time": 2616.8884120163275, "train/lr": 0.0008286278362039527, "train/loss": 4.009082317352295, "train/global_grad_norm": 0.34484514594078064} +{"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 5090.937317650998, "train/update_time": 
2710.3318849493517, "train/lr": 0.0008165062269044352, "train/loss": 3.97619891166687, "train/global_grad_norm": 0.40576431155204773} +{"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 5184.687426269054, "train/update_time": 2803.9371257049497, "train/lr": 0.0008040659226635089, "train/loss": 3.9234979152679443, "train/global_grad_norm": 0.44408538937568665} +{"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 5450.616348118056, "train/update_time": 2897.275741666206, "train/lr": 0.0007913194498130252, "train/loss": 3.9410295486450195, "train/global_grad_norm": 0.43244609236717224} +{"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 5543.975930526038, "train/update_time": 2990.483508925303, "train/lr": 0.000778279642970672, "train/loss": 3.8507468700408936, "train/global_grad_norm": 0.48972949385643005} +{"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 5806.160792221082, "train/update_time": 3083.611473838333, "train/lr": 0.0007649596321166025, "train/loss": 3.8703322410583496, "train/global_grad_norm": 0.3615987300872803} +{"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 5899.68232584605, "train/update_time": 3176.9836701630848, "train/lr": 0.0007513728293726579, "train/loss": 3.8107964992523193, "train/global_grad_norm": 0.36133530735969543} +{"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 6162.256411690032, "train/update_time": 3270.438694642042, "train/lr": 0.0007375329154974975, "train/loss": 3.7763936519622803, "train/global_grad_norm": 0.40389642119407654} +{"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, 
"train/flop_count": 0, "train/total_time": 6255.94410856301, "train/update_time": 3363.9935755479382, "train/lr": 0.0007234538261112341, "train/loss": 3.7278685569763184, "train/global_grad_norm": 0.42608577013015747} +{"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 6518.670226996997, "train/update_time": 3457.366311661084, "train/lr": 0.0007091497376634464, "train/loss": 3.7271111011505127, "train/global_grad_norm": 0.27613717317581177} +{"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 6612.111398222041, "train/update_time": 3550.66807099001, "train/lr": 0.0006946350531586958, "train/loss": 3.699812412261963, "train/global_grad_norm": 0.3713103234767914} +{"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 6874.636505312985, "train/update_time": 3643.886076678871, "train/lr": 0.0006799243876539214, "train/loss": 3.6990554332733154, "train/global_grad_norm": 0.2907034456729889} +{"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 6968.171832425054, "train/update_time": 3737.2908036899753, "train/lr": 0.0006650325535423166, "train/loss": 3.581249475479126, "train/global_grad_norm": 0.4145627021789551} +{"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 7233.323964387993, "train/update_time": 3830.6057550550904, "train/lr": 0.0006499745456385053, "train/loss": 3.623004913330078, "train/global_grad_norm": 0.3520248532295227} +{"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 7326.922928820015, "train/update_time": 3924.0719714582665, "train/lr": 0.0006347655260800339, "train/loss": 3.6238791942596436, "train/global_grad_norm": 0.379281610250473} 
+{"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 7591.2802984340815, "train/update_time": 4017.3633328623837, "train/lr": 0.0006194208090603844, "train/loss": 3.62347412109375, "train/global_grad_norm": 0.36047083139419556} +{"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 7684.845341357053, "train/update_time": 4110.783207958448, "train/lr": 0.0006039558454088796, "train/loss": 3.6568057537078857, "train/global_grad_norm": 0.49488669633865356} +{"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 7948.907636292046, "train/update_time": 4204.296549194376, "train/lr": 0.0005883862070330078, "train/loss": 3.6046764850616455, "train/global_grad_norm": 0.3740111291408539} +{"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 8042.599143097992, "train/update_time": 4297.83159168635, "train/lr": 0.0005727275712388317, "train/loss": 3.572643280029297, "train/global_grad_norm": 0.3081468641757965} +{"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 8305.275031784084, "train/update_time": 4391.248921588529, "train/lr": 0.0005569957049452703, "train/loss": 3.553377389907837, "train/global_grad_norm": 0.33262893557548523} +{"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 8398.81305668701, "train/update_time": 4484.626386589487, "train/lr": 0.0005412064488081482, "train/loss": 3.564133644104004, "train/global_grad_norm": 0.4224238991737366} +{"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 8662.347046800074, "train/update_time": 4577.979232876445, "train/lr": 0.0005253757012699972, 
"train/loss": 3.5546345710754395, "train/global_grad_norm": 0.2766055762767792} +{"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 8755.968361919047, "train/update_time": 4671.454424570431, "train/lr": 0.0005095194025516734, "train/loss": 3.536339044570923, "train/global_grad_norm": 0.3986065685749054} +{"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 9021.309424316045, "train/update_time": 4764.801301084226, "train/lr": 0.0004936535186019053, "train/loss": 3.5047459602355957, "train/global_grad_norm": 0.244685560464859} +{"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 9114.625705051003, "train/update_time": 4857.963408218231, "train/lr": 0.00047779402502093696, "train/loss": 3.520270586013794, "train/global_grad_norm": 0.4062435030937195} +{"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 9398.054264329025, "train/update_time": 4970.6049944133265, "train/lr": 0.0004619568909744525, "train/loss": 3.4980483055114746, "train/global_grad_norm": 0.3901892602443695} +{"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 9606.105255214032, "train/update_time": 5178.118606777163, "train/lr": 0.00044615806311398067, "train/loss": 3.511883497238159, "train/global_grad_norm": 0.34147942066192627} +{"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 10003.42233235808, "train/update_time": 5362.020449607167, "train/lr": 0.0004304134495199673, "train/loss": 3.468095064163208, "train/global_grad_norm": 0.3075481653213501} +{"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 
10150.524502051994, "train/update_time": 5508.941767138196, "train/lr": 0.0004147389036836882, "train/loss": 3.5062997341156006, "train/global_grad_norm": 0.2764611542224884} +{"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 10566.198982721078, "train/update_time": 5678.196010601241, "train/lr": 0.0003991502085441259, "train/loss": 3.4300529956817627, "train/global_grad_norm": 0.29730668663978577} +{"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 10760.646769667044, "train/update_time": 5872.135236821254, "train/lr": 0.0003836630605958888, "train/loss": 3.489516258239746, "train/global_grad_norm": 0.32831132411956787} +{"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 11156.295638620039, "train/update_time": 6039.766134388163, "train/lr": 0.00036829305408417155, "train/loss": 3.5055065155029297, "train/global_grad_norm": 0.3446820080280304} +{"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 11310.365665178979, "train/update_time": 6193.65441437799, "train/lr": 0.000353055665302672, "train/loss": 3.5088326930999756, "train/global_grad_norm": 0.2981908321380615} +{"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 11736.789658116992, "train/update_time": 6379.89381142694, "train/lr": 0.0003379662370102746, "train/loss": 3.466792345046997, "train/global_grad_norm": 0.33163660764694214} +{"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 11914.080722624087, "train/update_time": 6556.690899457899, "train/lr": 0.00032303996298219405, "train/loss": 3.4406988620758057, "train/global_grad_norm": 0.28660744428634644} +{"step": 1321205760, 
"train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 12308.019568020012, "train/update_time": 6703.663481310825, "train/lr": 0.00030829187271113034, "train/loss": 3.4547548294067383, "train/global_grad_norm": 0.29186901450157166} +{"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 12483.778021455975, "train/update_time": 6879.228759005899, "train/lr": 0.0002937368162738445, "train/loss": 3.4471235275268555, "train/global_grad_norm": 0.27723199129104614} +{"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 12905.641336273053, "train/update_time": 7080.2444989720825, "train/lr": 0.0002793894493783894, "train/loss": 3.409696578979492, "train/global_grad_norm": 0.2166973203420639} +{"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 13068.73947558098, "train/update_time": 7242.851195934112, "train/lr": 0.00026526421860705474, "train/loss": 3.4526355266571045, "train/global_grad_norm": 0.3342931270599365} +{"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 13461.124407541007, "train/update_time": 7389.911005128175, "train/lr": 0.0002513753468698824, "train/loss": 3.3823962211608887, "train/global_grad_norm": 0.24517573416233063} +{"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 13651.783316303976, "train/update_time": 7580.372131183161, "train/lr": 0.00023773681908340283, "train/loss": 3.412515163421631, "train/global_grad_norm": 0.3234269320964813} +{"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 14071.216295699007, "train/update_time": 7793.8012771270005, "train/lr": 0.00022436236808900823, 
"train/loss": 3.4163782596588135, "train/global_grad_norm": 0.22176125645637512} +{"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 14221.494554364006, "train/update_time": 7943.600337238051, "train/lr": 0.00021126546082514682, "train/loss": 3.4145960807800293, "train/global_grad_norm": 0.24960394203662872} +{"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 14616.199775510002, "train/update_time": 8090.416964606033, "train/lr": 0.00019845928476725522, "train/loss": 3.4254729747772217, "train/global_grad_norm": 0.21741226315498352} +{"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 14821.623726570979, "train/update_time": 8295.638736715424, "train/lr": 0.0001859567346490913, "train/loss": 3.3944356441497803, "train/global_grad_norm": 0.2173576056957245} +{"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 15232.941850902047, "train/update_time": 8512.26620015048, "train/lr": 0.00017377039947882782, "train/loss": 3.4161322116851807, "train/global_grad_norm": 0.19619792699813843} +{"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 15380.238736488041, "train/update_time": 8659.387788099353, "train/lr": 0.00016191254986299043, "train/loss": 3.377074718475342, "train/global_grad_norm": 0.18498826026916504} +{"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 15773.814038521028, "train/update_time": 8806.402406750247, "train/lr": 0.00015039512565099468, "train/loss": 3.3878912925720215, "train/global_grad_norm": 0.23870347440242767} +{"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, 
"train/total_time": 15988.252661015023, "train/update_time": 9020.618778260308, "train/lr": 0.00013922972391273224, "train/loss": 3.3374383449554443, "train/global_grad_norm": 0.19240827858448029} +{"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 16392.140212937025, "train/update_time": 9237.03149983345, "train/lr": 0.00012842758726130281, "train/loss": 3.408134937286377, "train/global_grad_norm": 0.20149269700050354} +{"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 16539.33981846203, "train/update_time": 9384.053210339276, "train/lr": 0.00011799959253265679, "train/loss": 3.354210615158081, "train/global_grad_norm": 0.18948209285736084} +{"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 16932.73472051299, "train/update_time": 9530.820382265258, "train/lr": 0.00010795623983354214, "train/loss": 3.369267225265503, "train/global_grad_norm": 0.1685154139995575} +{"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 17149.437690322986, "train/update_time": 9747.28814456216, "train/lr": 9.830764196878872e-05, "train/loss": 3.3556768894195557, "train/global_grad_norm": 0.1737906038761139} +{"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 17549.297566386987, "train/update_time": 9954.213539150194, "train/lr": 8.906351425856951e-05, "train/loss": 3.3356785774230957, "train/global_grad_norm": 0.1702912300825119} +{"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 17696.358517175075, "train/update_time": 10101.101109507843, "train/lr": 8.02331647558977e-05, "train/loss": 3.3448057174682617, "train/global_grad_norm": 0.16285409033298492} +{"step": 
1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 18089.859054057975, "train/update_time": 10248.098395384499, "train/lr": 7.182548487420554e-05, "train/loss": 3.379516363143921, "train/global_grad_norm": 0.1631559580564499} +{"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 18306.998820198, "train/update_time": 10465.017247352516, "train/lr": 6.384894043444556e-05, "train/loss": 3.309666872024536, "train/global_grad_norm": 0.15072140097618103} +{"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 18706.20748156903, "train/update_time": 10662.364285671501, "train/lr": 5.6311563140726166e-05, "train/loss": 3.3882648944854736, "train/global_grad_norm": 0.15730835497379303} +{"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 18853.344159234082, "train/update_time": 10809.319413250545, "train/lr": 4.922094249306547e-05, "train/loss": 3.371917724609375, "train/global_grad_norm": 0.1638113260269165} +{"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 19254.14253187005, "train/update_time": 10965.228872205596, "train/lr": 4.2584218145409916e-05, "train/loss": 3.314046621322632, "train/global_grad_norm": 0.14649039506912231} +{"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 19462.212121990975, "train/update_time": 11172.7729124584, "train/lr": 3.6408072716606236e-05, "train/loss": 3.328913688659668, "train/global_grad_norm": 0.14682640135288239} +{"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 19858.031892596046, "train/update_time": 11356.353174015298, "train/lr": 
3.069872506157217e-05, "train/loss": 3.39823842048645, "train/global_grad_norm": 0.14888012409210205} +{"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 20004.831071814988, "train/update_time": 11502.97223422851, "train/lr": 2.5461924009435368e-05, "train/loss": 3.309504508972168, "train/global_grad_norm": 0.13238337635993958} +{"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 20424.197218824993, "train/update_time": 11674.226087162388, "train/lr": 2.0702942574950812e-05, "train/loss": 3.359825849533081, "train/global_grad_norm": 0.14266924560070038} +{"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 20616.578735099058, "train/update_time": 11866.318235529237, "train/lr": 1.642657264902142e-05, "train/loss": 3.3724734783172607, "train/global_grad_norm": 0.1331755369901657} +{"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 21010.326182788005, "train/update_time": 12031.375137096155, "train/lr": 1.2637120173670358e-05, "train/loss": 3.365408182144165, "train/global_grad_norm": 0.12957783043384552} +{"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 21166.986645585042, "train/update_time": 12187.849889185163, "train/lr": 9.338400806321978e-06, "train/loss": 3.319882869720459, "train/global_grad_norm": 0.12174921482801437} +{"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 21589.57625599904, "train/update_time": 12373.759245545021, "train/lr": 6.533736077758867e-06, "train/loss": 3.326748847961426, "train/global_grad_norm": 0.120763398706913} +{"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, 
"train/flop_count": 0, "train/total_time": 21767.68956994405, "train/update_time": 12551.347988351132, "train/lr": 4.2259500476214406e-06, "train/loss": 3.3314108848571777, "train/global_grad_norm": 0.12533509731292725} +{"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 22159.499484215048, "train/update_time": 12698.275234204368, "train/lr": 2.417366460819359e-06, "train/loss": 3.3585853576660156, "train/global_grad_norm": 0.11902791261672974} +{"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 22335.74923630804, "train/update_time": 12874.329680579598, "train/lr": 1.1098064077174619e-06, "train/loss": 3.3340089321136475, "train/global_grad_norm": 0.1221228837966919} diff --git a/metrics/jsonlines/train_data_info.jsonl b/metrics/jsonlines/train_data_info.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8f2d82aa96124ce343d2b13b9c030bd8eadda7c6 --- /dev/null +++ b/metrics/jsonlines/train_data_info.jsonl @@ -0,0 +1 @@ +{"step": 0, "train_data_info/vocab_size": 50277, "train_data_info/global_tokens_per_batch": 2097152, "train_data_info/local_tokens_per_batch": 2097152, "train_data_info/batch_len": 2048, "train_data_info/seq_len": 2048, "train_data_info/total_tokens": 2055208960, "train_data_info/global_batch_size": 1024, "train_data_info/local_batch_size": 1024} diff --git a/metrics/jsonlines/train_eval.jsonl b/metrics/jsonlines/train_eval.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..24b4e3f50d6ddf8c03ffab77051f30637ff0606f --- /dev/null +++ b/metrics/jsonlines/train_eval.jsonl @@ -0,0 +1,19 @@ +{"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 811.2910252050497, "train_eval/train_update_time": 469.82893142174, "train_eval/window_seq_count": 51200, 
"train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 7.719095360425272, "train_eval/perplexity_len_2048": 2250.922385774329, "train_eval/loss_avg_len_1024": 7.719504372100928, "train_eval/perplexity_len_1024": 2251.843227615802, "train_eval/loss_avg_len_512": 7.719377458653034, "train_eval/perplexity_len_512": 2251.5574565621464} +{"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1615.8070381759899, "train_eval/train_update_time": 935.9601778858341, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.957939318534446, "train_eval/perplexity_len_2048": 386.81220564928583, "train_eval/loss_avg_len_1024": 5.961814671365428, "train_eval/perplexity_len_1024": 388.31414782371087, "train_eval/loss_avg_len_512": 5.967393189926952, "train_eval/perplexity_len_512": 390.4864188989083} +{"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2593.152709970018, "train_eval/train_update_time": 1402.5526540756691, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.251482137483999, "train_eval/perplexity_len_2048": 190.8489232832105, "train_eval/loss_avg_len_1024": 5.2583443090137365, "train_eval/perplexity_len_1024": 192.16306510751045, "train_eval/loss_avg_len_512": 5.272100609642803, "train_eval/perplexity_len_512": 194.824783727455} +{"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3399.0298356669955, "train_eval/train_update_time": 1869.7792980262311, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.78972842358995, 
"train_eval/perplexity_len_2048": 120.26870208532655, "train_eval/loss_avg_len_1024": 4.803808954130364, "train_eval/perplexity_len_1024": 121.974127690181, "train_eval/loss_avg_len_512": 4.828457922573944, "train_eval/perplexity_len_512": 125.0180244583548} +{"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4376.393655605032, "train_eval/train_update_time": 2336.412492537289, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.40212876008818, "train_eval/perplexity_len_2048": 81.6244427065281, "train_eval/loss_avg_len_1024": 4.431100291744224, "train_eval/perplexity_len_1024": 84.02381677946093, "train_eval/loss_avg_len_512": 4.4801360191248385, "train_eval/perplexity_len_512": 88.24667509487} +{"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5184.687426269054, "train_eval/train_update_time": 2803.9371257049497, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.066799368759794, "train_eval/perplexity_len_2048": 58.36984295830003, "train_eval/loss_avg_len_1024": 4.114848925766128, "train_eval/perplexity_len_1024": 61.2429613303667, "train_eval/loss_avg_len_512": 4.189334867373182, "train_eval/perplexity_len_512": 65.97889164907912} +{"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6162.256411690032, "train_eval/train_update_time": 3270.438694642042, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.8533093321500744, "train_eval/perplexity_len_2048": 47.148836497135065, "train_eval/loss_avg_len_1024": 3.9121336809778766, 
"train_eval/perplexity_len_1024": 50.00553408372476, "train_eval/loss_avg_len_512": 3.9987834947431473, "train_eval/perplexity_len_512": 54.531771479730125} +{"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6968.171832425054, "train_eval/train_update_time": 3737.2908036899753, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.722734505148255, "train_eval/perplexity_len_2048": 41.37738622509416, "train_eval/loss_avg_len_1024": 3.785098307309199, "train_eval/perplexity_len_1024": 44.039999799825864, "train_eval/loss_avg_len_512": 3.878930976334377, "train_eval/perplexity_len_512": 48.3724760994647} +{"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7948.907636292046, "train_eval/train_update_time": 4204.296549194376, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.6312928089601515, "train_eval/perplexity_len_2048": 37.76160361375511, "train_eval/loss_avg_len_1024": 3.6986304339887464, "train_eval/perplexity_len_1024": 40.39194702318541, "train_eval/loss_avg_len_512": 3.7954792477576484, "train_eval/perplexity_len_512": 44.499557609251696} +{"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8755.968361919047, "train_eval/train_update_time": 4671.454424570431, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.566865509475556, "train_eval/perplexity_len_2048": 35.40544102059099, "train_eval/loss_avg_len_1024": 3.6334003103244683, "train_eval/perplexity_len_1024": 37.841270164084655, "train_eval/loss_avg_len_512": 3.731213201559185, 
"train_eval/perplexity_len_512": 41.72970400831208} +{"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 10003.42233235808, "train_eval/train_update_time": 5362.020449607167, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.5099057611342683, "train_eval/perplexity_len_2048": 33.445115805650026, "train_eval/loss_avg_len_1024": 3.5771838447961635, "train_eval/perplexity_len_1024": 35.77265750657162, "train_eval/loss_avg_len_512": 3.674278985551646, "train_eval/perplexity_len_512": 39.420224057099915} +{"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 11310.365665178979, "train_eval/train_update_time": 6193.65441437799, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.4691960758760523, "train_eval/perplexity_len_2048": 32.110917324346254, "train_eval/loss_avg_len_1024": 3.5349628742179267, "train_eval/perplexity_len_1024": 34.29374150862089, "train_eval/loss_avg_len_512": 3.634656662457237, "train_eval/perplexity_len_512": 37.88884200179451} +{"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 12905.641336273053, "train_eval/train_update_time": 7080.2444989720825, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.437814879390981, "train_eval/perplexity_len_2048": 31.118885304078464, "train_eval/loss_avg_len_1024": 3.508632168282675, "train_eval/perplexity_len_1024": 33.402547458364594, "train_eval/loss_avg_len_512": 3.6075781844702215, "train_eval/perplexity_len_512": 36.876636173050585} +{"step": 1468006400, 
"train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 14221.494554364006, "train_eval/train_update_time": 7943.600337238051, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.4099421536454972, "train_eval/perplexity_len_2048": 30.263493575987354, "train_eval/loss_avg_len_1024": 3.4811164568568342, "train_eval/perplexity_len_1024": 32.49598219280798, "train_eval/loss_avg_len_512": 3.5819485398161484, "train_eval/perplexity_len_512": 35.94351001646744} +{"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 15773.814038521028, "train_eval/train_update_time": 8806.402406750247, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.389845041899543, "train_eval/perplexity_len_2048": 29.66135564690801, "train_eval/loss_avg_len_1024": 3.4604284336998896, "train_eval/perplexity_len_1024": 31.830610900141725, "train_eval/loss_avg_len_512": 3.562013453548534, "train_eval/perplexity_len_512": 35.234067932753184} +{"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 17149.437690322986, "train_eval/train_update_time": 9747.28814456216, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.3741092623544136, "train_eval/perplexity_len_2048": 29.19826420153563, "train_eval/loss_avg_len_1024": 3.446687585645268, "train_eval/perplexity_len_1024": 31.396222578985455, "train_eval/loss_avg_len_512": 3.547421754104362, "train_eval/perplexity_len_512": 34.723675803484795} +{"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, 
"train_eval/train_flop_count": 0, "train_eval/train_total_time": 18706.20748156903, "train_eval/train_update_time": 10662.364285671501, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.356781773958419, "train_eval/perplexity_len_2048": 28.696689680159, "train_eval/loss_avg_len_1024": 3.4224050301090756, "train_eval/perplexity_len_1024": 30.643023864854985, "train_eval/loss_avg_len_512": 3.523673592355699, "train_eval/perplexity_len_512": 33.90876692657728} +{"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 20004.831071814988, "train_eval/train_update_time": 11502.97223422851, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.3524864197691384, "train_eval/perplexity_len_2048": 28.57369158322852, "train_eval/loss_avg_len_1024": 3.4213956192670048, "train_eval/perplexity_len_1024": 30.612108070326094, "train_eval/loss_avg_len_512": 3.522944682840425, "train_eval/perplexity_len_512": 33.8840595095429} +{"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 21589.57625599904, "train_eval/train_update_time": 12373.759245545021, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 3.342387769561974, "train_eval/perplexity_len_2048": 28.28658798616546, "train_eval/loss_avg_len_1024": 3.414338314473171, "train_eval/perplexity_len_1024": 30.396829629582328, "train_eval/loss_avg_len_512": 3.5140870866559273, "train_eval/perplexity_len_512": 33.58525349842418} diff --git a/metrics/jsonlines/val.jsonl b/metrics/jsonlines/val.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5b91315d01f2b2af588ff97ac9e0a4d3027d5d96 --- /dev/null +++ 
b/metrics/jsonlines/val.jsonl @@ -0,0 +1,49 @@ +{"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 190.58031271304935, "val/train_update_time": 190.21444558817893, "val/loss": 7.450875304234563, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 170.25829177699052, "val/val_tokens_per_second": 240575.65462744483, "val/loss_avg_len_2048": 7.450875304234563, "val/perplexity_len_2048": 1721.3692079052364, "val/loss_avg_len_1024": 7.449340622551227, "val/perplexity_len_1024": 1718.7294802005856, "val/loss_avg_len_512": 7.449990987825952, "val/perplexity_len_512": 1719.8476457397608} +{"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 547.6119900090853, "val/train_update_time": 376.67628215253353, "val/loss": 6.7323331680220555, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 170.36321504006628, "val/val_tokens_per_second": 240427.48894100738, "val/loss_avg_len_2048": 6.7323331680220555, "val/perplexity_len_2048": 839.1027515359466, "val/loss_avg_len_1024": 6.731641844541114, "val/perplexity_len_1024": 838.5228605701034, "val/loss_avg_len_512": 6.734119094878901, "val/perplexity_len_512": 840.6026666462375} +{"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 904.5979733880376, "val/train_update_time": 562.9825718456414, "val/loss": 6.202562174819015, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 168.99627296999097, "val/val_tokens_per_second": 242372.20904435776, "val/loss_avg_len_2048": 6.202562174819015, "val/perplexity_len_2048": 494.01316904528653, "val/loss_avg_len_1024": 6.203721904096799, "val/perplexity_len_1024": 494.5864229264433, "val/loss_avg_len_512": 6.209107841835264, "val/perplexity_len_512": 497.25742106468925} +{"step": 
167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 1260.3716635780875, "val/train_update_time": 749.4701656188117, "val/loss": 5.854788969698548, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 168.6496025499655, "val/val_tokens_per_second": 242870.42116132387, "val/loss_avg_len_2048": 5.854788969698548, "val/perplexity_len_2048": 348.9012635456685, "val/loss_avg_len_1024": 5.85749615726755, "val/perplexity_len_1024": 349.84708438871354, "val/loss_avg_len_512": 5.865908991570771, "val/perplexity_len_512": 352.8027050825446} +{"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 1615.8070381759899, "val/train_update_time": 935.9601778858341, "val/loss": 5.549726753812819, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 169.84832797700074, "val/val_tokens_per_second": 241156.33334669282, "val/loss_avg_len_2048": 5.549726753812819, "val/perplexity_len_2048": 257.16727632667454, "val/loss_avg_len_1024": 5.554625083155884, "val/perplexity_len_1024": 258.4300565742626, "val/loss_avg_len_512": 5.565984733520728, "val/perplexity_len_512": 261.38246909910544} +{"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 1973.9993880910333, "val/train_update_time": 1122.4686564019648, "val/loss": 5.291026324360166, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 169.91353944293223, "val/val_tokens_per_second": 241063.779462713, "val/loss_avg_len_2048": 5.291026324360166, "val/perplexity_len_2048": 198.5470945960036, "val/loss_avg_len_1024": 5.297933264936833, "val/perplexity_len_1024": 199.92319442927527, "val/loss_avg_len_512": 5.312294404878934, "val/perplexity_len_512": 202.81503476251478} +{"step": 293601280, "val/train_token_count": 293601280, 
"val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 2330.883187837084, "val/train_update_time": 1309.140196379507, "val/loss": 5.082773989376356, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 168.71492548892274, "val/val_tokens_per_second": 242776.3867440957, "val/loss_avg_len_2048": 5.082773989376356, "val/perplexity_len_2048": 161.22066060367317, "val/loss_avg_len_1024": 5.091564861865482, "val/perplexity_len_1024": 162.64417868926273, "val/loss_avg_len_512": 5.109285918890592, "val/perplexity_len_512": 165.55209503151025} +{"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 2686.8860215520253, "val/train_update_time": 1496.1389014086453, "val/loss": 4.900254244546081, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 168.97393367695622, "val/val_tokens_per_second": 242404.25199727662, "val/loss_avg_len_2048": 4.900254244546081, "val/perplexity_len_2048": 134.3239264696477, "val/loss_avg_len_1024": 4.91143611547025, "val/perplexity_len_1024": 135.83434821009394, "val/loss_avg_len_512": 4.93267407396678, "val/perplexity_len_512": 138.7500445859649} +{"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 3042.959401597036, "val/train_update_time": 1682.9467192575103, "val/loss": 4.739786327227112, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 168.96092426101677, "val/val_tokens_per_second": 242422.91629941342, "val/loss_avg_len_2048": 4.739786327227112, "val/perplexity_len_2048": 114.40975281909341, "val/loss_avg_len_1024": 4.754103896766296, "val/perplexity_len_1024": 116.05960516499653, "val/loss_avg_len_512": 4.780596644866559, "val/perplexity_len_512": 119.17543424775661} +{"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 
0, "val/train_total_time": 3399.0298356669955, "val/train_update_time": 1869.7792980262311, "val/loss": 4.605688608644693, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 169.33557621506043, "val/val_tokens_per_second": 241886.5599038667, "val/loss_avg_len_2048": 4.605688608644693, "val/perplexity_len_2048": 100.05185570608518, "val/loss_avg_len_1024": 4.625144383868621, "val/perplexity_len_1024": 102.0175017009493, "val/loss_avg_len_512": 4.659852839846723, "val/perplexity_len_512": 105.62053787839496} +{"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 3756.8895379800815, "val/train_update_time": 2056.446317301248, "val/loss": 4.442701461410919, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 169.17154206091072, "val/val_tokens_per_second": 242121.1008719908, "val/loss_avg_len_2048": 4.442701461410919, "val/perplexity_len_2048": 85.0042675253757, "val/loss_avg_len_1024": 4.471159290939104, "val/perplexity_len_1024": 87.4580535953644, "val/loss_avg_len_512": 4.518184423743747, "val/perplexity_len_512": 91.66901471505673} +{"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 4112.964794773026, "val/train_update_time": 2243.0702600192744, "val/loss": 4.2837813609120206, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 169.93256872706115, "val/val_tokens_per_second": 241036.78480720375, "val/loss_avg_len_2048": 4.2837813609120206, "val/perplexity_len_2048": 72.51412430688201, "val/loss_avg_len_1024": 4.324054979363084, "val/perplexity_len_1024": 75.4941356224282, "val/loss_avg_len_512": 4.385897227285058, "val/perplexity_len_512": 80.31024744229244} +{"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 4470.06449480704, 
"val/train_update_time": 2429.936827432248, "val/loss": 4.142301944319485, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 170.5037173400633, "val/val_tokens_per_second": 240229.36648534652, "val/loss_avg_len_2048": 4.142301944319485, "val/perplexity_len_2048": 62.94755656946625, "val/loss_avg_len_1024": 4.190639730950771, "val/perplexity_len_1024": 66.0650412962264, "val/loss_avg_len_512": 4.262273591935914, "val/perplexity_len_512": 70.97115961662232} +{"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 4827.830736390082, "val/train_update_time": 2616.8884120163275, "val/loss": 4.03030806598065, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 169.51862163702026, "val/val_tokens_per_second": 241625.3719175768, "val/loss_avg_len_2048": 4.03030806598065, "val/perplexity_len_2048": 56.27824598990862, "val/loss_avg_len_1024": 4.084947683577659, "val/perplexity_len_1024": 59.43882804306883, "val/loss_avg_len_512": 4.163849879008346, "val/perplexity_len_512": 64.31866563997396} +{"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 5184.687426269054, "val/train_update_time": 2803.9371257049497, "val/loss": 3.93689947860057, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 170.90034897695296, "val/val_tokens_per_second": 239671.83358720774, "val/loss_avg_len_2048": 3.93689947860057, "val/perplexity_len_2048": 51.259423721129075, "val/loss_avg_len_1024": 3.9956295500188603, "val/perplexity_len_1024": 54.36005222562956, "val/loss_avg_len_512": 4.079135575135518, "val/perplexity_len_512": 59.09436512640163} +{"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 5543.975930526038, "val/train_update_time": 2990.483508925303, "val/loss": 
3.873067468292034, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 168.9153455970809, "val/val_tokens_per_second": 242488.3296139546, "val/loss_avg_len_2048": 3.873067468292034, "val/perplexity_len_2048": 48.08967361439601, "val/loss_avg_len_1024": 3.935505072252453, "val/perplexity_len_1024": 51.18799706575565, "val/loss_avg_len_512": 4.0234172149434695, "val/perplexity_len_512": 55.891774069637314} +{"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 5899.68232584605, "val/train_update_time": 3176.9836701630848, "val/loss": 3.8065103932873816, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 168.98372465907596, "val/val_tokens_per_second": 242390.2070015124, "val/loss_avg_len_2048": 3.8065103932873816, "val/perplexity_len_2048": 44.99315617874155, "val/loss_avg_len_1024": 3.8711530834252486, "val/perplexity_len_1024": 47.99769953596133, "val/loss_avg_len_512": 3.961617600578815, "val/perplexity_len_512": 52.542249617687304} +{"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 6255.94410856301, "val/train_update_time": 3363.9935755479382, "val/loss": 3.7648755942319054, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 169.21118752204347, "val/val_tokens_per_second": 242064.37292843926, "val/loss_avg_len_2048": 3.7648755942319054, "val/perplexity_len_2048": 43.158336379882776, "val/loss_avg_len_1024": 3.8314554831946737, "val/perplexity_len_1024": 46.129630298360155, "val/loss_avg_len_512": 3.923248363528401, "val/perplexity_len_512": 50.564429942215185} +{"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 6612.111398222041, "val/train_update_time": 3550.66807099001, "val/loss": 3.7117092974703993, "val/val_token_count": 40960000, 
"val/val_seq_count": 20000, "val/val_time": 169.1644801320508, "val/val_tokens_per_second": 242131.20844296852, "val/loss_avg_len_2048": 3.7117092974703993, "val/perplexity_len_2048": 40.923697550401094, "val/loss_avg_len_1024": 3.7795683319184, "val/perplexity_len_1024": 43.797131830589514, "val/loss_avg_len_512": 3.873812707895227, "val/perplexity_len_512": 48.125525301067185} +{"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 6968.171832425054, "val/train_update_time": 3737.2908036899753, "val/loss": 3.6820711012163434, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 170.1077867250424, "val/val_tokens_per_second": 240788.50703176, "val/loss_avg_len_2048": 3.6820711012163434, "val/perplexity_len_2048": 39.72859085710849, "val/loss_avg_len_1024": 3.75097957880958, "val/perplexity_len_1024": 42.56275515868964, "val/loss_avg_len_512": 3.8466630883938633, "val/perplexity_len_512": 46.836512876088385} +{"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 7326.922928820015, "val/train_update_time": 3924.0719714582665, "val/loss": 3.641011830084678, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 170.91282955405768, "val/val_tokens_per_second": 239654.33201750863, "val/loss_avg_len_2048": 3.641011830084678, "val/perplexity_len_2048": 38.13039869757022, "val/loss_avg_len_1024": 3.710436346722953, "val/perplexity_len_1024": 40.871636841405206, "val/loss_avg_len_512": 3.806864157543611, "val/perplexity_len_512": 45.009075964932165} +{"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 7684.845341357053, "val/train_update_time": 4110.783207958448, "val/loss": 3.61963528591136, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 
170.39240577502642, "val/val_tokens_per_second": 240386.30016223004, "val/loss_avg_len_2048": 3.61963528591136, "val/perplexity_len_2048": 37.32395276798534, "val/loss_avg_len_1024": 3.6890140571806116, "val/perplexity_len_1024": 40.00538448504298, "val/loss_avg_len_512": 3.7862415502706543, "val/perplexity_len_512": 44.09037701082852} +{"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 8042.599143097992, "val/train_update_time": 4297.83159168635, "val/loss": 3.5845805286582095, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 169.11018539394718, "val/val_tokens_per_second": 242208.94740658268, "val/loss_avg_len_2048": 3.5845805286582095, "val/perplexity_len_2048": 36.03823753998343, "val/loss_avg_len_1024": 3.6556578606538475, "val/perplexity_len_1024": 38.69296732499727, "val/loss_avg_len_512": 3.753422931008786, "val/perplexity_len_512": 42.66687811283449} +{"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 8398.81305668701, "val/train_update_time": 4484.626386589487, "val/loss": 3.5655147499182727, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 170.01293277891818, "val/val_tokens_per_second": 240922.84822392694, "val/loss_avg_len_2048": 3.5655147499182727, "val/perplexity_len_2048": 35.35764906773771, "val/loss_avg_len_1024": 3.6369095906229223, "val/perplexity_len_1024": 37.97429906927659, "val/loss_avg_len_512": 3.736021636988502, "val/perplexity_len_512": 41.93084178698823} +{"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 8755.968361919047, "val/train_update_time": 4671.454424570431, "val/loss": 3.540912869877019, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 170.22161650797352, "val/val_tokens_per_second": 
240627.48809626861, "val/loss_avg_len_2048": 3.540912869877019, "val/perplexity_len_2048": 34.498397368694555, "val/loss_avg_len_1024": 3.612297219974548, "val/perplexity_len_1024": 37.05106958345076, "val/loss_avg_len_512": 3.7114113950682803, "val/perplexity_len_512": 40.91150809832091} +{"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 9114.625705051003, "val/train_update_time": 4857.963408218231, "val/loss": 3.523965722436574, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 170.6301947310567, "val/val_tokens_per_second": 240051.29962232176, "val/loss_avg_len_2048": 3.523965722436574, "val/perplexity_len_2048": 33.91867414442962, "val/loss_avg_len_1024": 3.5953900306275113, "val/perplexity_len_1024": 36.42990599640571, "val/loss_avg_len_512": 3.694821816809941, "val/perplexity_len_512": 40.238402142307585} +{"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 9606.105255214032, "val/train_update_time": 5178.118606777163, "val/loss": 3.5034193539332135, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 213.20502558199223, "val/val_tokens_per_second": 192115.546470775, "val/loss_avg_len_2048": 3.5034193539332135, "val/perplexity_len_2048": 33.228879223062926, "val/loss_avg_len_1024": 3.575615437129885, "val/perplexity_len_1024": 35.71659537192833, "val/loss_avg_len_512": 3.6756443774067797, "val/perplexity_len_512": 39.47408487214642} +{"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 10150.524502051994, "val/train_update_time": 5508.941767138196, "val/loss": 3.485983414201392, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 246.22538287297357, "val/val_tokens_per_second": 166351.6552277271, "val/loss_avg_len_2048": 
3.485983414201392, "val/perplexity_len_2048": 32.654524247766986, "val/loss_avg_len_1024": 3.558595063583925, "val/perplexity_len_1024": 35.113829776059404, "val/loss_avg_len_512": 3.6589644041204825, "val/perplexity_len_512": 38.82111905618587} +{"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 10760.646769667044, "val/train_update_time": 5872.135236821254, "val/loss": 3.4716991148468344, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 227.81734533095732, "val/val_tokens_per_second": 179793.1581570145, "val/loss_avg_len_2048": 3.4716991148468344, "val/perplexity_len_2048": 32.191392876501965, "val/loss_avg_len_1024": 3.5444762171712245, "val/perplexity_len_1024": 34.6215464207269, "val/loss_avg_len_512": 3.645251961359289, "val/perplexity_len_512": 38.29241984632693} +{"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 11310.365665178979, "val/train_update_time": 6193.65441437799, "val/loss": 3.458023446854088, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 238.1312235830119, "val/val_tokens_per_second": 172006.00317631784, "val/loss_avg_len_2048": 3.458023446854088, "val/perplexity_len_2048": 31.75415067939918, "val/loss_avg_len_1024": 3.531052290999843, "val/perplexity_len_1024": 34.15989485839664, "val/loss_avg_len_512": 3.63209585030973, "val/perplexity_len_512": 37.791939921908} +{"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 11914.080722624087, "val/train_update_time": 6556.690899457899, "val/loss": 3.4443763262484692, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 246.78844607900828, "val/val_tokens_per_second": 165972.1135684238, "val/loss_avg_len_2048": 3.4443763262484692, "val/perplexity_len_2048": 
31.323741558011843, "val/loss_avg_len_1024": 3.5173761587224903, "val/perplexity_len_1024": 33.69589967943844, "val/loss_avg_len_512": 3.618587289446686, "val/perplexity_len_512": 37.28485788666368} +{"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 12483.778021455975, "val/train_update_time": 6879.228759005899, "val/loss": 3.4335085181755947, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 220.31813231599517, "val/val_tokens_per_second": 185912.97760845392, "val/loss_avg_len_2048": 3.4335085181755947, "val/perplexity_len_2048": 30.98516427551629, "val/loss_avg_len_1024": 3.5070164691339714, "val/perplexity_len_1024": 33.34862256580393, "val/loss_avg_len_512": 3.6083294949505476, "val/perplexity_len_512": 36.90435238672373} +{"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 13068.73947558098, "val/train_update_time": 7242.851195934112, "val/loss": 3.4239009072753603, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 245.130507607013, "val/val_tokens_per_second": 167094.6647965419, "val/loss_avg_len_2048": 3.4239009072753603, "val/perplexity_len_2048": 30.68889636581984, "val/loss_avg_len_1024": 3.4978476569967345, "val/perplexity_len_1024": 33.044252797363725, "val/loss_avg_len_512": 3.5992340820400974, "val/perplexity_len_512": 36.570213930693726} +{"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 13651.783316303976, "val/train_update_time": 7580.372131183161, "val/loss": 3.4135220782420834, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 205.47673456801567, "val/val_tokens_per_second": 199341.30297579587, "val/loss_avg_len_2048": 3.4135220782420834, "val/perplexity_len_2048": 30.372028758989988, 
"val/loss_avg_len_1024": 3.4871821906892118, "val/perplexity_len_1024": 32.69369319637278, "val/loss_avg_len_512": 3.5891118032899687, "val/perplexity_len_512": 36.201907227628595} +{"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 14221.494554364006, "val/train_update_time": 7943.600337238051, "val/loss": 3.40461270494815, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 246.15931547700893, "val/val_tokens_per_second": 166396.30281969009, "val/loss_avg_len_2048": 3.40461270494815, "val/perplexity_len_2048": 30.102634864436293, "val/loss_avg_len_1024": 3.4784065833980686, "val/perplexity_len_1024": 32.40804140117123, "val/loss_avg_len_512": 3.5805338707463816, "val/perplexity_len_512": 35.89269779430199} +{"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 14821.623726570979, "val/train_update_time": 8295.638736715424, "val/loss": 3.3961672243999317, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 194.16551748092752, "val/val_tokens_per_second": 210954.0382422611, "val/loss_avg_len_2048": 3.3961672243999317, "val/perplexity_len_2048": 29.849474183786675, "val/loss_avg_len_1024": 3.469946994170826, "val/perplexity_len_1024": 32.1350390552181, "val/loss_avg_len_512": 3.5720793008117004, "val/perplexity_len_512": 35.590519663690706} +{"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 15380.238736488041, "val/train_update_time": 8659.387788099353, "val/loss": 3.3896002479799794, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 246.37208192201797, "val/val_tokens_per_second": 166252.60330009597, "val/loss_avg_len_2048": 3.3896002479799794, "val/perplexity_len_2048": 29.654095616041587, "val/loss_avg_len_1024": 
3.463640005337913, "val/perplexity_len_1024": 31.933001516723323, "val/loss_avg_len_512": 3.5656460781862034, "val/perplexity_len_512": 35.36229283146993} +{"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 15988.252661015023, "val/train_update_time": 9020.618778260308, "val/loss": 3.3830379756949847, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 186.93504923197906, "val/val_tokens_per_second": 219113.5379281937, "val/loss_avg_len_2048": 3.3830379756949847, "val/perplexity_len_2048": 29.46013447520526, "val/loss_avg_len_1024": 3.4570769355883826, "val/perplexity_len_1024": 31.724109237572595, "val/loss_avg_len_512": 3.5594564177378083, "val/perplexity_len_512": 35.144088248955605} +{"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 16539.33981846203, "val/train_update_time": 9384.053210339276, "val/loss": 3.377662815920031, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 246.44063618697692, "val/val_tokens_per_second": 166206.35554975295, "val/loss_avg_len_2048": 3.377662815920031, "val/perplexity_len_2048": 29.302206370051053, "val/loss_avg_len_1024": 3.451863897304051, "val/perplexity_len_1024": 31.55916055702456, "val/loss_avg_len_512": 3.5541689533909784, "val/perplexity_len_512": 34.95875553717333} +{"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 17149.437690322986, "val/train_update_time": 9747.28814456216, "val/loss": 3.3723305154115426, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 191.12918155093212, "val/val_tokens_per_second": 214305.3178359631, "val/loss_avg_len_2048": 3.3723305154115426, "val/perplexity_len_2048": 29.14637404176299, "val/loss_avg_len_1024": 3.446441471570078, 
"val/perplexity_len_1024": 31.388496477491135, "val/loss_avg_len_512": 3.548855001350306, "val/perplexity_len_512": 34.773479097890984} +{"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 17696.358517175075, "val/train_update_time": 10101.101109507843, "val/loss": 3.368293362279772, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 246.3307274680119, "val/val_tokens_per_second": 166280.5140918483, "val/loss_avg_len_2048": 3.368293362279772, "val/perplexity_len_2048": 29.02894286982928, "val/loss_avg_len_1024": 3.4423994922754817, "val/perplexity_len_1024": 31.26188088582337, "val/loss_avg_len_512": 3.5450093415079635, "val/perplexity_len_512": 34.64000893066873} +{"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 18306.998820198, "val/train_update_time": 10465.017247352516, "val/loss": 3.3645532914318377, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 201.65959867101628, "val/val_tokens_per_second": 203114.55675770427, "val/loss_avg_len_2048": 3.3645532914318377, "val/perplexity_len_2048": 28.920575344288608, "val/loss_avg_len_1024": 3.438682607750362, "val/perplexity_len_1024": 31.14589976227984, "val/loss_avg_len_512": 3.541240479665343, "val/perplexity_len_512": 34.50970123288121} +{"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 18853.344159234082, "val/train_update_time": 10809.319413250545, "val/loss": 3.361436649635225, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 244.70953505299985, "val/val_tokens_per_second": 167382.1168886156, "val/loss_avg_len_2048": 3.361436649635225, "val/perplexity_len_2048": 28.830580583950955, "val/loss_avg_len_1024": 3.43572733581001, "val/perplexity_len_1024": 31.05399103319084, 
"val/loss_avg_len_512": 3.538355437440705, "val/perplexity_len_512": 34.410282769855435} +{"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 19462.212121990975, "val/train_update_time": 11172.7729124584, "val/loss": 3.3589920665140265, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 212.01162798295263, "val/val_tokens_per_second": 193196.95051487224, "val/loss_avg_len_2048": 3.3589920665140265, "val/perplexity_len_2048": 28.76018790870942, "val/loss_avg_len_1024": 3.4332265336758923, "val/perplexity_len_1024": 30.976428171250745, "val/loss_avg_len_512": 3.5358822584063745, "val/perplexity_len_512": 34.32528513045831} +{"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 20004.831071814988, "val/train_update_time": 11502.97223422851, "val/loss": 3.356962317546993, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 246.35607897106092, "val/val_tokens_per_second": 166263.40283980372, "val/loss_avg_len_2048": 3.356962317546993, "val/perplexity_len_2048": 28.70187115122067, "val/loss_avg_len_1024": 3.431286059721187, "val/perplexity_len_1024": 30.91637750139814, "val/loss_avg_len_512": 3.533992369709909, "val/perplexity_len_512": 34.26047542294408} +{"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 20616.578735099058, "val/train_update_time": 11866.318235529237, "val/loss": 3.3554599901208655, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 228.49143660697155, "val/val_tokens_per_second": 179262.7356554082, "val/loss_avg_len_2048": 3.3554599901208655, "val/perplexity_len_2048": 28.65878391668004, "val/loss_avg_len_1024": 3.429749476152519, "val/perplexity_len_1024": 30.868908383200168, "val/loss_avg_len_512": 3.5324522381311283, 
"val/perplexity_len_512": 34.20775039500344} +{"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 21166.986645585042, "val/train_update_time": 12187.849889185163, "val/loss": 3.354476233608229, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 236.16754041495733, "val/val_tokens_per_second": 173436.19672725294, "val/loss_avg_len_2048": 3.354476233608229, "val/perplexity_len_2048": 28.630604514465613, "val/loss_avg_len_1024": 3.4288002763013354, "val/perplexity_len_1024": 30.839621521697836, "val/loss_avg_len_512": 3.531493121845834, "val/perplexity_len_512": 34.174956913411336} +{"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 21767.68956994405, "val/train_update_time": 12551.347988351132, "val/loss": 3.3539066611163086, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 244.70243748801295, "val/val_tokens_per_second": 167386.97178693398, "val/loss_avg_len_2048": 3.3539066611163086, "val/perplexity_len_2048": 28.614301952893154, "val/loss_avg_len_1024": 3.4281986616883895, "val/perplexity_len_1024": 30.82107353466212, "val/loss_avg_len_512": 3.530901245925203, "val/perplexity_len_512": 34.154735564180854} +{"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 22335.74923630804, "val/train_update_time": 12874.329680579598, "val/loss": 3.353608620805643, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 220.12170270108618, "val/val_tokens_per_second": 186078.880443795, "val/loss_avg_len_2048": 3.353608620805643, "val/perplexity_len_2048": 28.60577500819937, "val/loss_avg_len_1024": 3.4279116332200825, "val/perplexity_len_1024": 30.812228278614764, "val/loss_avg_len_512": 3.530627423015889, "val/perplexity_len_512": 
34.14538449544867} diff --git a/metrics/jsonlines/val_data_info.jsonl b/metrics/jsonlines/val_data_info.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8d2a41d2d94f5b2005b74e9163cd291dacf51e5d --- /dev/null +++ b/metrics/jsonlines/val_data_info.jsonl @@ -0,0 +1 @@ +{"step": 0, "val_data_info/vocab_size": 50277, "val_data_info/global_tokens_per_batch": 2048, "val_data_info/local_tokens_per_batch": 2048, "val_data_info/batch_len": 2048, "val_data_info/seq_len": 2048, "val_data_info/total_tokens": 2147483648, "val_data_info/global_batch_size": 1, "val_data_info/local_batch_size": 1} diff --git a/metrics/npz/train_eval/step-000000104857600.npz b/metrics/npz/train_eval/step-000000104857600.npz new file mode 100644 index 0000000000000000000000000000000000000000..9378bb7fb8bad20cba5c2e46f5c7bbb612e7eb22 --- /dev/null +++ b/metrics/npz/train_eval/step-000000104857600.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6a2df6c4524c1fc852a0dcd21cc06ca765bbad8dfbe52ec454576a084431a9c +size 20540 diff --git a/metrics/npz/train_eval/step-000000209715200.npz b/metrics/npz/train_eval/step-000000209715200.npz new file mode 100644 index 0000000000000000000000000000000000000000..4712d9ca2be4e327025acbab5dfd10fb66a33dda --- /dev/null +++ b/metrics/npz/train_eval/step-000000209715200.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2de3934e9d3406a693ace41b171e525920e60d81785017ae65227ecb612cdee8 +size 20540 diff --git a/metrics/npz/train_eval/step-000000314572800.npz b/metrics/npz/train_eval/step-000000314572800.npz new file mode 100644 index 0000000000000000000000000000000000000000..376821a70fbb762452368e88d03a255689944589 --- /dev/null +++ b/metrics/npz/train_eval/step-000000314572800.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e74318a661b5b35a5ad6f3db3c67286b572e63eb79d232b173fe64c2cbb1ff2 +size 20540 diff --git a/metrics/npz/train_eval/step-000000419430400.npz 
b/metrics/npz/train_eval/step-000000419430400.npz new file mode 100644 index 0000000000000000000000000000000000000000..44796f07bfe3a4af8aa9f6b5bfa4c4ffe9bd8d44 --- /dev/null +++ b/metrics/npz/train_eval/step-000000419430400.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0be3a9cc277b3d750fdc918e0ffcdc5a7211d104c884ba950832917cb941f19 +size 20540 diff --git a/metrics/npz/train_eval/step-000000524288000.npz b/metrics/npz/train_eval/step-000000524288000.npz new file mode 100644 index 0000000000000000000000000000000000000000..04de9e3608cf3f9d8e38e77f86d64d0a8e0ede20 --- /dev/null +++ b/metrics/npz/train_eval/step-000000524288000.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84fe65aa85da98eb6a22324452eda47230962c35d352a5fa99db91c06e48594b +size 20540 diff --git a/metrics/npz/train_eval/step-000000629145600.npz b/metrics/npz/train_eval/step-000000629145600.npz new file mode 100644 index 0000000000000000000000000000000000000000..b224e456ee8749a1c996d5f71fa4a26c02a76331 --- /dev/null +++ b/metrics/npz/train_eval/step-000000629145600.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b055fa42dfa07ef66863077db67bd10e52c0fc34016c5e7545b9ba41c98f7f1 +size 20540 diff --git a/metrics/npz/train_eval/step-000000734003200.npz b/metrics/npz/train_eval/step-000000734003200.npz new file mode 100644 index 0000000000000000000000000000000000000000..1c3e3285c29face2dbf5979a5bbe182b28c15b16 --- /dev/null +++ b/metrics/npz/train_eval/step-000000734003200.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9b6abe3fd355d549ee5fe46e3eeaede9d84fff1f7c55155139fd08ed6bf6723 +size 20540 diff --git a/metrics/npz/train_eval/step-000000838860800.npz b/metrics/npz/train_eval/step-000000838860800.npz new file mode 100644 index 0000000000000000000000000000000000000000..5b3b0c6f706a0965d367cf844ea46ea16ff851f6 --- /dev/null +++ b/metrics/npz/train_eval/step-000000838860800.npz @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:bac4c3e3f598fb5c1343e57e48d6d5f264b3ae145517363a12c561cd254d0002 +size 20540 diff --git a/metrics/npz/train_eval/step-000000943718400.npz b/metrics/npz/train_eval/step-000000943718400.npz new file mode 100644 index 0000000000000000000000000000000000000000..43896b2ac423ce7e9a64d8abd2c495d939c4371d --- /dev/null +++ b/metrics/npz/train_eval/step-000000943718400.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0f090899a4cf090909837ba778d0ec7ad67c71de31d8c2cf91715d2b477338f +size 20540 diff --git a/metrics/npz/train_eval/step-000001048576000.npz b/metrics/npz/train_eval/step-000001048576000.npz new file mode 100644 index 0000000000000000000000000000000000000000..c6c1ea4206bd95508a0aca4379fbe364edbe097c --- /dev/null +++ b/metrics/npz/train_eval/step-000001048576000.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:945ebe7c70a99dd85eaa71bb95c9e93120dbcefbc495ca0b7e85a86d5d0fff86 +size 20540 diff --git a/metrics/npz/train_eval/step-000001153433600.npz b/metrics/npz/train_eval/step-000001153433600.npz new file mode 100644 index 0000000000000000000000000000000000000000..05e27b06fd99e33355564a9ca6e014c8cb8915dd --- /dev/null +++ b/metrics/npz/train_eval/step-000001153433600.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1dfa61eb526b6e5e0f0282e3102e3c1ce97327652921e63f60ce18ce52515b8 +size 20540 diff --git a/metrics/npz/train_eval/step-000001258291200.npz b/metrics/npz/train_eval/step-000001258291200.npz new file mode 100644 index 0000000000000000000000000000000000000000..f468d25240d75214b7aab0f4c4b071ba60aabf70 --- /dev/null +++ b/metrics/npz/train_eval/step-000001258291200.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0b2969a9d193d327f335461bbbf312662bc52d4e1003cc560c96a000b110f9b +size 20540 diff --git a/metrics/npz/train_eval/step-000001363148800.npz b/metrics/npz/train_eval/step-000001363148800.npz new 
file mode 100644 index 0000000000000000000000000000000000000000..802fc17b23db10191a42dc36d7509a23df6c7d86 --- /dev/null +++ b/metrics/npz/train_eval/step-000001363148800.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bee4807c19f76accf3c5cb9f38c18a8019485c9c782fed7c40694451ad7d1ac +size 20540 diff --git a/metrics/npz/train_eval/step-000001468006400.npz b/metrics/npz/train_eval/step-000001468006400.npz new file mode 100644 index 0000000000000000000000000000000000000000..a5a54d991750fca8497f92f04ab37ad6a72b2ea6 --- /dev/null +++ b/metrics/npz/train_eval/step-000001468006400.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d518154b8ff6ab23aaeb7728505801a7e8f9a35f20ff767a8b06efa8bc018b8 +size 20540 diff --git a/metrics/npz/train_eval/step-000001572864000.npz b/metrics/npz/train_eval/step-000001572864000.npz new file mode 100644 index 0000000000000000000000000000000000000000..4faeef7bc2e57345f1deb4d31c0891408f5b5c48 --- /dev/null +++ b/metrics/npz/train_eval/step-000001572864000.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bdef9179e935a275b64d0f420640bfae87caad82d53d6c5ea57d766176aaae9 +size 20540 diff --git a/metrics/npz/train_eval/step-000001677721600.npz b/metrics/npz/train_eval/step-000001677721600.npz new file mode 100644 index 0000000000000000000000000000000000000000..46cb4e7de1ee004279c283f164b869a202c78cea --- /dev/null +++ b/metrics/npz/train_eval/step-000001677721600.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0795ca28766da8836ea6789d7f5e21c1a22705e767eca0f5e8e800f3eb5f4c7b +size 20540 diff --git a/metrics/npz/train_eval/step-000001782579200.npz b/metrics/npz/train_eval/step-000001782579200.npz new file mode 100644 index 0000000000000000000000000000000000000000..1adbf918d5215f0e888c6fd2922594548e0c3d82 --- /dev/null +++ b/metrics/npz/train_eval/step-000001782579200.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f7927cbfbfe86aef4ffa92d7b9f31ee75e4b42ce12c162c1df659d4cc5975068 +size 20540 diff --git a/metrics/npz/train_eval/step-000001887436800.npz b/metrics/npz/train_eval/step-000001887436800.npz new file mode 100644 index 0000000000000000000000000000000000000000..f6f427296aa54b65389dad3305391527a03a88ad --- /dev/null +++ b/metrics/npz/train_eval/step-000001887436800.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5eadbb20a8dd81449d5395a183505bbcc27bb1f66577bd331fa5f57cc1e019b +size 20540 diff --git a/metrics/npz/train_eval/step-000001992294400.npz b/metrics/npz/train_eval/step-000001992294400.npz new file mode 100644 index 0000000000000000000000000000000000000000..4f7eec445fca41a5c6e214601eba045c9cc34c3a --- /dev/null +++ b/metrics/npz/train_eval/step-000001992294400.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26e290ff9097ff5fb8290f6c365dfb640d4167de77a293d3bc818522532b452f +size 20540 diff --git a/metrics/npz/val/step-000000041943040.npz b/metrics/npz/val/step-000000041943040.npz new file mode 100644 index 0000000000000000000000000000000000000000..0aac377d010ef04c30a3238b7131a9acf58ba10d --- /dev/null +++ b/metrics/npz/val/step-000000041943040.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1afba005fa0e131f389ffa8765630ac71fa5e158078a5dab81f9c8444f1fdcc +size 21142 diff --git a/metrics/npz/val/step-000000083886080.npz b/metrics/npz/val/step-000000083886080.npz new file mode 100644 index 0000000000000000000000000000000000000000..02ae03a23913d8c1e90ef62ff8a7ef0958dd1641 --- /dev/null +++ b/metrics/npz/val/step-000000083886080.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a3d39080ca52784b55636a30f9365eb8bd5aa4de5cacf75403e7824cdec0b9f +size 21142 diff --git a/metrics/npz/val/step-000000125829120.npz b/metrics/npz/val/step-000000125829120.npz new file mode 100644 index 
0000000000000000000000000000000000000000..91efc4e22c91b5034ed217402c2f101d9e0e82a1 --- /dev/null +++ b/metrics/npz/val/step-000000125829120.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69ffb91fd7f24e2de4e79dbaa3c28a8e31fe30be33c6c6c292237122a03d2a77 +size 21142 diff --git a/metrics/npz/val/step-000000167772160.npz b/metrics/npz/val/step-000000167772160.npz new file mode 100644 index 0000000000000000000000000000000000000000..9594d711728f7c32c6b59410ce695d67002effc0 --- /dev/null +++ b/metrics/npz/val/step-000000167772160.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0330c1a7f27095082f576326cb03bffc349c4014e608a8ed37fc9eda44b32b0 +size 21142 diff --git a/metrics/npz/val/step-000000209715200.npz b/metrics/npz/val/step-000000209715200.npz new file mode 100644 index 0000000000000000000000000000000000000000..1fc850da2f16541257b7d3af4f87e7fd02766146 --- /dev/null +++ b/metrics/npz/val/step-000000209715200.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e16eb39d2f4a0af8d04012f13ade18f75a300ba3f06bf5c09c26592937579d8 +size 21142 diff --git a/metrics/npz/val/step-000000251658240.npz b/metrics/npz/val/step-000000251658240.npz new file mode 100644 index 0000000000000000000000000000000000000000..03014a78befc27bd88eb82f825cea40e39f9dcd0 --- /dev/null +++ b/metrics/npz/val/step-000000251658240.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eff4cae00fcefca0c3fe1c367653ac57edcb2172befceddeef6a914e90277e48 +size 21142 diff --git a/metrics/npz/val/step-000000293601280.npz b/metrics/npz/val/step-000000293601280.npz new file mode 100644 index 0000000000000000000000000000000000000000..36785ad60524883021eefb6b46e832d1c61887d3 --- /dev/null +++ b/metrics/npz/val/step-000000293601280.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32b70a0c72e3b565cd6d59c6d876713d1dd6c2530ed53588f3b3008384fb44e6 +size 21142 diff --git 
a/metrics/npz/val/step-000000335544320.npz b/metrics/npz/val/step-000000335544320.npz new file mode 100644 index 0000000000000000000000000000000000000000..25fc19273fef82c632161bc925de4e1677b62db7 --- /dev/null +++ b/metrics/npz/val/step-000000335544320.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efefd618dfffff44754a8105027ef282466b996d02083333c08fab8f0536bf27 +size 21142 diff --git a/metrics/npz/val/step-000000377487360.npz b/metrics/npz/val/step-000000377487360.npz new file mode 100644 index 0000000000000000000000000000000000000000..34386804d71bec8edd2f891ccc2ebcfaf859ddd0 --- /dev/null +++ b/metrics/npz/val/step-000000377487360.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82ba680d799647446f4a0382ffcc26d0e2afc3554414f72c993bbba65f1133fc +size 21142 diff --git a/metrics/npz/val/step-000000419430400.npz b/metrics/npz/val/step-000000419430400.npz new file mode 100644 index 0000000000000000000000000000000000000000..14248e10b563aab5c0ff1ff3666fe69545c2b691 --- /dev/null +++ b/metrics/npz/val/step-000000419430400.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d02544a609f2b43afea5ca41d7d4c246fc8cee881a6d8888cf822849a89d794f +size 21142 diff --git a/metrics/npz/val/step-000000461373440.npz b/metrics/npz/val/step-000000461373440.npz new file mode 100644 index 0000000000000000000000000000000000000000..aec2fe0a77cdb9c244846d87c52043535215619e --- /dev/null +++ b/metrics/npz/val/step-000000461373440.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4afcfce8e37b2e0d4d3fccc7f69547ac7cee4f8e045f75dac56844b989126cfe +size 21142 diff --git a/metrics/npz/val/step-000000503316480.npz b/metrics/npz/val/step-000000503316480.npz new file mode 100644 index 0000000000000000000000000000000000000000..eab6b105554653c3991de917e605e783ea5aa662 --- /dev/null +++ b/metrics/npz/val/step-000000503316480.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:430f2fccc94969caa9791fe71f822bd9646aaec96baab7c2d57b2104b64bb54a +size 21142 diff --git a/metrics/npz/val/step-000000545259520.npz b/metrics/npz/val/step-000000545259520.npz new file mode 100644 index 0000000000000000000000000000000000000000..512351cd85ab9c13cf3f30445636d3e844c02cc1 --- /dev/null +++ b/metrics/npz/val/step-000000545259520.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc962b27c69c9e71d3a01da262f074fe3d6b95a88b54baf0825d944183ab2fdb +size 21142 diff --git a/metrics/npz/val/step-000000587202560.npz b/metrics/npz/val/step-000000587202560.npz new file mode 100644 index 0000000000000000000000000000000000000000..ac83ff5841fec0a1bd1bb449244b98e3a7b78c11 --- /dev/null +++ b/metrics/npz/val/step-000000587202560.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6976d451a0d884c2df13b69c4b587618be53608d0f9ea5ce115963028a5c61d9 +size 21142 diff --git a/metrics/npz/val/step-000000629145600.npz b/metrics/npz/val/step-000000629145600.npz new file mode 100644 index 0000000000000000000000000000000000000000..4d49d41836d803a8e77a97f4a7b27d7c599a8fae --- /dev/null +++ b/metrics/npz/val/step-000000629145600.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b21a5ff328593c11c42abb4fd8a9a513a2f15505df4ff75c128ab220a285cc0 +size 21142 diff --git a/metrics/npz/val/step-000000671088640.npz b/metrics/npz/val/step-000000671088640.npz new file mode 100644 index 0000000000000000000000000000000000000000..57d3bd98974d5b2d833ca0feb8315b9fe34ace8f --- /dev/null +++ b/metrics/npz/val/step-000000671088640.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a638dea38c9f5bdd140259a58155fe1fe904d0f5a11bbd7bd1c1c8a3ce54202a +size 21142 diff --git a/metrics/npz/val/step-000000713031680.npz b/metrics/npz/val/step-000000713031680.npz new file mode 100644 index 0000000000000000000000000000000000000000..6e39bd6b9790a8419a22c7fc29ec81f80f7fd78f --- /dev/null +++ 
b/metrics/npz/val/step-000000713031680.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cebc61653de9015d9b7fdf7f7fdc6de072d4f50962b381bf907310c018b4f8f +size 21142 diff --git a/metrics/npz/val/step-000000754974720.npz b/metrics/npz/val/step-000000754974720.npz new file mode 100644 index 0000000000000000000000000000000000000000..8285418ab9deb4c60ce052985e218ea4283fbc37 --- /dev/null +++ b/metrics/npz/val/step-000000754974720.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd7a2d228d9192175a3741b5bef67d062074396808d12ed86dcfa2eadf747963 +size 21142 diff --git a/metrics/npz/val/step-000000796917760.npz b/metrics/npz/val/step-000000796917760.npz new file mode 100644 index 0000000000000000000000000000000000000000..566736d20f4e906b917cd83fc03f0a51a32cb7cd --- /dev/null +++ b/metrics/npz/val/step-000000796917760.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab7f406913ba8f73543b3d54fcd116a8e6b217a0f2090cabe27ea0d8405b848f +size 21142 diff --git a/metrics/npz/val/step-000000838860800.npz b/metrics/npz/val/step-000000838860800.npz new file mode 100644 index 0000000000000000000000000000000000000000..91e9a66b1fa7cbadf49971f6a7902d2ffe3b210f --- /dev/null +++ b/metrics/npz/val/step-000000838860800.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e46ec0f533f3cce0aeda0d3f7bbbf1e13c80865c006dba39d34a32d533979166 +size 21142 diff --git a/metrics/npz/val/step-000000880803840.npz b/metrics/npz/val/step-000000880803840.npz new file mode 100644 index 0000000000000000000000000000000000000000..e97d7fcd518e4e4e87320e858821ff12df34644d --- /dev/null +++ b/metrics/npz/val/step-000000880803840.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b421f8da8317fcc3648c3ce66129b0ed4d4060ec43357989ef9fbfffc166346a +size 21142 diff --git a/metrics/npz/val/step-000000922746880.npz b/metrics/npz/val/step-000000922746880.npz new file mode 100644 index 
0000000000000000000000000000000000000000..11935847f7594d54efa1e30fc08e2b106635b86b --- /dev/null +++ b/metrics/npz/val/step-000000922746880.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e86f483ffe3247f40dc0bca1dbd4ddd8e2e1914efb0c1505ae54ff00d71a5bf6 +size 21142 diff --git a/metrics/npz/val/step-000000964689920.npz b/metrics/npz/val/step-000000964689920.npz new file mode 100644 index 0000000000000000000000000000000000000000..80186a16e0a95eb84d1a9b7b4d762704634684f5 --- /dev/null +++ b/metrics/npz/val/step-000000964689920.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75c8ad85b6ebfe0f6b699c2daba66792a253c6d19ced764584f53ba4e1c5638f +size 21142 diff --git a/metrics/npz/val/step-000001006632960.npz b/metrics/npz/val/step-000001006632960.npz new file mode 100644 index 0000000000000000000000000000000000000000..4603d008996a622f728adbda247ed4565596f889 --- /dev/null +++ b/metrics/npz/val/step-000001006632960.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dea62372974225674cf4acd898618b0d61c5ef3c82f5fafbbb67c3946c33a87 +size 21142 diff --git a/metrics/npz/val/step-000001048576000.npz b/metrics/npz/val/step-000001048576000.npz new file mode 100644 index 0000000000000000000000000000000000000000..eec3b7b7e338f302f7780ce202e50b408a8fc175 --- /dev/null +++ b/metrics/npz/val/step-000001048576000.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dacca53111e6fe81b4b62881355cc7768e02ba562f7b0cc4405e0260834a29f +size 21142 diff --git a/metrics/npz/val/step-000001090519040.npz b/metrics/npz/val/step-000001090519040.npz new file mode 100644 index 0000000000000000000000000000000000000000..86db18e679c83b741274fb9c582639ad73237aac --- /dev/null +++ b/metrics/npz/val/step-000001090519040.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58263b65ee8d6db05dc4b2bce8a0ff5c35722f1f05940479c799414dbeff9fe1 +size 21142 diff --git 
a/metrics/npz/val/step-000001132462080.npz b/metrics/npz/val/step-000001132462080.npz new file mode 100644 index 0000000000000000000000000000000000000000..c313f4aa6310e5f186f0896e09c043f488bac55c --- /dev/null +++ b/metrics/npz/val/step-000001132462080.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1434d16c2eea1497a401b18d9222999b2d36bfa3f4b0aaa2abf7b4e86e9dec8 +size 21142 diff --git a/metrics/npz/val/step-000001174405120.npz b/metrics/npz/val/step-000001174405120.npz new file mode 100644 index 0000000000000000000000000000000000000000..41ab819c5ef054c6a2c305be8bb40c1e2f0d814c --- /dev/null +++ b/metrics/npz/val/step-000001174405120.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03c3f9298d86c55f8ed6160afedb28aa19292377f44db058efeafff81e459114 +size 21142 diff --git a/metrics/npz/val/step-000001216348160.npz b/metrics/npz/val/step-000001216348160.npz new file mode 100644 index 0000000000000000000000000000000000000000..38230976ea9eb46497cdbc1fb92812b545d53f53 --- /dev/null +++ b/metrics/npz/val/step-000001216348160.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9995839db949d64ebe19a0986a75b82594ccd1d9bf575265cd5e75130339f89 +size 21142 diff --git a/metrics/npz/val/step-000001258291200.npz b/metrics/npz/val/step-000001258291200.npz new file mode 100644 index 0000000000000000000000000000000000000000..510f99ed2c136ebbee3b4ae19b32a1b6337d238f --- /dev/null +++ b/metrics/npz/val/step-000001258291200.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:821b1c06264ee73594c457894692b8d348043b9990c22b55fd5510b86c9838a3 +size 21142 diff --git a/metrics/npz/val/step-000001300234240.npz b/metrics/npz/val/step-000001300234240.npz new file mode 100644 index 0000000000000000000000000000000000000000..f31653fa890afeb4ccfa19b9265c60f732077c4a --- /dev/null +++ b/metrics/npz/val/step-000001300234240.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d93c9ca168eaa614829a1168cfbddd970e0aea1ae99ea09c21a6ff57d74ce026 +size 21142 diff --git a/metrics/npz/val/step-000001342177280.npz b/metrics/npz/val/step-000001342177280.npz new file mode 100644 index 0000000000000000000000000000000000000000..bc68b876b4f1eb3e98753303fed1c3c1bce9e839 --- /dev/null +++ b/metrics/npz/val/step-000001342177280.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6061552495c7a57cbb0edc34787099504c382c992c835be645835a70c989d867 +size 21142 diff --git a/metrics/npz/val/step-000001384120320.npz b/metrics/npz/val/step-000001384120320.npz new file mode 100644 index 0000000000000000000000000000000000000000..ddc5abc9bd5be253ba4b5db264e580a31e7caea6 --- /dev/null +++ b/metrics/npz/val/step-000001384120320.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5154aabfcafe84c5170220cc5795ce67c92609da8ccb4c934b2d3dd8efbdeea8 +size 21142 diff --git a/metrics/npz/val/step-000001426063360.npz b/metrics/npz/val/step-000001426063360.npz new file mode 100644 index 0000000000000000000000000000000000000000..10ad167b124e5c71c5069fcaebde9dcb1a116ce8 --- /dev/null +++ b/metrics/npz/val/step-000001426063360.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f26bfe74a8614d9c57d31f726df7f55c6e4d104f7fb6cd86397e405d6a847b2d +size 21142 diff --git a/metrics/npz/val/step-000001468006400.npz b/metrics/npz/val/step-000001468006400.npz new file mode 100644 index 0000000000000000000000000000000000000000..dbf0a3d4a911d36fd7fe160dedbd14754a3cb8a5 --- /dev/null +++ b/metrics/npz/val/step-000001468006400.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16161cfcca250c1774035315c08076d5ea63639245b2c6ed71d67130f2c42b96 +size 21142 diff --git a/metrics/npz/val/step-000001509949440.npz b/metrics/npz/val/step-000001509949440.npz new file mode 100644 index 0000000000000000000000000000000000000000..261cdf644a414e635f04bbe13650bbcf78bbd4a3 --- /dev/null +++ 
b/metrics/npz/val/step-000001509949440.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd4fe041c9f95cb3f13e1a42c10c8e4393c14ed470ec3a91a51dc13eb94aea6f +size 21142 diff --git a/metrics/npz/val/step-000001551892480.npz b/metrics/npz/val/step-000001551892480.npz new file mode 100644 index 0000000000000000000000000000000000000000..3cc3cf0971c5e16e182e9d900ee0d3ad0346aed6 --- /dev/null +++ b/metrics/npz/val/step-000001551892480.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e027c9ace9dd8ef4ef66ecc0786b6c08fd2b87ff1ace4a73c43e4185366dc7b8 +size 21142 diff --git a/metrics/npz/val/step-000001593835520.npz b/metrics/npz/val/step-000001593835520.npz new file mode 100644 index 0000000000000000000000000000000000000000..d0a9387b4bc0551aa342d86c095034f5f07d6e5e --- /dev/null +++ b/metrics/npz/val/step-000001593835520.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a503d154c7df05d1f829e154ebb1014a64760ccefa13be1c5b0e8e94c9791fc1 +size 21142 diff --git a/metrics/npz/val/step-000001635778560.npz b/metrics/npz/val/step-000001635778560.npz new file mode 100644 index 0000000000000000000000000000000000000000..bf3e974794e2e055c1f72ef00c3e83ede4be563f --- /dev/null +++ b/metrics/npz/val/step-000001635778560.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41aba0883ed726632d03cb32daceabdcd67e3d4cbf6f027793938c00d19786d3 +size 21142 diff --git a/metrics/npz/val/step-000001677721600.npz b/metrics/npz/val/step-000001677721600.npz new file mode 100644 index 0000000000000000000000000000000000000000..5f850e0af62b135cb09f79ff5e21b847108e50c3 --- /dev/null +++ b/metrics/npz/val/step-000001677721600.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f5a6ab636c3cf38734952cffecc607cd825c976c5c947bc97775d8e0b0c9549 +size 21142 diff --git a/metrics/npz/val/step-000001719664640.npz b/metrics/npz/val/step-000001719664640.npz new file mode 100644 index 
0000000000000000000000000000000000000000..93f12266176bc1d7ddcbf9e9e5561354b4aa8403 --- /dev/null +++ b/metrics/npz/val/step-000001719664640.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa97768487bffe0381d5df075405036544a37ff22ac11a6f6f584fb10d76b4ea +size 21142 diff --git a/metrics/npz/val/step-000001761607680.npz b/metrics/npz/val/step-000001761607680.npz new file mode 100644 index 0000000000000000000000000000000000000000..b246c6b2349bc527ca20939bc047ab1ec73616c3 --- /dev/null +++ b/metrics/npz/val/step-000001761607680.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa6b08a2bd8705a2f3ec289acdc5c8acb8c46733da194b1318e5bd35d92e9ebf +size 21142 diff --git a/metrics/npz/val/step-000001803550720.npz b/metrics/npz/val/step-000001803550720.npz new file mode 100644 index 0000000000000000000000000000000000000000..20e2f5ee8355cd3e2b4c2d595a87314489336801 --- /dev/null +++ b/metrics/npz/val/step-000001803550720.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3139d68625bef65e2133b233de6b8ca22bab9b8c6de73be788774830a6e2eaab +size 21142 diff --git a/metrics/npz/val/step-000001845493760.npz b/metrics/npz/val/step-000001845493760.npz new file mode 100644 index 0000000000000000000000000000000000000000..cf39a0e9a23d73c3ed6e73e9507f307a259cce38 --- /dev/null +++ b/metrics/npz/val/step-000001845493760.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cae7a23a47a0bed93f19b261d0310be4aa9a9ff43cd1b0f4b7b0986693f1b45 +size 21142 diff --git a/metrics/npz/val/step-000001887436800.npz b/metrics/npz/val/step-000001887436800.npz new file mode 100644 index 0000000000000000000000000000000000000000..fc49f51ad668545d59df53f995e205cea06d4a21 --- /dev/null +++ b/metrics/npz/val/step-000001887436800.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eace69e77e3cd9a380faf48ed4960c9cb6354e4d364cfe876eb6bdecd09182e4 +size 21142 diff --git 
a/metrics/npz/val/step-000001929379840.npz b/metrics/npz/val/step-000001929379840.npz new file mode 100644 index 0000000000000000000000000000000000000000..96ef8d899aeb07e26f0a69ead080585a8dcf57f1 --- /dev/null +++ b/metrics/npz/val/step-000001929379840.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01edaaae081d48c0e8ece5ddd5aa63bf4c3680128694461ac99e93641ce077f6 +size 21142 diff --git a/metrics/npz/val/step-000001971322880.npz b/metrics/npz/val/step-000001971322880.npz new file mode 100644 index 0000000000000000000000000000000000000000..84d33a393abe73a1c1f63787bf52410acf038cfe --- /dev/null +++ b/metrics/npz/val/step-000001971322880.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d78c2a6ed39508ff17085ec21f0a2ce427aa4bca67405f1613fe30e4aebe80a8 +size 21142 diff --git a/metrics/npz/val/step-000002013265920.npz b/metrics/npz/val/step-000002013265920.npz new file mode 100644 index 0000000000000000000000000000000000000000..37ba90a26f8f2c7b70ab885a0e1682836adf45c8 --- /dev/null +++ b/metrics/npz/val/step-000002013265920.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfb85c5cae2bf7b0c4ddbbe06c0ed783885597e4cb7482cc831f500f53ebdf5e +size 21142 diff --git a/metrics/npz/val/step-000002055208960.npz b/metrics/npz/val/step-000002055208960.npz new file mode 100644 index 0000000000000000000000000000000000000000..3072714833ce98f375ff643720aea0fde2ef9fcb --- /dev/null +++ b/metrics/npz/val/step-000002055208960.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a911ff75b19dde6f41ffcb3dcf5d2e83ebf4412f2d67751ed5dfbdf940b596c +size 21142 diff --git a/metrics/wandb/wandb_run_id.txt b/metrics/wandb/wandb_run_id.txt new file mode 100644 index 0000000000000000000000000000000000000000..934705dafa2b53a7af254ab528434acf40fef854 --- /dev/null +++ b/metrics/wandb/wandb_run_id.txt @@ -0,0 +1 @@ +y8zione3 \ No newline at end of file diff --git a/model.txt b/model.txt new file mode 
100644 index 0000000000000000000000000000000000000000..d89e5570b26cdd61ae40d1308f4853defb2f18b1 --- /dev/null +++ b/model.txt @@ -0,0 +1,25 @@ +ForgettingTransformerForCausalLM( + (model): ForgettingTransformerModel( + (embeddings): Embedding(50277, 512) + (layers): ModuleList( + (0-5): 6 x ForgettingTransformerBlock( + (attn_norm): RMSNorm(512, eps=1e-06) + (attn): ForgettingAttentionLayer( + (q_proj): Linear(in_features=512, out_features=512, bias=False) + (k_proj): Linear(in_features=512, out_features=512, bias=False) + (v_proj): Linear(in_features=512, out_features=512, bias=False) + (o_proj): Linear(in_features=512, out_features=512, bias=False) + (fgate_proj): Linear(in_features=512, out_features=8, bias=True) + ) + (mlp_norm): RMSNorm(512, eps=1e-06) + (mlp): ForgettingTransformerMLP( + (gate_proj): Linear(in_features=512, out_features=3072, bias=False) + (down_proj): Linear(in_features=1536, out_features=512, bias=False) + (act_fn): SiLU() + ) + ) + ) + (norm): RMSNorm(512, eps=1e-06) + ) + (lm_head): Linear(in_features=512, out_features=50277, bias=False) +) diff --git a/modeling_transformer.py b/modeling_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..b70f526e9824d2cf247dcac4ce13cf288351bddb --- /dev/null +++ b/modeling_transformer.py @@ -0,0 +1,573 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import functional as F +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_outputs import (BaseModelOutputWithPast, + CausalLMOutputWithPast) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import logging + +# from fla.layers.attn import Attention +from fla.modules import FusedCrossEntropyLoss, RMSNorm +from fla.modules.activations 
import swiglu_linear + +from fla.modules import RotaryEmbedding +try: + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import (index_first_axis, pad_input, + unpad_input) +except ImportError: + warnings.warn("Flash Attention is not installed. Please install it via `pip install flash-attn --no-build-isolation`") + flash_attn_func = None +from einops import rearrange + +from forgetting_transformer.model.transformer.configuration_transformer import TransformerConfig + +from functools import partial + +logger = logging.get_logger(__name__) + +class Attention(nn.Module): + + def __init__( + self, + hidden_size: int = 2048, + num_heads: int = 32, + num_kv_heads: Optional[int] = None, + window_size: Optional[int] = None, + max_position_embeddings: Optional[int] = None, + rope_base: float = 500000.0, + use_rope: bool = True, + layer_idx: int = None, + ): + super().__init__() + + self.num_heads = num_heads + if num_kv_heads is None: + self.num_kv_heads = self.num_heads + else: + self.num_kv_heads = num_kv_heads + self.num_kv_groups = num_heads // self.num_kv_heads + self.hidden_size = hidden_size + self.head_dim = self.hidden_size // self.num_heads + self.kv_dim = self.num_kv_heads * self.head_dim + self.kv_dim = self.num_kv_heads * self.head_dim + self.window_size = window_size + self.max_position_embeddings = max_position_embeddings + self.layer_idx = layer_idx + + self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False) + self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False) + + if use_rope: + self.rotary = RotaryEmbedding(self.head_dim, base=rope_base) + else: + self.rotary = None + + + self.apply(self._initialize_weights) + + def _initialize_weights(self, module: nn.Module): + pass + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: 
Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + batch_size, q_len, _ = hidden_states.size() + q = rearrange(self.q_proj(hidden_states), '... (h d) -> ... h d', h=self.num_heads) + k = rearrange(self.k_proj(hidden_states), '... (h d) -> ... h d', h=self.num_kv_heads) + v = rearrange(self.v_proj(hidden_states), 'b t (h d) -> b h t d', h=self.num_kv_heads) + + seqlen_offset, max_seqlen = 0, q.shape[1] + if past_key_values is not None: + seqlen_offset = past_key_values.get_seq_length(self.layer_idx) + max_seqlen = q.shape[1] + seqlen_offset + + if attention_mask is not None: + # to deliminate the offsets of padding tokens + seqlen_offset = (seqlen_offset + attention_mask.sum(-1) - attention_mask.shape[-1]) + max_seqlen = q.shape[1] + max(seqlen_offset) + + if self.max_position_embeddings is not None: + max_seqlen = max(max_seqlen, self.max_position_embeddings) + if self.rotary is not None: + q, k = self.rotary(q, k, seqlen_offset, max_seqlen) + + k = rearrange(k, 'b t h d -> b h t d') + if past_key_values is not None: + k, v = past_key_values.update(k, v, self.layer_idx) + k, v = rearrange(k, 'b h t d -> b t h d'), rearrange(v, 'b h t d -> b t h d') + if self.num_kv_groups > 1: + k = rearrange(k.unsqueeze(-2).repeat(1, 1, 1, self.num_kv_groups, 1), 'b t h g d -> b t (h g) d') + v = rearrange(v.unsqueeze(-2).repeat(1, 1, 1, self.num_kv_groups, 1), 'b t h g d -> b t (h g) d') + + if flash_attn_func is None: + raise ImportError("Please install Flash Attention via `pip install flash-attn --no-build-isolation` first") + + # Contains at least one padding token in the sequence + if attention_mask is not None: + q, k, v, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(q, k, v, attention_mask, q_len) + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_q, max_seqlen_k = 
max_seq_lens + o = flash_attn_varlen_func( + q, k, v, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_q, + max_seqlen_k=max_seqlen_k, + causal=True, + window_size=(-1, -1) if self.window_size is None else (self.window_size-1, 0) + ) + o = pad_input(o, indices_q, batch_size, q_len) + else: + o = flash_attn_func( + q, k, v, + causal=True, + window_size=(-1, -1) if self.window_size is None else (self.window_size-1, 0) + ) + o = o.reshape(batch_size, q_len, self.hidden_size) + o = self.o_proj(o) + + if not output_attentions: + attentions = None + + return o, attentions, past_key_values + + def _upad_input(self, q, k, v, attention_mask, q_len): + seqlens = attention_mask.sum(-1, dtype=torch.int32) + indices_k = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_k = seqlens.max().item() + cu_seqlens_k = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0)) + batch_size, seq_len, num_key_value_heads, head_dim = k.shape + + k = index_first_axis(k.reshape(batch_size * seq_len, num_key_value_heads, head_dim), indices_k) + v = index_first_axis(v.reshape(batch_size * seq_len, num_key_value_heads, head_dim), indices_k) + if q_len == seq_len: + q = index_first_axis(q.reshape(batch_size * seq_len, self.num_heads, head_dim), indices_k) + cu_seqlens_q = cu_seqlens_k + max_seqlen_q = max_seqlen_k + indices_q = indices_k + elif q_len == 1: + max_seqlen_q = 1 + # There is a memcpy here, that is very bad. + cu_seqlens_q = torch.arange(batch_size + 1, dtype=torch.int32, device=q.device) + indices_q = cu_seqlens_q[:-1] + q = q.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
+ attention_mask = attention_mask[:, -q_len:] + q, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, attention_mask) + + return q, k, v, indices_q, (cu_seqlens_q, cu_seqlens_k), (max_seqlen_q, max_seqlen_k) + + +class TransformerMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + hidden_ratio: Optional[int] = None, + intermediate_size: Optional[int] = None, + hidden_act: str = 'swish' + ) -> TransformerMLP: + super().__init__() + + self.hidden_size = hidden_size + # the final number of params is `hidden_ratio * hidden_size^2` + # `intermediate_size` is chosen to be a multiple of 256 closest to `2/3 * hidden_size * hidden_ratio` + if hidden_ratio is None: + hidden_ratio = 4 + if intermediate_size is None: + intermediate_size = int(hidden_size * hidden_ratio * 2 / 3) + intermediate_size = 256 * ((intermediate_size + 256 - 1) // 256) + self.hidden_ratio = hidden_ratio + self.intermediate_size = intermediate_size + + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[hidden_act] + + def forward(self, x): + y = self.gate_proj(x) + gate, y = y.chunk(2, -1) + # TODO: maybe wrap swiglu_linear in custom_fwd/custom_bwd + return swiglu_linear( + gate, y, + self.down_proj.weight.to(y.dtype), + self.down_proj.bias.to(y.dtype) if self.down_proj.bias is not None else self.down_proj.bias + ) + + +class TransformerBlock(nn.Module): + def __init__(self, config: TransformerConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.attn_norm = RMSNorm(hidden_size=config.hidden_size, eps=config.norm_eps) + self.attn = Attention( + hidden_size=config.hidden_size, + num_heads=config.num_heads, + num_kv_heads=config.num_kv_heads, + window_size=config.window_size, + max_position_embeddings=config.max_position_embeddings, + rope_base=config.rope_base, + use_rope=config.use_rope, + layer_idx=layer_idx 
+ ) + self.mlp_norm = RMSNorm(hidden_size=config.hidden_size, eps=config.norm_eps) + self.mlp = TransformerMLP( + hidden_size=config.hidden_size, + hidden_ratio=config.hidden_ratio, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act + ) + + def forward_attn( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ): + # reisual handled outside + # residual = hidden_states + hidden_states = self.attn_norm(hidden_states) + hidden_states, attentions, past_key_values = self.attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions + ) + return hidden_states, attentions, past_key_values + + def forward_mlp( + self, + hidden_states: torch.Tensor, + residual: torch.Tensor, + ): + hidden_states, residual = self.mlp_norm(hidden_states, residual, True) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + gradient_checkpointing: bool = False + # **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + + residual = hidden_states + + + if gradient_checkpointing: + forward_attn = partial(torch.utils.checkpoint.checkpoint, self.forward_attn, use_reentrant=False) + forward_mlp = partial(torch.utils.checkpoint.checkpoint, self.forward_mlp, use_reentrant=False) + else: + forward_attn = self.forward_attn + forward_mlp = self.forward_mlp + + hidden_states, attentions, past_key_values = forward_attn( + hidden_states=hidden_states, + 
attention_mask=attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions + ) + + hidden_states = forward_mlp( + hidden_states, + residual, + ) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attentions,) + + if use_cache: + outputs += (past_key_values,) + + return outputs + + + +class TransformerPreTrainedModel(PreTrainedModel): + + config_class = TransformerConfig + supports_gradient_checkpointing = True + _no_split_modules = ['TransformerBlock'] + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + def _init_weights( + self, + module: nn.Module, + ): + if isinstance(module, (nn.Linear, nn.Conv1d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +class TransformerModel(TransformerPreTrainedModel): + + def __init__(self, config: TransformerConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList([TransformerBlock(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]) + self.norm = RMSNorm(config.hidden_size, eps=config.norm_eps) + + self.gradient_checkpointing = False + + self.post_init() + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, value): + self.embeddings = value + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = 
None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None + ) -> Union[Tuple, CausalLMOutputWithPast]: + if output_attentions: + warnings.warn( + "`TransformerModel` does not support output attention weights now, so `output_attentions` is set to `False`." + ) + output_attentions = False + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids) + + # embed positions + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
class TransformerForCausalLM(TransformerPreTrainedModel):
    """Causal language model built from a `TransformerModel` and a linear LM head.

    The head is a bias-free `nn.Linear` projecting `config.hidden_size`
    features to `config.vocab_size` logits.
    """

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        """Build the backbone and the LM head from `config`, then run `post_init()`."""
        super().__init__(config)
        self.model = TransformerModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the backbone's token embedding module."""
        return self.model.embeddings

    def set_input_embeddings(self, value):
        """Replace the backbone's token embedding module with `value`."""
        self.model.embeddings = value

    def get_output_embeddings(self):
        """Return the LM head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the LM head with `new_embeddings`."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the backbone model with `decoder`."""
        self.model = decoder

    def get_decoder(self):
        """Return the backbone model."""
        return self.model

    def prepare_inputs_for_generation(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs
    ):
        """Assemble the keyword arguments for one decoding step.

        Args:
            input_ids: Token ids generated so far. When `past_key_values` is
                given, only the last position is kept.
            past_key_values: Cache from previous steps, forwarded unchanged.
            attention_mask: Forwarded unchanged.
            inputs_embeds: Used instead of `input_ids` only when there is no
                cache yet (i.e. on the first generation step).
            **kwargs: Only `use_cache` is read; everything else is ignored.

        Returns:
            A dict with either `inputs_embeds` or `input_ids`, plus
            `past_key_values`, `use_cache` and `attention_mask`.
        """
        # only last token for `inputs_ids` if the `past_key_values` is passed along.
        if past_key_values is not None:
            input_ids = input_ids[:, -1:]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {'inputs_embeds': inputs_embeds}
        else:
            # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise
            # recompiles graphs as the stride of the inputs is a guard.
            # Ref: https://github.com/huggingface/transformers/pull/29114
            # TODO: use `next_tokens` directly instead.
            model_inputs = {'input_ids': input_ids.contiguous()}

        model_inputs.update({
            'past_key_values': past_key_values,
            'use_cache': kwargs.get('use_cache'),
            'attention_mask': attention_mask,
        })
        return model_inputs

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the backbone and the LM head, optionally computing a loss.

        When `labels` is given, an unreduced cross-entropy loss
        (`reduction='none'`) is computed and reshaped to `labels`' shape, and
        the returned `logits` is `None`. When `labels` is `None`, no loss is
        computed and the logits are returned.

        Labels are NOT shifted inside this method: position `i` of the logits
        is scored against position `i` of `labels`.
        NOTE(review): presumably callers pass already-shifted targets; confirm
        against the data pipeline.

        Args:
            input_ids, attention_mask, past_key_values, inputs_embeds,
            use_cache: Forwarded to the backbone unchanged.
            labels: Optional targets; moved to the logits' device before use.
                May be non-contiguous (e.g. a slice of a larger tensor).
            output_attentions, output_hidden_states, return_dict: Fall back to
                the corresponding `self.config` values when `None`.

        Returns:
            A `CausalLMOutputWithPast` when `return_dict` resolves to true,
            otherwise a tuple `(loss?, logits, *backbone_outputs[1:])` where
            `loss` is present only if it was computed.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            if self.config.fuse_cross_entropy:
                loss_fct = FusedCrossEntropyLoss(inplace_backward=True, reduction='none')
            else:
                loss_fct = nn.CrossEntropyLoss(reduction='none')
            # Enable model parallelism
            labels = labels.to(logits.device)
            # `reshape` rather than `view`: `view(-1)` raises on non-contiguous
            # labels (such as `tokens[:, 1:]`), while `reshape` returns a view
            # when it can and copies only when it must.
            flat_labels = labels.reshape(-1)
            loss = loss_fct(logits.view(-1, self.config.vocab_size), flat_labels)
            # Unreduced loss: restore the original label layout.
            loss = loss.view(*labels.size())
            # Logits are not returned alongside a loss; drop the reference so
            # the tensor can be released.
            del logits
            logits = None

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
+_forward_module.model.layers.3.mlp_norm.weight +_forward_module.model.layers.4.attn_norm.weight +_forward_module.model.layers.4.attn.fgate_proj.bias +_forward_module.model.layers.4.mlp_norm.weight +_forward_module.model.layers.5.attn_norm.weight +_forward_module.model.layers.5.attn.fgate_proj.bias +_forward_module.model.layers.5.mlp_norm.weight +_forward_module.model.norm.weight