diff --git a/.hydra/config.yaml b/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5998500bb8ba4de3e370d4eebe7e33bfafe0cf93 --- /dev/null +++ b/.hydra/config.yaml @@ -0,0 +1,93 @@ +model: + _target_: forgetting_transformer.model.alibi.modeling_alibi.AlibiForCausalLM + config: + _target_: forgetting_transformer.model.alibi.configuration_alibi.AlibiConfig + vocab_size: ??? + hidden_size: 256 + hidden_ratio: 4 + intermediate_size: null + num_hidden_layers: 2 + num_heads: 4 + num_kv_heads: null + hidden_act: swish + window_size: null + max_position_embeddings: null + initializer_range: 0.02 + elementwise_affine: true + norm_eps: 1.0e-06 + use_cache: true + pad_token_id: null + bos_token_id: null + eos_token_id: null + tie_word_embeddings: false + attention_bias: false + fuse_norm: true + fuse_cross_entropy: true + use_rope: false + use_alibi: true +optimizer: + _target_: torch.optim.AdamW + lr: 0.001 + betas: + - 0.9 + - 0.95 + weight_decay: 0.1 +schedule: + _target_: forgetting_transformer.schedule.warmup_cosine_decay_schedule + init_value: 0.0 + peak_value: ${optimizer.lr} + warmup_steps: 20971520 + decay_steps: ${train.max_tokens} + end_value: 0.0 +datamodule: + _target_: forgetting_transformer.datamodule.npy.NpyDataModule + data_path: ${data_dir} + rank: ??? + world_size: ??? + train_batch_len: 2048 + train_batch_size: 1024 + train_num_workers: 0 + eval_tokens: 2147483648 + eval_batch_len: 2048 + eval_local_batch_size: 1 + eval_num_workers: 0 +strategy: + _target_: lightning.fabric.strategies.FSDPStrategy + state_dict_type: full + sharding_strategy: FULL_SHARD + cpu_offload: false +exp: alibi_2_4_256 +tag: alibi_2_4_256 +seed: 42 +hf_load_dir: null +hf_save_dir: null +hf_load_step: null +output_dir: ./alibi_2_4_256/ +data_dir: data +resume: false +fork_dir: null +fork_step: null +log_interval: 20971520 +eval_interval: 41943040 +final_eval: true +skip_eval: false +checkpoint_interval: 209715200 +train_eval_interval: 104857600 +checkpoint_keep_interval: 209715200 +fabric: + devices: 1 + precision: 16-mixed +train: + max_tokens: 2097152000 + grad_acc_tokens: 32768 + max_grad_norm: 1.0 + gradient_checkpointing: false + bias_weight_decay: false + normalization_weight_decay: false + conv_weight_decay: true +eval: + min_val_length: 512 +wandb: + project: forgetting-transformer + mode: online + log_dir: ./output/wandb diff --git a/.hydra/hydra.yaml b/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff946127a052ecc0a47afef4280326c18136585f --- /dev/null +++ b/.hydra/hydra.yaml @@ -0,0 +1,140 @@ +hydra: + run: + dir: ${output_dir} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + root: null + disable_existing_loggers: false + job_logging: + version: 1 + root: null + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - +experiment/pile/alibi=alibi_2_4_256 + - fabric.devices=1 + - fabric.precision=16-mixed + - seed=42 + - exp=alibi_2_4_256 + - tag=alibi_2_4_256 + - output_dir=./alibi_2_4_256/ + - wandb.log_dir=./output/wandb + - wandb.mode=online + - resume=false + job: + name: train + chdir: null + override_dirname: +experiment/pile/alibi=alibi_2_4_256,exp=alibi_2_4_256,fabric.devices=1,fabric.precision=16-mixed,output_dir=./alibi_2_4_256/,resume=false,seed=42,tag=alibi_2_4_256,wandb.log_dir=./output/wandb,wandb.mode=online + id: ??? + num: ??? + config_name: config + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /workspace/forgetting-transformer + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /workspace/forgetting-transformer/configs + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /workspace/forgetting-transformer/alibi_2_4_256 + choices: + experiment/pile/alibi: alibi_2_4_256 + strategy: fsdp + datamodule: npy + schedule: warmup_cosine + optimizer: adamw + model: alibi + hydra/env: default + hydra/callbacks: null + hydra/job_logging: none + hydra/hydra_logging: none + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/.hydra/overrides.yaml b/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ebbc314e45992b41be7b4bfa51bb3bba4500ab67 --- /dev/null +++ b/.hydra/overrides.yaml @@ -0,0 +1,10 @@ +- +experiment/pile/alibi=alibi_2_4_256 +- fabric.devices=1 +- fabric.precision=16-mixed +- seed=42 +- exp=alibi_2_4_256 +- tag=alibi_2_4_256 +- output_dir=./alibi_2_4_256/ +- wandb.log_dir=./output/wandb +- wandb.mode=online +- resume=false diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7f434981dc7bf9b220ed13f2cf53f70c18da7df0 --- /dev/null +++ b/__init__.py @@ -0,0 +1 @@ +# for HF remote code diff --git a/__pycache__/__init__.cpython-310.pyc b/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fa6fc1efccec774fae6b6dc03279f15a36467852 Binary files /dev/null and b/__pycache__/__init__.cpython-310.pyc differ diff --git a/__pycache__/configuration_transformer.cpython-310.pyc b/__pycache__/configuration_transformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb235c4360610154373c138bbdaa933131675b9e Binary files /dev/null and b/__pycache__/configuration_transformer.cpython-310.pyc differ diff --git a/__pycache__/modeling_transformer.cpython-310.pyc b/__pycache__/modeling_transformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f86d01b5bf6302ce831f5b6f7d092301d79b7fe Binary files /dev/null and b/__pycache__/modeling_transformer.cpython-310.pyc differ diff --git a/checkpoints/step-000000209715200.pt b/checkpoints/step-000000209715200.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b8d9ebcb97769f2c38def6d935c6c532903f7bf --- /dev/null +++ b/checkpoints/step-000000209715200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec850b83cd80f554cbaee361b572f4e98f46b3f38043b41115d6254c755d89b2 +size 329410370 diff --git a/checkpoints/step-000000209715200.pt.done b/checkpoints/step-000000209715200.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000000209715200.pt.keep b/checkpoints/step-000000209715200.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000000419430400.pt b/checkpoints/step-000000419430400.pt new file mode 100644 index 0000000000000000000000000000000000000000..eac774e3e52ab86b8847390c232a3276e9efd858 --- /dev/null +++ b/checkpoints/step-000000419430400.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36d99734f17e6bd7ed7f3c6507fd9f42049ebcc506c8d2295132be0d28691596 +size 329410370 diff --git a/checkpoints/step-000000419430400.pt.done b/checkpoints/step-000000419430400.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000000419430400.pt.keep b/checkpoints/step-000000419430400.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000000629145600.pt b/checkpoints/step-000000629145600.pt new file mode 100644 index 0000000000000000000000000000000000000000..017fa65561705ca0a8673cc30019205b68c246a5 --- /dev/null +++ b/checkpoints/step-000000629145600.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:103930d0d1c0a46cfe767d05427b41c0dcda68c38c677313985b658a6072b0d3 +size 329410370 diff --git a/checkpoints/step-000000629145600.pt.done b/checkpoints/step-000000629145600.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000000629145600.pt.keep b/checkpoints/step-000000629145600.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000000838860800.pt b/checkpoints/step-000000838860800.pt new file mode 100644 index 0000000000000000000000000000000000000000..487f03f5a0f2f9da3c4331a6867a3cb42abb592d --- /dev/null +++ b/checkpoints/step-000000838860800.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87a4d22266e6afe2151eb7f240291f97b54289f17cef162b1878fc706eb59f8b +size 329410370 diff --git a/checkpoints/step-000000838860800.pt.done b/checkpoints/step-000000838860800.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000000838860800.pt.keep b/checkpoints/step-000000838860800.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001048576000.pt b/checkpoints/step-000001048576000.pt new file mode 100644 index 0000000000000000000000000000000000000000..18f5cfd158af37cd8d0526ef07a8f7ed41a6ddd6 --- /dev/null +++ b/checkpoints/step-000001048576000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b9206d508292b688c2a10f2c37f8c7e843c2b0a3845f6ec9fe9add1e773423c +size 329410370 diff --git a/checkpoints/step-000001048576000.pt.done b/checkpoints/step-000001048576000.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001048576000.pt.keep b/checkpoints/step-000001048576000.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001258291200.pt b/checkpoints/step-000001258291200.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d10c7f67cbe86121b6367a2dda73694ef1fd71d --- /dev/null +++ b/checkpoints/step-000001258291200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:343fd80723001feb28b4fd1886c550885b02b39b3cb20dce26382e32835343f5 +size 329410370 diff --git a/checkpoints/step-000001258291200.pt.done b/checkpoints/step-000001258291200.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001258291200.pt.keep b/checkpoints/step-000001258291200.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001468006400.pt b/checkpoints/step-000001468006400.pt new file mode 100644 index 0000000000000000000000000000000000000000..b44e514d3c3052cd2bf4c4df3a68b465e4db5175 --- /dev/null +++ b/checkpoints/step-000001468006400.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:097bf7b63cf0e604e2c2567d1cef7e399f58b74336a8b9f73b61d81f68d546e4 +size 329410370 diff --git a/checkpoints/step-000001468006400.pt.done b/checkpoints/step-000001468006400.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001468006400.pt.keep b/checkpoints/step-000001468006400.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001677721600.pt b/checkpoints/step-000001677721600.pt new file mode 100644 index 0000000000000000000000000000000000000000..5353e67418c1feadb6235273f5a5c53bb1a2653f --- /dev/null +++ b/checkpoints/step-000001677721600.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c1358a05b23cc9074bf3c29c10a719418029e63ca84a1661c7121aaa03e777e +size 329410370 diff --git a/checkpoints/step-000001677721600.pt.done b/checkpoints/step-000001677721600.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001677721600.pt.keep b/checkpoints/step-000001677721600.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001887436800.pt b/checkpoints/step-000001887436800.pt new file mode 100644 index 0000000000000000000000000000000000000000..977bd37d2d422f5bb9041964f1cb637293b051c8 --- /dev/null +++ b/checkpoints/step-000001887436800.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cff1b93abb6e02112836ac20d76032e9051fd95369638b35ee3aa0cf5bf8553f +size 329410370 diff --git a/checkpoints/step-000001887436800.pt.done b/checkpoints/step-000001887436800.pt.done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/checkpoints/step-000001887436800.pt.keep b/checkpoints/step-000001887436800.pt.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e901a048786db56e50f61bc2518284181007d412 --- /dev/null +++ b/config.yaml @@ -0,0 +1,93 @@ +model: + _target_: forgetting_transformer.model.alibi.modeling_alibi.AlibiForCausalLM + config: + _target_: forgetting_transformer.model.alibi.configuration_alibi.AlibiConfig + vocab_size: ??? + hidden_size: 256 + hidden_ratio: 4 + intermediate_size: null + num_hidden_layers: 2 + num_heads: 4 + num_kv_heads: null + hidden_act: swish + window_size: null + max_position_embeddings: null + initializer_range: 0.02 + elementwise_affine: true + norm_eps: 1.0e-06 + use_cache: true + pad_token_id: null + bos_token_id: null + eos_token_id: null + tie_word_embeddings: false + attention_bias: false + fuse_norm: true + fuse_cross_entropy: true + use_rope: false + use_alibi: true +optimizer: + _target_: torch.optim.AdamW + lr: 0.001 + betas: + - 0.9 + - 0.95 + weight_decay: 0.1 +schedule: + _target_: forgetting_transformer.schedule.warmup_cosine_decay_schedule + init_value: 0.0 + peak_value: 0.001 + warmup_steps: 20971520 + decay_steps: 2097152000 + end_value: 0.0 +datamodule: + _target_: forgetting_transformer.datamodule.npy.NpyDataModule + data_path: /workspace/forgetting-transformer/data + rank: ??? + world_size: ??? + train_batch_len: 2048 + train_batch_size: 1024 + train_num_workers: 0 + eval_tokens: 2147483648 + eval_batch_len: 2048 + eval_local_batch_size: 1 + eval_num_workers: 0 +strategy: + _target_: lightning.fabric.strategies.FSDPStrategy + state_dict_type: full + sharding_strategy: FULL_SHARD + cpu_offload: false +exp: alibi_2_4_256 +tag: alibi_2_4_256 +seed: 42 +hf_load_dir: null +hf_save_dir: null +hf_load_step: null +output_dir: /workspace/forgetting-transformer/alibi_2_4_256 +data_dir: /workspace/forgetting-transformer/data +resume: false +fork_dir: null +fork_step: null +log_interval: 20971520 +eval_interval: 41943040 +final_eval: true +skip_eval: false +checkpoint_interval: 209715200 +train_eval_interval: 104857600 +checkpoint_keep_interval: 209715200 +fabric: + devices: 1 + precision: 16-mixed +train: + max_tokens: 2097152000 + grad_acc_tokens: 32768 + max_grad_norm: 1.0 + gradient_checkpointing: false + bias_weight_decay: false + normalization_weight_decay: false + conv_weight_decay: true +eval: + min_val_length: 512 +wandb: + project: forgetting-transformer + mode: online + log_dir: ./output/wandb diff --git a/configuration_transformer.py b/configuration_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..4b6767a5dcc859f307966491b13c2e44b35d8176 --- /dev/null +++ b/configuration_transformer.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- + +from typing import Optional + +from transformers.configuration_utils import PretrainedConfig + + +class TransformerConfig(PretrainedConfig): + + model_type = 'transformer-project_fox' + keys_to_ignore_at_inference = ['past_key_values'] + + def __init__( + self, + vocab_size: int = 32000, + hidden_size: int = 2048, + hidden_ratio: Optional[int] = 4, + intermediate_size: Optional[int] = None, + num_hidden_layers: int = 24, + num_heads: int = 32, + num_kv_heads: int = None, + hidden_act: str = "swish", + window_size: Optional[int] = None, + max_position_embeddings: int = 2048, + initializer_range: float = 0.02, + elementwise_affine: Optional[bool] = True, + norm_eps: float = 1e-6, + use_cache: bool = True, + pad_token_id: int = None, + bos_token_id: int = 1, + eos_token_id: int = 2, + tie_word_embeddings: bool = False, + attention_bias: bool = False, + fuse_norm: bool = True, + fuse_cross_entropy: bool = True, + rope_base: float = 500000.0, + use_rope: bool = True, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.hidden_ratio = hidden_ratio + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.window_size = window_size + self.max_position_embeddings = max_position_embeddings + + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.elementwise_affine = elementwise_affine + self.norm_eps = norm_eps + self.use_cache = use_cache + self.attention_bias = attention_bias + self.fuse_cross_entropy = fuse_cross_entropy + self.fuse_norm = fuse_norm + self.rope_base = rope_base + self.use_rope = use_rope + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/decay_params.txt b/decay_params.txt new file mode 100644 index 0000000000000000000000000000000000000000..594174bb9c7c453d9bfca41187ccdaf55c0f9b80 --- /dev/null +++ b/decay_params.txt @@ -0,0 +1,14 @@ +_forward_module._fsdp_wrapped_module.model.embeddings.weight +_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight +_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight +_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight +_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight +_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight +_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight +_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight +_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight +_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight +_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight +_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight +_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight +_forward_module._fsdp_wrapped_module.lm_head.weight diff --git a/logs/2025-10-28_17-33-58.log b/logs/2025-10-28_17-33-58.log new file mode 100644 index 0000000000000000000000000000000000000000..45711a2cfebb5437099f2281e5fd89e8932a8a22 --- /dev/null +++ b/logs/2025-10-28_17-33-58.log @@ -0,0 +1,258 @@ +[2025-10-28 17:33:59][train:372][INFO] All outputs will be saved to `/workspace/forgetting-transformer/alibi_2_4_256` +[2025-10-28 17:33:59][train:375][INFO] Configuration: +[2025-10-28 17:33:59][train:380][INFO] Configuration saved to /workspace/forgetting-transformer/alibi_2_4_256/config.yaml. +[2025-10-28 17:33:59][train:387][INFO] creating datamodule +[2025-10-28 17:33:59][train:419][INFO] creating model +[2025-10-28 17:33:59][train:440][INFO] creating optimizer +[2025-10-28 17:33:59][checkpoint:39][INFO] Not resuming. Deleting existing checkpoints... +[2025-10-28 17:33:59][logger:256][INFO] Setting up wandb logger... +[2025-10-28 17:33:59][logger:272][INFO] Not resuming. Creating a new wandb run. +[2025-10-28 17:34:00][logger:288][INFO] wandb initialized. Run id: lo4di2up +[2025-10-28 17:34:00][logger:186][INFO] Setting up jsonlines logger... +[2025-10-28 17:34:00][logger:113][INFO] Setting up npz logger... +[2025-10-28 17:34:00][logger:171][INFO] [step: 0] [train_data_info/vocab_size: 50277] [train_data_info/global_tokens_per_batch: 2097152] [train_data_info/local_tokens_per_batch: 2097152] [train_data_info/batch_len: 2048] [train_data_info/seq_len: 2048] [train_data_info/total_tokens: 2055208960] [train_data_info/global_batch_size: 1024] [train_data_info/local_batch_size: 1024] +[2025-10-28 17:34:00][logger:171][INFO] [step: 0] [val_data_info/vocab_size: 50277] [val_data_info/global_tokens_per_batch: 2048] [val_data_info/local_tokens_per_batch: 2048] [val_data_info/batch_len: 2048] [val_data_info/seq_len: 2048] [val_data_info/total_tokens: 2147483648] [val_data_info/global_batch_size: 1] [val_data_info/local_batch_size: 1] +[2025-10-28 17:34:00][logger:171][INFO] [step: 0] [model_info/total_params: 27447040] [model_info/trainable_params: 27447040] [model_info/embedding_params: 12870912] [model_info/flops_per_token: 0] [model_info/non_embedding_params: 14576128] +[2025-10-28 17:35:00][utils:57][INFO] [P: 1.00%] [S: 20971520/2097152000] [T: 0:00:59] [ETA: 1:38:12] [loss: 9.762] [tokens/s: 374995.302] [batches/s: 0.179] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 17:35:56][utils:57][INFO] [P: 2.00%] [S: 41943040/2097152000] [T: 0:01:55] [ETA: 1:34:14] [loss: 8.127] [tokens/s: 375180.565] [batches/s: 0.179] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 17:35:56][train:194][INFO] Running validation... +[2025-10-28 17:37:27][logger:171][INFO] [step: 41943040] [val/train_token_count: 41943040] [val/train_batch_count: 20] [val/train_flop_count: 0] [val/train_total_time: 115.389] [val/train_update_time: 115.085] [val/loss: 8.017] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.896] [val/val_tokens_per_second: 450625.947] [val/loss_avg_len_2048: 8.017] [val/perplexity_len_2048: 3033.046] [val/loss_avg_len_1024: 8.016] [val/perplexity_len_1024: 3029.389] [val/loss_avg_len_512: 8.017] [val/perplexity_len_512: 3030.799] +[2025-10-28 17:38:23][utils:57][INFO] [P: 3.00%] [S: 62914560/2097152000] [T: 0:04:22] [ETA: 2:21:16] [loss: 7.520] [tokens/s: 240408.593] [batches/s: 0.115] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 17:39:19][utils:57][INFO] [P: 4.00%] [S: 83886080/2097152000] [T: 0:05:18] [ETA: 2:07:12] [loss: 7.193] [tokens/s: 264837.441] [batches/s: 0.126] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 17:39:19][train:194][INFO] Running validation... +[2025-10-28 17:40:50][logger:171][INFO] [step: 83886080] [val/train_token_count: 83886080] [val/train_batch_count: 40] [val/train_flop_count: 0] [val/train_total_time: 318.011] [val/train_update_time: 226.570] [val/loss: 7.169] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 91.534] [val/val_tokens_per_second: 447486.078] [val/loss_avg_len_2048: 7.169] [val/perplexity_len_2048: 1298.286] [val/loss_avg_len_1024: 7.169] [val/perplexity_len_1024: 1298.841] [val/loss_avg_len_512: 7.173] [val/perplexity_len_512: 1303.146] +[2025-10-28 17:41:46][utils:57][INFO] [P: 5.00%] [S: 104857600/2097152000] [T: 0:07:45] [ETA: 2:27:22] [loss: 6.947] [tokens/s: 225245.860] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 17:41:46][logger:171][INFO] [step: 104857600] [train_eval/train_token_count: 104857600] [train_eval/train_batch_count: 50] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 465.399] [train_eval/train_update_time: 282.295] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 8.263] [train_eval/perplexity_len_2048: 3876.694] [train_eval/loss_avg_len_1024: 8.264] [train_eval/perplexity_len_1024: 3879.983] [train_eval/loss_avg_len_512: 8.264] [train_eval/perplexity_len_512: 3883.116] +[2025-10-28 17:42:42][utils:57][INFO] [P: 6.00%] [S: 125829120/2097152000] [T: 0:08:41] [ETA: 2:16:06] [loss: 6.683] [tokens/s: 241633.348] [batches/s: 0.115] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 17:42:42][train:194][INFO] Running validation... +[2025-10-28 17:44:13][logger:171][INFO] [step: 125829120] [val/train_token_count: 125829120] [val/train_batch_count: 60] [val/train_flop_count: 0] [val/train_total_time: 521.249] [val/train_update_time: 338.027] [val/loss: 6.682] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.733] [val/val_tokens_per_second: 451436.485] [val/loss_avg_len_2048: 6.682] [val/perplexity_len_2048: 797.878] [val/loss_avg_len_1024: 6.683] [val/perplexity_len_1024: 799.027] [val/loss_avg_len_512: 6.689] [val/perplexity_len_512: 803.835] +[2025-10-28 17:45:08][utils:57][INFO] [P: 7.00%] [S: 146800640/2097152000] [T: 0:11:07] [ETA: 2:27:52] [loss: 6.482] [tokens/s: 219698.786] [batches/s: 0.105] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 17:46:04][utils:57][INFO] [P: 8.00%] [S: 167772160/2097152000] [T: 0:12:03] [ETA: 2:18:42] [loss: 6.282] [tokens/s: 231876.710] [batches/s: 0.111] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 17:46:04][train:194][INFO] Running validation... +[2025-10-28 17:47:36][logger:171][INFO] [step: 167772160] [val/train_token_count: 167772160] [val/train_batch_count: 80] [val/train_flop_count: 0] [val/train_total_time: 723.681] [val/train_update_time: 449.489] [val/loss: 6.253] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 91.526] [val/val_tokens_per_second: 447522.667] [val/loss_avg_len_2048: 6.253] [val/perplexity_len_2048: 519.566] [val/loss_avg_len_1024: 6.256] [val/perplexity_len_1024: 521.125] [val/loss_avg_len_512: 6.265] [val/perplexity_len_512: 525.826] +[2025-10-28 17:48:32][utils:57][INFO] [P: 9.00%] [S: 188743680/2097152000] [T: 0:14:31] [ETA: 2:26:47] [loss: 6.091] [tokens/s: 216563.422] [batches/s: 0.103] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 17:49:28][utils:57][INFO] [P: 10.00%] [S: 209715200/2097152000] [T: 0:15:26] [ETA: 2:19:01] [loss: 5.968] [tokens/s: 226236.908] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 17:49:28][logger:171][INFO] [step: 209715200] [train_eval/train_token_count: 209715200] [train_eval/train_batch_count: 100] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 926.886] [train_eval/train_update_time: 560.935] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.398] [train_eval/perplexity_len_2048: 600.871] [train_eval/loss_avg_len_1024: 6.403] [train_eval/perplexity_len_1024: 603.442] [train_eval/loss_avg_len_512: 6.409] [train_eval/perplexity_len_512: 607.288] +[2025-10-28 17:49:28][train:194][INFO] Running validation... +[2025-10-28 17:50:58][logger:171][INFO] [step: 209715200] [val/train_token_count: 209715200] [val/train_batch_count: 100] [val/train_flop_count: 0] [val/train_total_time: 926.886] [val/train_update_time: 560.935] [val/loss: 5.955] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.689] [val/val_tokens_per_second: 451651.287] [val/loss_avg_len_2048: 5.955] [val/perplexity_len_2048: 385.761] [val/loss_avg_len_1024: 5.959] [val/perplexity_len_1024: 387.331] [val/loss_avg_len_512: 5.970] [val/perplexity_len_512: 391.625] +[2025-10-28 17:50:58][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000209715200.pt... +[2025-10-28 17:50:59][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000209715200.pt. +[2025-10-28 17:50:59][logger:171][INFO] [step: 209715200] [checkpoint/checkpoint_time: 0.452] +[2025-10-28 17:51:55][utils:57][INFO] [P: 11.00%] [S: 230686720/2097152000] [T: 0:17:53] [ETA: 2:24:48] [loss: 5.855] [tokens/s: 205815.587] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 17:52:50][utils:57][INFO] [P: 12.00%] [S: 251658240/2097152000] [T: 0:18:49] [ETA: 2:18:04] [loss: 5.713] [tokens/s: 226206.211] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 17:52:50][train:194][INFO] Running validation... +[2025-10-28 17:54:21][logger:171][INFO] [step: 251658240] [val/train_token_count: 251658240] [val/train_batch_count: 120] [val/train_flop_count: 0] [val/train_total_time: 1129.715] [val/train_update_time: 672.390] [val/loss: 5.728] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.612] [val/val_tokens_per_second: 452039.342] [val/loss_avg_len_2048: 5.728] [val/perplexity_len_2048: 307.227] [val/loss_avg_len_1024: 5.733] [val/perplexity_len_1024: 308.796] [val/loss_avg_len_512: 5.745] [val/perplexity_len_512: 312.716] +[2025-10-28 17:55:17][utils:57][INFO] [P: 13.00%] [S: 272629760/2097152000] [T: 0:21:16] [ETA: 2:22:20] [loss: 5.645] [tokens/s: 205884.255] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 17:56:13][utils:57][INFO] [P: 14.00%] [S: 293601280/2097152000] [T: 0:22:12] [ETA: 2:16:22] [loss: 5.569] [tokens/s: 226441.230] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 17:56:13][train:194][INFO] Running validation... +[2025-10-28 17:57:44][logger:171][INFO] [step: 293601280] [val/train_token_count: 293601280] [val/train_batch_count: 140] [val/train_flop_count: 0] [val/train_total_time: 1332.018] [val/train_update_time: 783.855] [val/loss: 5.546] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.990] [val/val_tokens_per_second: 450157.629] [val/loss_avg_len_2048: 5.546] [val/perplexity_len_2048: 256.270] [val/loss_avg_len_1024: 5.552] [val/perplexity_len_1024: 257.854] [val/loss_avg_len_512: 5.567] [val/perplexity_len_512: 261.533] +[2025-10-28 17:58:40][utils:57][INFO] [P: 15.00%] [S: 314572800/2097152000] [T: 0:24:38] [ETA: 2:19:40] [loss: 5.447] [tokens/s: 205994.060] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 17:58:40][logger:171][INFO] [step: 314572800] [train_eval/train_token_count: 314572800] [train_eval/train_batch_count: 150] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1478.875] [train_eval/train_update_time: 839.598] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.694] [train_eval/perplexity_len_2048: 297.217] [train_eval/loss_avg_len_1024: 5.700] [train_eval/perplexity_len_1024: 298.938] [train_eval/loss_avg_len_512: 5.712] [train_eval/perplexity_len_512: 302.436] +[2025-10-28 17:59:35][utils:57][INFO] [P: 16.00%] [S: 335544320/2097152000] [T: 0:25:34] [ETA: 2:14:17] [loss: 5.413] [tokens/s: 226374.776] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 17:59:35][train:194][INFO] Running validation... +[2025-10-28 18:01:06][logger:171][INFO] [step: 335544320] [val/train_token_count: 335544320] [val/train_batch_count: 160] [val/train_flop_count: 0] [val/train_total_time: 1534.725] [val/train_update_time: 895.339] [val/loss: 5.398] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.537] [val/val_tokens_per_second: 452412.827] [val/loss_avg_len_2048: 5.398] [val/perplexity_len_2048: 220.867] [val/loss_avg_len_1024: 5.405] [val/perplexity_len_1024: 222.462] [val/loss_avg_len_512: 5.420] [val/perplexity_len_512: 225.896] +[2025-10-28 18:02:02][utils:57][INFO] [P: 17.00%] [S: 356515840/2097152000] [T: 0:28:01] [ETA: 2:16:47] [loss: 5.301] [tokens/s: 206033.343] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:02:58][utils:57][INFO] [P: 18.00%] [S: 377487360/2097152000] [T: 0:28:56] [ETA: 2:11:52] [loss: 5.273] [tokens/s: 226617.464] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:02:58][train:194][INFO] Running validation... +[2025-10-28 18:04:28][logger:171][INFO] [step: 377487360] [val/train_token_count: 377487360] [val/train_batch_count: 180] [val/train_flop_count: 0] [val/train_total_time: 1736.966] [val/train_update_time: 1006.819] [val/loss: 5.267] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.563] [val/val_tokens_per_second: 452280.282] [val/loss_avg_len_2048: 5.267] [val/perplexity_len_2048: 193.790] [val/loss_avg_len_1024: 5.275] [val/perplexity_len_1024: 195.341] [val/loss_avg_len_512: 5.291] [val/perplexity_len_512: 198.610] +[2025-10-28 18:05:24][utils:57][INFO] [P: 19.00%] [S: 398458880/2097152000] [T: 0:31:23] [ETA: 2:13:49] [loss: 5.245] [tokens/s: 206226.294] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:06:20][utils:57][INFO] [P: 20.00%] [S: 419430400/2097152000] [T: 0:32:19] [ETA: 2:09:16] [loss: 5.151] [tokens/s: 226752.548] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:06:20][logger:171][INFO] [step: 419430400] [train_eval/train_token_count: 419430400] [train_eval/train_batch_count: 200] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1939.246] [train_eval/train_update_time: 1118.293] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.299] [train_eval/perplexity_len_2048: 200.122] [train_eval/loss_avg_len_1024: 5.307] [train_eval/perplexity_len_1024: 201.800] [train_eval/loss_avg_len_512: 5.322] [train_eval/perplexity_len_512: 204.871] +[2025-10-28 18:06:20][train:194][INFO] Running validation... +[2025-10-28 18:07:50][logger:171][INFO] [step: 419430400] [val/train_token_count: 419430400] [val/train_batch_count: 200] [val/train_flop_count: 0] [val/train_total_time: 1939.246] [val/train_update_time: 1118.293] [val/loss: 5.159] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.510] [val/val_tokens_per_second: 452546.668] [val/loss_avg_len_2048: 5.159] [val/perplexity_len_2048: 174.050] [val/loss_avg_len_1024: 5.168] [val/perplexity_len_1024: 175.577] [val/loss_avg_len_512: 5.186] [val/perplexity_len_512: 178.754] +[2025-10-28 18:07:50][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000419430400.pt... +[2025-10-28 18:07:51][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000419430400.pt. +[2025-10-28 18:07:51][logger:171][INFO] [step: 419430400] [checkpoint/checkpoint_time: 0.443] +[2025-10-28 18:08:47][utils:57][INFO] [P: 21.00%] [S: 440401920/2097152000] [T: 0:34:46] [ETA: 2:10:47] [loss: 5.107] [tokens/s: 206256.473] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:09:43][utils:57][INFO] [P: 22.00%] [S: 461373440/2097152000] [T: 0:35:41] [ETA: 2:06:34] [loss: 5.073] [tokens/s: 226657.953] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:09:43][train:194][INFO] Running validation... +[2025-10-28 18:11:13][logger:171][INFO] [step: 461373440] [val/train_token_count: 461373440] [val/train_batch_count: 220] [val/train_flop_count: 0] [val/train_total_time: 2141.922] [val/train_update_time: 1229.772] [val/loss: 5.062] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.533] [val/val_tokens_per_second: 452433.127] [val/loss_avg_len_2048: 5.062] [val/perplexity_len_2048: 157.830] [val/loss_avg_len_1024: 5.071] [val/perplexity_len_1024: 159.347] [val/loss_avg_len_512: 5.090] [val/perplexity_len_512: 162.465] +[2025-10-28 18:12:09][utils:57][INFO] [P: 23.00%] [S: 482344960/2097152000] [T: 0:38:08] [ETA: 2:07:40] [loss: 5.019] [tokens/s: 206268.387] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:13:05][utils:57][INFO] [P: 24.00%] [S: 503316480/2097152000] [T: 0:39:04] [ETA: 2:03:43] [loss: 4.965] [tokens/s: 226771.471] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:13:05][train:194][INFO] Running validation... +[2025-10-28 18:14:36][logger:171][INFO] [step: 503316480] [val/train_token_count: 503316480] [val/train_batch_count: 240] [val/train_flop_count: 0] [val/train_total_time: 2344.148] [val/train_update_time: 1341.235] [val/loss: 4.984] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 91.017] [val/val_tokens_per_second: 450028.173] [val/loss_avg_len_2048: 4.984] [val/perplexity_len_2048: 146.091] [val/loss_avg_len_1024: 4.995] [val/perplexity_len_1024: 147.602] [val/loss_avg_len_512: 5.015] [val/perplexity_len_512: 150.648] +[2025-10-28 18:15:32][utils:57][INFO] [P: 25.00%] [S: 524288000/2097152000] [T: 0:41:31] [ETA: 2:04:33] [loss: 4.966] [tokens/s: 206267.420] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:15:32][logger:171][INFO] [step: 524288000] [train_eval/train_token_count: 524288000] [train_eval/train_batch_count: 250] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2491.022] [train_eval/train_update_time: 1396.971] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.048] [train_eval/perplexity_len_2048: 155.777] [train_eval/loss_avg_len_1024: 5.057] [train_eval/perplexity_len_1024: 157.044] [train_eval/loss_avg_len_512: 5.074] [train_eval/perplexity_len_512: 159.766] +[2025-10-28 18:16:28][utils:57][INFO] [P: 26.00%] [S: 545259520/2097152000] [T: 0:42:26] [ETA: 2:00:48] [loss: 4.914] [tokens/s: 226652.272] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:16:28][train:194][INFO] Running validation... +[2025-10-28 18:17:58][logger:171][INFO] [step: 545259520] [val/train_token_count: 545259520] [val/train_batch_count: 260] [val/train_flop_count: 0] [val/train_total_time: 2546.884] [val/train_update_time: 1452.724] [val/loss: 4.917] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.512] [val/val_tokens_per_second: 452536.960] [val/loss_avg_len_2048: 4.917] [val/perplexity_len_2048: 136.564] [val/loss_avg_len_1024: 4.927] [val/perplexity_len_1024: 138.032] [val/loss_avg_len_512: 4.949] [val/perplexity_len_512: 140.976] +[2025-10-28 18:18:54][utils:57][INFO] [P: 27.00%] [S: 566231040/2097152000] [T: 0:44:53] [ETA: 2:01:21] [loss: 4.898] [tokens/s: 206268.199] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:19:50][utils:57][INFO] [P: 28.00%] [S: 587202560/2097152000] [T: 0:45:49] [ETA: 1:57:49] [loss: 4.850] [tokens/s: 226664.200] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:19:50][train:194][INFO] Running validation... +[2025-10-28 18:21:20][logger:171][INFO] [step: 587202560] [val/train_token_count: 587202560] [val/train_batch_count: 280] [val/train_flop_count: 0] [val/train_total_time: 2749.101] [val/train_update_time: 1564.203] [val/loss: 4.861] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.514] [val/val_tokens_per_second: 452527.805] [val/loss_avg_len_2048: 4.861] [val/perplexity_len_2048: 129.097] [val/loss_avg_len_1024: 4.872] [val/perplexity_len_1024: 130.570] [val/loss_avg_len_512: 4.894] [val/perplexity_len_512: 133.488] +[2025-10-28 18:22:16][utils:57][INFO] [P: 29.00%] [S: 608174080/2097152000] [T: 0:48:15] [ETA: 1:58:08] [loss: 4.818] [tokens/s: 206279.914] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:23:12][utils:57][INFO] [P: 30.00%] [S: 629145600/2097152000] [T: 0:49:11] [ETA: 1:54:46] [loss: 4.798] [tokens/s: 226778.201] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:23:12][logger:171][INFO] [step: 629145600] [train_eval/train_token_count: 629145600] [train_eval/train_batch_count: 300] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2951.316] [train_eval/train_update_time: 1675.688] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.882] [train_eval/perplexity_len_2048: 131.873] [train_eval/loss_avg_len_1024: 4.890] [train_eval/perplexity_len_1024: 132.942] [train_eval/loss_avg_len_512: 4.909] [train_eval/perplexity_len_512: 135.482] +[2025-10-28 18:23:12][train:194][INFO] Running validation... +[2025-10-28 18:24:42][logger:171][INFO] [step: 629145600] [val/train_token_count: 629145600] [val/train_batch_count: 300] [val/train_flop_count: 0] [val/train_total_time: 2951.316] [val/train_update_time: 1675.688] [val/loss: 4.812] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.517] [val/val_tokens_per_second: 452511.126] [val/loss_avg_len_2048: 4.812] [val/perplexity_len_2048: 122.947] [val/loss_avg_len_1024: 4.824] [val/perplexity_len_1024: 124.414] [val/loss_avg_len_512: 4.847] [val/perplexity_len_512: 127.337] +[2025-10-28 18:24:42][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000629145600.pt... +[2025-10-28 18:24:43][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000629145600.pt. +[2025-10-28 18:24:43][logger:171][INFO] [step: 629145600] [checkpoint/checkpoint_time: 0.436] +[2025-10-28 18:25:39][utils:57][INFO] [P: 31.00%] [S: 650117120/2097152000] [T: 0:51:38] [ETA: 1:54:55] [loss: 4.810] [tokens/s: 206285.423] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:26:35][utils:57][INFO] [P: 32.00%] [S: 671088640/2097152000] [T: 0:52:33] [ETA: 1:51:42] [loss: 4.744] [tokens/s: 226677.414] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:26:35][train:194][INFO] Running validation... +[2025-10-28 18:28:05][logger:171][INFO] [step: 671088640] [val/train_token_count: 671088640] [val/train_batch_count: 320] [val/train_flop_count: 0] [val/train_total_time: 3153.973] [val/train_update_time: 1787.177] [val/loss: 4.760] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.763] [val/val_tokens_per_second: 451284.325] [val/loss_avg_len_2048: 4.760] [val/perplexity_len_2048: 116.720] [val/loss_avg_len_1024: 4.772] [val/perplexity_len_1024: 118.146] [val/loss_avg_len_512: 4.796] [val/perplexity_len_512: 120.981] +[2025-10-28 18:29:01][utils:57][INFO] [P: 33.00%] [S: 692060160/2097152000] [T: 0:55:00] [ETA: 1:51:41] [loss: 4.759] [tokens/s: 206236.434] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:29:57][utils:57][INFO] [P: 34.00%] [S: 713031680/2097152000] [T: 0:55:56] [ETA: 1:48:35] [loss: 4.724] [tokens/s: 226734.784] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:29:57][train:194][INFO] Running validation... +[2025-10-28 18:31:28][logger:171][INFO] [step: 713031680] [val/train_token_count: 713031680] [val/train_batch_count: 340] [val/train_flop_count: 0] [val/train_total_time: 3356.451] [val/train_update_time: 1898.671] [val/loss: 4.717] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.574] [val/val_tokens_per_second: 452225.537] [val/loss_avg_len_2048: 4.717] [val/perplexity_len_2048: 111.799] [val/loss_avg_len_1024: 4.730] [val/perplexity_len_1024: 113.246] [val/loss_avg_len_512: 4.754] [val/perplexity_len_512: 116.073] +[2025-10-28 18:32:24][utils:57][INFO] [P: 35.00%] [S: 734003200/2097152000] [T: 0:58:22] [ETA: 1:48:25] [loss: 4.704] [tokens/s: 206325.847] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:32:24][logger:171][INFO] [step: 734003200] [train_eval/train_token_count: 734003200] [train_eval/train_batch_count: 350] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3502.878] [train_eval/train_update_time: 1954.413] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.752] [train_eval/perplexity_len_2048: 115.806] [train_eval/loss_avg_len_1024: 4.763] [train_eval/perplexity_len_1024: 117.122] [train_eval/loss_avg_len_512: 4.785] [train_eval/perplexity_len_512: 119.713] +[2025-10-28 18:33:19][utils:57][INFO] [P: 36.00%] [S: 754974720/2097152000] [T: 0:59:18] [ETA: 1:45:26] [loss: 4.634] [tokens/s: 226723.796] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:33:19][train:194][INFO] Running validation... +[2025-10-28 18:34:50][logger:171][INFO] [step: 754974720] [val/train_token_count: 754974720] [val/train_batch_count: 360] [val/train_flop_count: 0] [val/train_total_time: 3558.727] [val/train_update_time: 2010.155] [val/loss: 4.678] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.670] [val/val_tokens_per_second: 451747.243] [val/loss_avg_len_2048: 4.678] [val/perplexity_len_2048: 107.532] [val/loss_avg_len_1024: 4.691] [val/perplexity_len_1024: 108.985] [val/loss_avg_len_512: 4.717] [val/perplexity_len_512: 111.799] +[2025-10-28 18:35:46][utils:57][INFO] [P: 37.00%] [S: 775946240/2097152000] [T: 1:01:45] [ETA: 1:45:08] [loss: 4.657] [tokens/s: 206294.493] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:36:42][utils:57][INFO] [P: 38.00%] [S: 796917760/2097152000] [T: 1:02:41] [ETA: 1:42:16] [loss: 4.635] [tokens/s: 226682.610] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:36:42][train:194][INFO] Running validation... +[2025-10-28 18:38:12][logger:171][INFO] [step: 796917760] [val/train_token_count: 796917760] [val/train_batch_count: 380] [val/train_flop_count: 0] [val/train_total_time: 3761.110] [val/train_update_time: 2121.661] [val/loss: 4.641] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.679] [val/val_tokens_per_second: 451703.774] [val/loss_avg_len_2048: 4.641] [val/perplexity_len_2048: 103.622] [val/loss_avg_len_1024: 4.655] [val/perplexity_len_1024: 105.078] [val/loss_avg_len_512: 4.681] [val/perplexity_len_512: 107.891] +[2025-10-28 18:39:08][utils:57][INFO] [P: 39.00%] [S: 817889280/2097152000] [T: 1:05:07] [ETA: 1:41:51] [loss: 4.639] [tokens/s: 206257.892] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:40:04][utils:57][INFO] [P: 40.00%] [S: 838860800/2097152000] [T: 1:06:03] [ETA: 1:39:05] [loss: 4.546] [tokens/s: 226752.243] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:40:04][logger:171][INFO] [step: 838860800] [train_eval/train_token_count: 838860800] [train_eval/train_batch_count: 400] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3963.490] [train_eval/train_update_time: 2233.150] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.653] [train_eval/perplexity_len_2048: 104.895] [train_eval/loss_avg_len_1024: 4.664] [train_eval/perplexity_len_1024: 106.056] [train_eval/loss_avg_len_512: 4.688] [train_eval/perplexity_len_512: 108.609] +[2025-10-28 18:40:04][train:194][INFO] Running validation... +[2025-10-28 18:41:35][logger:171][INFO] [step: 838860800] [val/train_token_count: 838860800] [val/train_batch_count: 400] [val/train_flop_count: 0] [val/train_total_time: 3963.490] [val/train_update_time: 2233.150] [val/loss: 4.608] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.607] [val/val_tokens_per_second: 452063.792] [val/loss_avg_len_2048: 4.608] [val/perplexity_len_2048: 100.245] [val/loss_avg_len_1024: 4.622] [val/perplexity_len_1024: 101.732] [val/loss_avg_len_512: 4.650] [val/perplexity_len_512: 104.589] +[2025-10-28 18:41:35][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000838860800.pt... +[2025-10-28 18:41:35][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000838860800.pt. +[2025-10-28 18:41:35][logger:171][INFO] [step: 838860800] [checkpoint/checkpoint_time: 0.443] +[2025-10-28 18:42:31][utils:57][INFO] [P: 41.00%] [S: 859832320/2097152000] [T: 1:08:30] [ETA: 1:38:34] [loss: 4.568] [tokens/s: 206238.018] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:43:27][utils:57][INFO] [P: 42.00%] [S: 880803840/2097152000] [T: 1:09:26] [ETA: 1:35:53] [loss: 4.565] [tokens/s: 226680.204] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:43:27][train:194][INFO] Running validation... +[2025-10-28 18:44:57][logger:171][INFO] [step: 880803840] [val/train_token_count: 880803840] [val/train_batch_count: 420] [val/train_flop_count: 0] [val/train_total_time: 4166.243] [val/train_update_time: 2344.623] [val/loss: 4.578] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.583] [val/val_tokens_per_second: 452180.051] [val/loss_avg_len_2048: 4.578] [val/perplexity_len_2048: 97.333] [val/loss_avg_len_1024: 4.594] [val/perplexity_len_1024: 98.844] [val/loss_avg_len_512: 4.622] [val/perplexity_len_512: 101.702] +[2025-10-28 18:45:53][utils:57][INFO] [P: 43.00%] [S: 901775360/2097152000] [T: 1:11:52] [ETA: 1:35:16] [loss: 4.560] [tokens/s: 206280.440] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:46:49][utils:57][INFO] [P: 44.00%] [S: 922746880/2097152000] [T: 1:12:48] [ETA: 1:32:39] [loss: 4.582] [tokens/s: 226681.934] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:46:49][train:194][INFO] Running validation... +[2025-10-28 18:48:20][logger:171][INFO] [step: 922746880] [val/train_token_count: 922746880] [val/train_batch_count: 440] [val/train_flop_count: 0] [val/train_total_time: 4368.526] [val/train_update_time: 2456.116] [val/loss: 4.545] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.743] [val/val_tokens_per_second: 451384.214] [val/loss_avg_len_2048: 4.545] [val/perplexity_len_2048: 94.177] [val/loss_avg_len_1024: 4.561] [val/perplexity_len_1024: 95.677] [val/loss_avg_len_512: 4.590] [val/perplexity_len_512: 98.539] +[2025-10-28 18:49:16][utils:57][INFO] [P: 45.00%] [S: 943718400/2097152000] [T: 1:15:15] [ETA: 1:31:58] [loss: 4.528] [tokens/s: 206247.110] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:49:16][logger:171][INFO] [step: 943718400] [train_eval/train_token_count: 943718400] [train_eval/train_batch_count: 450] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4515.121] [train_eval/train_update_time: 2511.850] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.568] [train_eval/perplexity_len_2048: 96.398] [train_eval/loss_avg_len_1024: 4.583] [train_eval/perplexity_len_1024: 97.828] [train_eval/loss_avg_len_512: 4.611] [train_eval/perplexity_len_512: 100.631] +[2025-10-28 18:50:12][utils:57][INFO] [P: 46.00%] [S: 964689920/2097152000] [T: 1:16:10] [ETA: 1:29:25] [loss: 4.494] [tokens/s: 226662.302] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:50:12][train:194][INFO] Running validation... +[2025-10-28 18:51:42][logger:171][INFO] [step: 964689920] [val/train_token_count: 964689920] [val/train_batch_count: 460] [val/train_flop_count: 0] [val/train_total_time: 4570.982] [val/train_update_time: 2567.596] [val/loss: 4.518] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.593] [val/val_tokens_per_second: 452130.146] [val/loss_avg_len_2048: 4.518] [val/perplexity_len_2048: 91.612] [val/loss_avg_len_1024: 4.534] [val/perplexity_len_1024: 93.126] [val/loss_avg_len_512: 4.564] [val/perplexity_len_512: 96.009] +[2025-10-28 18:52:38][utils:57][INFO] [P: 47.00%] [S: 985661440/2097152000] [T: 1:18:37] [ETA: 1:28:39] [loss: 4.520] [tokens/s: 206261.701] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:53:34][utils:57][INFO] [P: 48.00%] [S: 1006632960/2097152000] [T: 1:19:33] [ETA: 1:26:11] [loss: 4.494] [tokens/s: 226681.695] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:53:34][train:194][INFO] Running validation... +[2025-10-28 18:55:05][logger:171][INFO] [step: 1006632960] [val/train_token_count: 1006632960] [val/train_batch_count: 480] [val/train_flop_count: 0] [val/train_total_time: 4773.293] [val/train_update_time: 2679.073] [val/loss: 4.491] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.618] [val/val_tokens_per_second: 452007.561] [val/loss_avg_len_2048: 4.491] [val/perplexity_len_2048: 89.181] [val/loss_avg_len_1024: 4.508] [val/perplexity_len_1024: 90.714] [val/loss_avg_len_512: 4.539] [val/perplexity_len_512: 93.629] +[2025-10-28 18:56:00][utils:57][INFO] [P: 49.00%] [S: 1027604480/2097152000] [T: 1:21:59] [ETA: 1:25:20] [loss: 4.490] [tokens/s: 206272.262] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:56:56][utils:57][INFO] [P: 50.00%] [S: 1048576000/2097152000] [T: 1:22:55] [ETA: 1:22:55] [loss: 4.463] [tokens/s: 226784.168] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 18:56:56][logger:171][INFO] [step: 1048576000] [train_eval/train_token_count: 1048576000] [train_eval/train_batch_count: 500] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4975.631] [train_eval/train_update_time: 2790.558] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.499] [train_eval/perplexity_len_2048: 89.960] [train_eval/loss_avg_len_1024: 4.514] [train_eval/perplexity_len_1024: 91.275] [train_eval/loss_avg_len_512: 4.544] [train_eval/perplexity_len_512: 94.106] +[2025-10-28 18:56:56][train:194][INFO] Running validation... +[2025-10-28 18:58:27][logger:171][INFO] [step: 1048576000] [val/train_token_count: 1048576000] [val/train_batch_count: 500] [val/train_flop_count: 0] [val/train_total_time: 4975.631] [val/train_update_time: 2790.558] [val/loss: 4.466] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.630] [val/val_tokens_per_second: 451946.929] [val/loss_avg_len_2048: 4.466] [val/perplexity_len_2048: 87.051] [val/loss_avg_len_1024: 4.484] [val/perplexity_len_1024: 88.594] [val/loss_avg_len_512: 4.517] [val/perplexity_len_512: 91.528] +[2025-10-28 18:58:27][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001048576000.pt... +[2025-10-28 18:58:27][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001048576000.pt. +[2025-10-28 18:58:27][logger:171][INFO] [step: 1048576000] [checkpoint/checkpoint_time: 0.438] +[2025-10-28 18:59:23][utils:57][INFO] [P: 51.00%] [S: 1069547520/2097152000] [T: 1:25:22] [ETA: 1:22:01] [loss: 4.463] [tokens/s: 206262.376] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:00:19][utils:57][INFO] [P: 52.00%] [S: 1090519040/2097152000] [T: 1:26:18] [ETA: 1:19:40] [loss: 4.453] [tokens/s: 226659.051] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:00:19][train:194][INFO] Running validation... +[2025-10-28 19:01:50][logger:171][INFO] [step: 1090519040] [val/train_token_count: 1090519040] [val/train_batch_count: 520] [val/train_flop_count: 0] [val/train_total_time: 5178.418] [val/train_update_time: 2902.025] [val/loss: 4.441] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.618] [val/val_tokens_per_second: 452008.856] [val/loss_avg_len_2048: 4.441] [val/perplexity_len_2048: 84.893] [val/loss_avg_len_1024: 4.460] [val/perplexity_len_1024: 86.474] [val/loss_avg_len_512: 4.494] [val/perplexity_len_512: 89.458] +[2025-10-28 19:02:46][utils:57][INFO] [P: 53.00%] [S: 1111490560/2097152000] [T: 1:28:44] [ETA: 1:18:42] [loss: 4.414] [tokens/s: 206256.696] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:03:41][utils:57][INFO] [P: 54.00%] [S: 1132462080/2097152000] [T: 1:29:40] [ETA: 1:16:23] [loss: 4.424] [tokens/s: 226693.969] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:03:41][train:194][INFO] Running validation... +[2025-10-28 19:05:12][logger:171][INFO] [step: 1132462080] [val/train_token_count: 1132462080] [val/train_batch_count: 540] [val/train_flop_count: 0] [val/train_total_time: 5380.720] [val/train_update_time: 3013.473] [val/loss: 4.421] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.536] [val/val_tokens_per_second: 452416.614] [val/loss_avg_len_2048: 4.421] [val/perplexity_len_2048: 83.142] [val/loss_avg_len_1024: 4.440] [val/perplexity_len_1024: 84.760] [val/loss_avg_len_512: 4.475] [val/perplexity_len_512: 87.782] +[2025-10-28 19:06:08][utils:57][INFO] [P: 55.00%] [S: 1153433600/2097152000] [T: 1:32:07] [ETA: 1:15:22] [loss: 4.370] [tokens/s: 206300.113] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:06:08][logger:171][INFO] [step: 1153433600] [train_eval/train_token_count: 1153433600] [train_eval/train_batch_count: 550] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5527.109] [train_eval/train_update_time: 3069.210] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.433] [train_eval/perplexity_len_2048: 84.219] [train_eval/loss_avg_len_1024: 4.448] [train_eval/perplexity_len_1024: 85.438] [train_eval/loss_avg_len_512: 4.479] [train_eval/perplexity_len_512: 88.131] +[2025-10-28 19:07:04][utils:57][INFO] [P: 56.00%] [S: 1174405120/2097152000] [T: 1:33:02] [ETA: 1:13:06] [loss: 4.411] [tokens/s: 226710.951] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:07:04][train:194][INFO] Running validation... +[2025-10-28 19:08:34][logger:171][INFO] [step: 1174405120] [val/train_token_count: 1174405120] [val/train_batch_count: 560] [val/train_flop_count: 0] [val/train_total_time: 5582.959] [val/train_update_time: 3124.938] [val/loss: 4.396] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.502] [val/val_tokens_per_second: 452587.756] [val/loss_avg_len_2048: 4.396] [val/perplexity_len_2048: 81.128] [val/loss_avg_len_1024: 4.416] [val/perplexity_len_1024: 82.762] [val/loss_avg_len_512: 4.452] [val/perplexity_len_512: 85.827] +[2025-10-28 19:09:30][utils:57][INFO] [P: 57.00%] [S: 1195376640/2097152000] [T: 1:35:29] [ETA: 1:12:02] [loss: 4.361] [tokens/s: 206324.776] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:10:26][utils:57][INFO] [P: 58.00%] [S: 1216348160/2097152000] [T: 1:36:25] [ETA: 1:09:49] [loss: 4.409] [tokens/s: 226752.429] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:10:26][train:194][INFO] Running validation... +[2025-10-28 19:11:56][logger:171][INFO] [step: 1216348160] [val/train_token_count: 1216348160] [val/train_batch_count: 580] [val/train_flop_count: 0] [val/train_total_time: 5785.136] [val/train_update_time: 3236.387] [val/loss: 4.375] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.566] [val/val_tokens_per_second: 452264.749] [val/loss_avg_len_2048: 4.375] [val/perplexity_len_2048: 79.412] [val/loss_avg_len_1024: 4.396] [val/perplexity_len_1024: 81.097] [val/loss_avg_len_512: 4.434] [val/perplexity_len_512: 84.267] +[2025-10-28 19:12:52][utils:57][INFO] [P: 59.00%] [S: 1237319680/2097152000] [T: 1:38:51] [ETA: 1:08:41] [loss: 4.390] [tokens/s: 206343.676] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:13:48][utils:57][INFO] [P: 60.00%] [S: 1258291200/2097152000] [T: 1:39:47] [ETA: 1:06:31] [loss: 4.390] [tokens/s: 226890.200] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:13:48][logger:171][INFO] [step: 1258291200] [train_eval/train_token_count: 1258291200] [train_eval/train_batch_count: 600] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5987.367] [train_eval/train_update_time: 3347.845] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.377] [train_eval/perplexity_len_2048: 79.612] [train_eval/loss_avg_len_1024: 4.393] [train_eval/perplexity_len_1024: 80.855] [train_eval/loss_avg_len_512: 4.428] [train_eval/perplexity_len_512: 83.760] +[2025-10-28 19:13:48][train:194][INFO] Running validation... +[2025-10-28 19:15:19][logger:171][INFO] [step: 1258291200] [val/train_token_count: 1258291200] [val/train_batch_count: 600] [val/train_flop_count: 0] [val/train_total_time: 5987.367] [val/train_update_time: 3347.845] [val/loss: 4.356] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.808] [val/val_tokens_per_second: 451063.348] [val/loss_avg_len_2048: 4.356] [val/perplexity_len_2048: 77.921] [val/loss_avg_len_1024: 4.378] [val/perplexity_len_1024: 79.645] [val/loss_avg_len_512: 4.417] [val/perplexity_len_512: 82.879] +[2025-10-28 19:15:19][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001258291200.pt... +[2025-10-28 19:15:19][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001258291200.pt. +[2025-10-28 19:15:19][logger:171][INFO] [step: 1258291200] [checkpoint/checkpoint_time: 0.446] +[2025-10-28 19:16:15][utils:57][INFO] [P: 61.00%] [S: 1279262720/2097152000] [T: 1:42:14] [ETA: 1:05:22] [loss: 4.355] [tokens/s: 206316.288] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:17:11][utils:57][INFO] [P: 62.00%] [S: 1300234240/2097152000] [T: 1:43:10] [ETA: 1:03:14] [loss: 4.328] [tokens/s: 226735.707] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:17:11][train:194][INFO] Running validation... +[2025-10-28 19:18:41][logger:171][INFO] [step: 1300234240] [val/train_token_count: 1300234240] [val/train_batch_count: 620] [val/train_flop_count: 0] [val/train_total_time: 6190.320] [val/train_update_time: 3459.312] [val/loss: 4.334] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.454] [val/val_tokens_per_second: 452824.486] [val/loss_avg_len_2048: 4.334] [val/perplexity_len_2048: 76.238] [val/loss_avg_len_1024: 4.357] [val/perplexity_len_1024: 78.001] [val/loss_avg_len_512: 4.398] [val/perplexity_len_512: 81.297] +[2025-10-28 19:19:37][utils:57][INFO] [P: 63.00%] [S: 1321205760/2097152000] [T: 1:45:36] [ETA: 1:02:01] [loss: 4.339] [tokens/s: 206342.439] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:20:33][utils:57][INFO] [P: 64.00%] [S: 1342177280/2097152000] [T: 1:46:32] [ETA: 0:59:55] [loss: 4.330] [tokens/s: 226742.795] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:20:33][train:194][INFO] Running validation... +[2025-10-28 19:22:04][logger:171][INFO] [step: 1342177280] [val/train_token_count: 1342177280] [val/train_batch_count: 640] [val/train_flop_count: 0] [val/train_total_time: 6392.507] [val/train_update_time: 3570.813] [val/loss: 4.316] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.482] [val/val_tokens_per_second: 452687.798] [val/loss_avg_len_2048: 4.316] [val/perplexity_len_2048: 74.856] [val/loss_avg_len_1024: 4.339] [val/perplexity_len_1024: 76.654] [val/loss_avg_len_512: 4.382] [val/perplexity_len_512: 80.018] +[2025-10-28 19:22:59][utils:57][INFO] [P: 65.00%] [S: 1363148800/2097152000] [T: 1:48:58] [ETA: 0:58:40] [loss: 4.303] [tokens/s: 206349.261] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:22:59][logger:171][INFO] [step: 1363148800] [train_eval/train_token_count: 1363148800] [train_eval/train_batch_count: 650] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6538.856] [train_eval/train_update_time: 3626.554] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.332] [train_eval/perplexity_len_2048: 76.080] [train_eval/loss_avg_len_1024: 4.354] [train_eval/perplexity_len_1024: 77.764] [train_eval/loss_avg_len_512: 4.393] [train_eval/perplexity_len_512: 80.909] +[2025-10-28 19:23:55][utils:57][INFO] [P: 66.00%] [S: 1384120320/2097152000] [T: 1:49:54] [ETA: 0:56:37] [loss: 4.325] [tokens/s: 226741.379] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:23:55][train:194][INFO] Running validation... +[2025-10-28 19:25:26][logger:171][INFO] [step: 1384120320] [val/train_token_count: 1384120320] [val/train_batch_count: 660] [val/train_flop_count: 0] [val/train_total_time: 6594.719] [val/train_update_time: 3682.305] [val/loss: 4.299] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.584] [val/val_tokens_per_second: 452179.132] [val/loss_avg_len_2048: 4.299] [val/perplexity_len_2048: 73.653] [val/loss_avg_len_1024: 4.324] [val/perplexity_len_1024: 75.478] [val/loss_avg_len_512: 4.368] [val/perplexity_len_512: 78.894] +[2025-10-28 19:26:22][utils:57][INFO] [P: 67.00%] [S: 1405091840/2097152000] [T: 1:52:21] [ETA: 0:55:20] [loss: 4.268] [tokens/s: 206324.362] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:27:18][utils:57][INFO] [P: 68.00%] [S: 1426063360/2097152000] [T: 1:53:17] [ETA: 0:53:18] [loss: 4.283] [tokens/s: 226722.267] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:27:18][train:194][INFO] Running validation... +[2025-10-28 19:28:48][logger:171][INFO] [step: 1426063360] [val/train_token_count: 1426063360] [val/train_batch_count: 680] [val/train_flop_count: 0] [val/train_total_time: 6797.037] [val/train_update_time: 3793.800] [val/loss: 4.282] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.482] [val/val_tokens_per_second: 452685.977] [val/loss_avg_len_2048: 4.282] [val/perplexity_len_2048: 72.367] [val/loss_avg_len_1024: 4.307] [val/perplexity_len_1024: 74.230] [val/loss_avg_len_512: 4.353] [val/perplexity_len_512: 77.716] +[2025-10-28 19:29:44][utils:57][INFO] [P: 69.00%] [S: 1447034880/2097152000] [T: 1:55:43] [ETA: 0:51:59] [loss: 4.283] [tokens/s: 206324.071] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:30:40][utils:57][INFO] [P: 70.00%] [S: 1468006400/2097152000] [T: 1:56:39] [ETA: 0:49:59] [loss: 4.279] [tokens/s: 226900.231] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:30:40][logger:171][INFO] [step: 1468006400] [train_eval/train_token_count: 1468006400] [train_eval/train_batch_count: 700] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6999.246] [train_eval/train_update_time: 3905.298] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.285] [train_eval/perplexity_len_2048: 72.618] [train_eval/loss_avg_len_1024: 4.308] [train_eval/perplexity_len_1024: 74.319] [train_eval/loss_avg_len_512: 4.353] [train_eval/perplexity_len_512: 77.722] +[2025-10-28 19:30:40][train:194][INFO] Running validation... +[2025-10-28 19:32:10][logger:171][INFO] [step: 1468006400] [val/train_token_count: 1468006400] [val/train_batch_count: 700] [val/train_flop_count: 0] [val/train_total_time: 6999.246] [val/train_update_time: 3905.298] [val/loss: 4.266] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.416] [val/val_tokens_per_second: 453016.768] [val/loss_avg_len_2048: 4.266] [val/perplexity_len_2048: 71.260] [val/loss_avg_len_1024: 4.292] [val/perplexity_len_1024: 73.148] [val/loss_avg_len_512: 4.340] [val/perplexity_len_512: 76.683] +[2025-10-28 19:32:10][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001468006400.pt... +[2025-10-28 19:32:11][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001468006400.pt. +[2025-10-28 19:32:11][logger:171][INFO] [step: 1468006400] [checkpoint/checkpoint_time: 0.443] +[2025-10-28 19:33:07][utils:57][INFO] [P: 71.00%] [S: 1488977920/2097152000] [T: 1:59:05] [ETA: 0:48:38] [loss: 4.275] [tokens/s: 206397.827] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:34:02][utils:57][INFO] [P: 72.00%] [S: 1509949440/2097152000] [T: 2:00:01] [ETA: 0:46:40] [loss: 4.251] [tokens/s: 226785.687] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:34:02][train:194][INFO] Running validation... +[2025-10-28 19:35:33][logger:171][INFO] [step: 1509949440] [val/train_token_count: 1509949440] [val/train_batch_count: 720] [val/train_flop_count: 0] [val/train_total_time: 7201.856] [val/train_update_time: 4016.813] [val/loss: 4.252] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.413] [val/val_tokens_per_second: 453034.368] [val/loss_avg_len_2048: 4.252] [val/perplexity_len_2048: 70.277] [val/loss_avg_len_1024: 4.280] [val/perplexity_len_1024: 72.211] [val/loss_avg_len_512: 4.328] [val/perplexity_len_512: 75.819] +[2025-10-28 19:36:29][utils:57][INFO] [P: 73.00%] [S: 1530920960/2097152000] [T: 2:02:28] [ETA: 0:45:17] [loss: 4.268] [tokens/s: 206403.190] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:37:25][utils:57][INFO] [P: 74.00%] [S: 1551892480/2097152000] [T: 2:03:24] [ETA: 0:43:21] [loss: 4.253] [tokens/s: 226803.239] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:37:25][train:194][INFO] Running validation... +[2025-10-28 19:38:55][logger:171][INFO] [step: 1551892480] [val/train_token_count: 1551892480] [val/train_batch_count: 740] [val/train_flop_count: 0] [val/train_total_time: 7404.001] [val/train_update_time: 4128.299] [val/loss: 4.240] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.461] [val/val_tokens_per_second: 452790.513] [val/loss_avg_len_2048: 4.240] [val/perplexity_len_2048: 69.406] [val/loss_avg_len_1024: 4.268] [val/perplexity_len_1024: 71.360] [val/loss_avg_len_512: 4.318] [val/perplexity_len_512: 75.015] +[2025-10-28 19:39:51][utils:57][INFO] [P: 75.00%] [S: 1572864000/2097152000] [T: 2:05:50] [ETA: 0:41:56] [loss: 4.235] [tokens/s: 206405.713] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:39:51][logger:171][INFO] [step: 1572864000] [train_eval/train_token_count: 1572864000] [train_eval/train_batch_count: 750] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7550.324] [train_eval/train_update_time: 4184.042] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.248] [train_eval/perplexity_len_2048: 69.957] [train_eval/loss_avg_len_1024: 4.272] [train_eval/perplexity_len_1024: 71.697] [train_eval/loss_avg_len_512: 4.320] [train_eval/perplexity_len_512: 75.223] +[2025-10-28 19:40:47][utils:57][INFO] [P: 76.00%] [S: 1593835520/2097152000] [T: 2:06:46] [ETA: 0:40:01] [loss: 4.197] [tokens/s: 226835.759] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:40:47][train:194][INFO] Running validation... +[2025-10-28 19:42:17][logger:171][INFO] [step: 1593835520] [val/train_token_count: 1593835520] [val/train_batch_count: 760] [val/train_flop_count: 0] [val/train_total_time: 7606.182] [val/train_update_time: 4239.792] [val/loss: 4.229] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.436] [val/val_tokens_per_second: 452919.183] [val/loss_avg_len_2048: 4.229] [val/perplexity_len_2048: 68.645] [val/loss_avg_len_1024: 4.258] [val/perplexity_len_1024: 70.649] [val/loss_avg_len_512: 4.309] [val/perplexity_len_512: 74.386] +[2025-10-28 19:43:13][utils:57][INFO] [P: 77.00%] [S: 1614807040/2097152000] [T: 2:09:12] [ETA: 0:38:35] [loss: 4.262] [tokens/s: 206438.318] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:44:09][utils:57][INFO] [P: 78.00%] [S: 1635778560/2097152000] [T: 2:10:08] [ETA: 0:36:42] [loss: 4.183] [tokens/s: 226855.893] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:44:09][train:194][INFO] Running validation... +[2025-10-28 19:45:39][logger:171][INFO] [step: 1635778560] [val/train_token_count: 1635778560] [val/train_batch_count: 780] [val/train_flop_count: 0] [val/train_total_time: 7808.319] [val/train_update_time: 4351.274] [val/loss: 4.219] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.456] [val/val_tokens_per_second: 452816.934] [val/loss_avg_len_2048: 4.219] [val/perplexity_len_2048: 67.958] [val/loss_avg_len_1024: 4.248] [val/perplexity_len_1024: 69.968] [val/loss_avg_len_512: 4.300] [val/perplexity_len_512: 73.721] +[2025-10-28 19:46:35][utils:57][INFO] [P: 79.00%] [S: 1656750080/2097152000] [T: 2:12:34] [ETA: 0:35:14] [loss: 4.212] [tokens/s: 206459.168] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:47:31][utils:57][INFO] [P: 80.00%] [S: 1677721600/2097152000] [T: 2:13:30] [ETA: 0:33:22] [loss: 4.190] [tokens/s: 226972.557] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:47:31][logger:171][INFO] [step: 1677721600] [train_eval/train_token_count: 1677721600] [train_eval/train_batch_count: 800] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 8010.438] [train_eval/train_update_time: 4462.723] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.221] [train_eval/perplexity_len_2048: 68.081] [train_eval/loss_avg_len_1024: 4.248] [train_eval/perplexity_len_1024: 69.977] [train_eval/loss_avg_len_512: 4.299] [train_eval/perplexity_len_512: 73.594] +[2025-10-28 19:47:31][train:194][INFO] Running validation... +[2025-10-28 19:49:02][logger:171][INFO] [step: 1677721600] [val/train_token_count: 1677721600] [val/train_batch_count: 800] [val/train_flop_count: 0] [val/train_total_time: 8010.438] [val/train_update_time: 4462.723] [val/loss: 4.211] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.428] [val/val_tokens_per_second: 452958.454] [val/loss_avg_len_2048: 4.211] [val/perplexity_len_2048: 67.393] [val/loss_avg_len_1024: 4.240] [val/perplexity_len_1024: 69.429] [val/loss_avg_len_512: 4.294] [val/perplexity_len_512: 73.234] +[2025-10-28 19:49:02][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001677721600.pt... +[2025-10-28 19:49:02][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001677721600.pt. +[2025-10-28 19:49:02][logger:171][INFO] [step: 1677721600] [checkpoint/checkpoint_time: 0.446] +[2025-10-28 19:49:58][utils:57][INFO] [P: 81.00%] [S: 1698693120/2097152000] [T: 2:15:57] [ETA: 0:31:53] [loss: 4.166] [tokens/s: 206463.686] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:50:54][utils:57][INFO] [P: 82.00%] [S: 1719664640/2097152000] [T: 2:16:52] [ETA: 0:30:02] [loss: 4.178] [tokens/s: 226873.613] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:50:54][train:194][INFO] Running validation... +[2025-10-28 19:52:24][logger:171][INFO] [step: 1719664640] [val/train_token_count: 1719664640] [val/train_batch_count: 820] [val/train_flop_count: 0] [val/train_total_time: 8212.996] [val/train_update_time: 4574.181] [val/loss: 4.203] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.447] [val/val_tokens_per_second: 452860.995] [val/loss_avg_len_2048: 4.203] [val/perplexity_len_2048: 66.881] [val/loss_avg_len_1024: 4.233] [val/perplexity_len_1024: 68.921] [val/loss_avg_len_512: 4.287] [val/perplexity_len_512: 72.739] +[2025-10-28 19:53:20][utils:57][INFO] [P: 83.00%] [S: 1740636160/2097152000] [T: 2:19:19] [ETA: 0:28:32] [loss: 4.211] [tokens/s: 206466.983] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:54:16][utils:57][INFO] [P: 84.00%] [S: 1761607680/2097152000] [T: 2:20:15] [ETA: 0:26:42] [loss: 4.159] [tokens/s: 226881.555] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:54:16][train:194][INFO] Running validation... +[2025-10-28 19:55:46][logger:171][INFO] [step: 1761607680] [val/train_token_count: 1761607680] [val/train_batch_count: 840] [val/train_flop_count: 0] [val/train_total_time: 8415.157] [val/train_update_time: 4685.668] [val/loss: 4.197] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.465] [val/val_tokens_per_second: 452771.529] [val/loss_avg_len_2048: 4.197] [val/perplexity_len_2048: 66.470] [val/loss_avg_len_1024: 4.227] [val/perplexity_len_1024: 68.523] [val/loss_avg_len_512: 4.282] [val/perplexity_len_512: 72.362] +[2025-10-28 19:56:42][utils:57][INFO] [P: 85.00%] [S: 1782579200/2097152000] [T: 2:22:41] [ETA: 0:25:10] [loss: 4.228] [tokens/s: 206470.796] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:56:42][logger:171][INFO] [step: 1782579200] [train_eval/train_token_count: 1782579200] [train_eval/train_batch_count: 850] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 8561.476] [train_eval/train_update_time: 4741.406] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.194] [train_eval/perplexity_len_2048: 66.266] [train_eval/loss_avg_len_1024: 4.217] [train_eval/perplexity_len_1024: 67.816] [train_eval/loss_avg_len_512: 4.269] [train_eval/perplexity_len_512: 71.476] +[2025-10-28 19:57:38][utils:57][INFO] [P: 86.00%] [S: 1803550720/2097152000] [T: 2:23:37] [ETA: 0:23:22] [loss: 4.208] [tokens/s: 226877.197] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 19:57:38][train:194][INFO] Running validation... +[2025-10-28 19:59:08][logger:171][INFO] [step: 1803550720] [val/train_token_count: 1803550720] [val/train_batch_count: 860] [val/train_flop_count: 0] [val/train_total_time: 8617.329] [val/train_update_time: 4797.151] [val/loss: 4.192] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.459] [val/val_tokens_per_second: 452800.632] [val/loss_avg_len_2048: 4.192] [val/perplexity_len_2048: 66.143] [val/loss_avg_len_1024: 4.223] [val/perplexity_len_1024: 68.206] [val/loss_avg_len_512: 4.278] [val/perplexity_len_512: 72.068] +[2025-10-28 20:00:04][utils:57][INFO] [P: 87.00%] [S: 1824522240/2097152000] [T: 2:26:03] [ETA: 0:21:49] [loss: 4.153] [tokens/s: 206464.190] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 20:01:00][utils:57][INFO] [P: 88.00%] [S: 1845493760/2097152000] [T: 2:26:59] [ETA: 0:20:02] [loss: 4.171] [tokens/s: 226870.062] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 20:01:00][train:194][INFO] Running validation... +[2025-10-28 20:02:31][logger:171][INFO] [step: 1845493760] [val/train_token_count: 1845493760] [val/train_batch_count: 880] [val/train_flop_count: 0] [val/train_total_time: 8819.515] [val/train_update_time: 4908.649] [val/loss: 4.188] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.859] [val/val_tokens_per_second: 450806.261] [val/loss_avg_len_2048: 4.188] [val/perplexity_len_2048: 65.883] [val/loss_avg_len_1024: 4.219] [val/perplexity_len_1024: 67.948] [val/loss_avg_len_512: 4.274] [val/perplexity_len_512: 71.819] +[2025-10-28 20:03:27][utils:57][INFO] [P: 89.00%] [S: 1866465280/2097152000] [T: 2:29:26] [ETA: 0:18:28] [loss: 4.226] [tokens/s: 206367.168] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 20:04:23][utils:57][INFO] [P: 90.00%] [S: 1887436800/2097152000] [T: 2:30:22] [ETA: 0:16:42] [loss: 4.142] [tokens/s: 226851.795] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 20:04:23][logger:171][INFO] [step: 1887436800] [train_eval/train_token_count: 1887436800] [train_eval/train_batch_count: 900] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 9022.127] [train_eval/train_update_time: 5020.160] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.187] [train_eval/perplexity_len_2048: 65.827] [train_eval/loss_avg_len_1024: 4.214] [train_eval/perplexity_len_1024: 67.593] [train_eval/loss_avg_len_512: 4.267] [train_eval/perplexity_len_512: 71.341] +[2025-10-28 20:04:23][train:194][INFO] Running validation... +[2025-10-28 20:05:53][logger:171][INFO] [step: 1887436800] [val/train_token_count: 1887436800] [val/train_batch_count: 900] [val/train_flop_count: 0] [val/train_total_time: 9022.127] [val/train_update_time: 5020.160] [val/loss: 4.185] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.705] [val/val_tokens_per_second: 451573.310] [val/loss_avg_len_2048: 4.185] [val/perplexity_len_2048: 65.690] [val/loss_avg_len_1024: 4.216] [val/perplexity_len_1024: 67.765] [val/loss_avg_len_512: 4.272] [val/perplexity_len_512: 71.654] +[2025-10-28 20:05:53][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001887436800.pt... +[2025-10-28 20:05:54][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001887436800.pt. +[2025-10-28 20:05:54][logger:171][INFO] [step: 1887436800] [checkpoint/checkpoint_time: 0.440] +[2025-10-28 20:06:50][utils:57][INFO] [P: 91.00%] [S: 1908408320/2097152000] [T: 2:32:49] [ETA: 0:15:06] [loss: 4.176] [tokens/s: 206302.742] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 20:07:46][utils:57][INFO] [P: 92.00%] [S: 1929379840/2097152000] [T: 2:33:44] [ETA: 0:13:22] [loss: 4.205] [tokens/s: 226670.892] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 20:07:46][train:194][INFO] Running validation... +[2025-10-28 20:09:16][logger:171][INFO] [step: 1929379840] [val/train_token_count: 1929379840] [val/train_batch_count: 920] [val/train_flop_count: 0] [val/train_total_time: 9224.990] [val/train_update_time: 5131.661] [val/loss: 4.183] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.654] [val/val_tokens_per_second: 451825.741] [val/loss_avg_len_2048: 4.183] [val/perplexity_len_2048: 65.552] [val/loss_avg_len_1024: 4.214] [val/perplexity_len_1024: 67.634] [val/loss_avg_len_512: 4.270] [val/perplexity_len_512: 71.538] +[2025-10-28 20:10:12][utils:57][INFO] [P: 93.00%] [S: 1950351360/2097152000] [T: 2:36:11] [ETA: 0:11:45] [loss: 4.189] [tokens/s: 206250.526] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 20:11:08][utils:57][INFO] [P: 94.00%] [S: 1971322880/2097152000] [T: 2:37:07] [ETA: 0:10:01] [loss: 4.146] [tokens/s: 226618.395] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 20:11:08][train:194][INFO] Running validation... +[2025-10-28 20:12:39][logger:171][INFO] [step: 1971322880] [val/train_token_count: 1971322880] [val/train_batch_count: 940] [val/train_flop_count: 0] [val/train_total_time: 9427.380] [val/train_update_time: 5243.172] [val/loss: 4.182] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.673] [val/val_tokens_per_second: 451732.003] [val/loss_avg_len_2048: 4.182] [val/perplexity_len_2048: 65.464] [val/loss_avg_len_1024: 4.213] [val/perplexity_len_1024: 67.548] [val/loss_avg_len_512: 4.269] [val/perplexity_len_512: 71.453] +[2025-10-28 20:13:35][utils:57][INFO] [P: 95.00%] [S: 1992294400/2097152000] [T: 2:39:33] [ETA: 0:08:23] [loss: 4.169] [tokens/s: 206203.646] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 20:13:35][logger:171][INFO] [step: 1992294400] [train_eval/train_token_count: 1992294400] [train_eval/train_batch_count: 950] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 9573.927] [train_eval/train_update_time: 5298.921] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 4.175] [train_eval/perplexity_len_2048: 65.014] [train_eval/loss_avg_len_1024: 4.204] [train_eval/perplexity_len_1024: 66.984] [train_eval/loss_avg_len_512: 4.258] [train_eval/perplexity_len_512: 70.662] +[2025-10-28 20:14:30][utils:57][INFO] [P: 96.00%] [S: 2013265920/2097152000] [T: 2:40:29] [ETA: 0:06:41] [loss: 4.167] [tokens/s: 226555.965] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 20:14:30][train:194][INFO] Running validation... +[2025-10-28 20:16:01][logger:171][INFO] [step: 2013265920] [val/train_token_count: 2013265920] [val/train_batch_count: 960] [val/train_flop_count: 0] [val/train_total_time: 9629.799] [val/train_update_time: 5354.673] [val/loss: 4.181] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.827] [val/val_tokens_per_second: 450966.458] [val/loss_avg_len_2048: 4.181] [val/perplexity_len_2048: 65.413] [val/loss_avg_len_1024: 4.212] [val/perplexity_len_1024: 67.495] [val/loss_avg_len_512: 4.268] [val/perplexity_len_512: 71.400] +[2025-10-28 20:16:57][utils:57][INFO] [P: 97.00%] [S: 2034237440/2097152000] [T: 2:42:56] [ETA: 0:05:02] [loss: 4.191] [tokens/s: 206126.025] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 20:17:53][utils:57][INFO] [P: 98.00%] [S: 2055208960/2097152000] [T: 2:43:52] [ETA: 0:03:20] [loss: 4.167] [tokens/s: 226569.337] [batches/s: 0.108] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 20:17:53][train:194][INFO] Running validation... +[2025-10-28 20:19:24][logger:171][INFO] [step: 2055208960] [val/train_token_count: 2055208960] [val/train_batch_count: 980] [val/train_flop_count: 0] [val/train_total_time: 9832.331] [val/train_update_time: 5466.160] [val/loss: 4.180] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 90.653] [val/val_tokens_per_second: 451832.038] [val/loss_avg_len_2048: 4.180] [val/perplexity_len_2048: 65.391] [val/loss_avg_len_1024: 4.212] [val/perplexity_len_1024: 67.476] [val/loss_avg_len_512: 4.268] [val/perplexity_len_512: 71.383] +[2025-10-28 20:19:24][train:854][INFO] Training finished with 2055208960 tokens! diff --git a/metrics/jsonlines/checkpoint.jsonl b/metrics/jsonlines/checkpoint.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..355ae15142ad0d7f681f75336bdc1cdfb6b9e230 --- /dev/null +++ b/metrics/jsonlines/checkpoint.jsonl @@ -0,0 +1,9 @@ +{"step": 209715200, "checkpoint/checkpoint_time": 0.45219888899009675} +{"step": 419430400, "checkpoint/checkpoint_time": 0.44284954003524035} +{"step": 629145600, "checkpoint/checkpoint_time": 0.4364313690457493} +{"step": 838860800, "checkpoint/checkpoint_time": 0.44323001499287784} +{"step": 1048576000, "checkpoint/checkpoint_time": 0.43819961103145033} +{"step": 1258291200, "checkpoint/checkpoint_time": 0.4460486189927906} +{"step": 1468006400, "checkpoint/checkpoint_time": 0.4434796510031447} +{"step": 1677721600, "checkpoint/checkpoint_time": 0.4459765850333497} +{"step": 1887436800, "checkpoint/checkpoint_time": 0.439551091985777} diff --git a/metrics/jsonlines/model_info.jsonl b/metrics/jsonlines/model_info.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..426e245b66edf44c30ffa81a32ba5bfb97c32b19 --- /dev/null +++ b/metrics/jsonlines/model_info.jsonl @@ -0,0 +1 @@ +{"step": 0, "model_info/total_params": 27447040, "model_info/trainable_params": 27447040, "model_info/embedding_params": 12870912, "model_info/flops_per_token": 0, "model_info/non_embedding_params": 14576128} diff --git a/metrics/jsonlines/norm.jsonl b/metrics/jsonlines/norm.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..715439672feeb005843df4b1340bd64957cf2837 --- /dev/null +++ b/metrics/jsonlines/norm.jsonl @@ -0,0 +1,98 @@ +{"step": 20971520, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 71.96881103515625, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.04931541904807091, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.992536544799805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0013542291708290577, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 5.17136812210083, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.005112924613058567, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 5.1492719650268555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.005361475981771946, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.148346900939941, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.035695917904376984, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.15713357925415, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.04859045520424843, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.987136840820312, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0012968636583536863, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 12.615348815917969, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.04289032518863678, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 8.929548263549805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.04455175995826721, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.001537322998047, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.001499427598901093, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.157523155212402, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.002494624350219965, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.185312747955322, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.0025095785968005657, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.166274547576904, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.07670677453279495, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.143224239349365, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.16745999455451965, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.005922317504883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0025546816177666187, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 12.627293586730957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.06328149139881134, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 8.90603256225586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07561039924621582, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.017423629760742, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.0626574233174324, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 72.25432586669922, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.9708001613616943} +{"step": 41943040, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 73.00873565673828, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.04540559649467468, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.99699592590332, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.001184971770271659, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 5.4577507972717285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.0073115890845656395, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 5.446045875549316, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.0047165765427052975, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.2812819480896, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.02422620914876461, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.282985687255859, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.02803080715239048, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.979214668273926, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0007764737820252776, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 12.957905769348145, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.021229157224297523, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.17586898803711, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.02205784060060978, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.006433486938477, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0005142858135513961, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.340756416320801, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.0021930222865194082, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.370674133300781, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.002266782335937023, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.303278923034668, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.01713688112795353, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.2848005294799805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.039098870009183884, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.983872413635254, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0012423351872712374, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 12.90445613861084, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.02115527354180813, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.110078811645508, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.02893497608602047, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.15167236328125, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.07554987818002701, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 80.06192016601562, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.9559194445610046} +{"step": 62914560, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 74.06412506103516, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.14570799469947815, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.025676727294922, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006497068330645561, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 5.7369303703308105, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.024981895461678505, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 5.785250663757324, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.014619163237512112, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.387188911437988, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.14279860258102417, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.378757476806641, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20343703031539917, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.988591194152832, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.012657159008085728, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.171303749084473, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.26851335167884827, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.33739948272705, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.24685905873775482, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.001752853393555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.012380953878164291, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.451322555541992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01307487953454256, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.480711936950684, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.012583089992403984, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.373332977294922, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16855405271053314, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.365622520446777, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.1301959902048111, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.949563980102539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0034700720570981503, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.050446510314941, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10003197193145752, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.23080062866211, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07676425576210022, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.297536849975586, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.01719949021935463, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 89.5250473022461, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.21638067066669464} +{"step": 83886080, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 75.31285858154297, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.16364356875419617, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.04182243347168, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007253430318087339, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 5.847330093383789, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03726540133357048, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 5.9215497970581055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02584674209356308, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.4670538902282715, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.15083353221416473, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.449717044830322, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.15970012545585632, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.002822875976562, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.00822343397885561, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.299022674560547, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1287170946598053, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.436797142028809, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13184413313865662, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.990812301635742, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0060913944616913795, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.475803375244141, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.017078761011362076, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.49263858795166, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.018205078318715096, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.390185356140137, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.12095610797405243, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.391383647918701, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09397519379854202, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.919920921325684, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004322127439081669, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.12956714630127, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.08771936595439911, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.288615226745605, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.058143895119428635, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.415796279907227, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.020636754110455513, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 96.9266357421875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.17376863956451416} +{"step": 104857600, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 76.7607650756836, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1095752939581871, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.053104400634766, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004911662545055151, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 5.933731555938721, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.023956341668963432, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.015653133392334, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.018695170059800148, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.532196998596191, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.10146733373403549, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.510896682739258, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.10437212139368057, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.010652542114258, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004242561757564545, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.401557922363281, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.07276659458875656, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.518556594848633, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.07721903175115585, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.983316421508789, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0033208606764674187, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.494424343109131, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01155451126396656, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.504161357879639, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.009464182890951633, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.402824401855469, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.07152801752090454, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.411536693572998, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.050488363951444626, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.890852928161621, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0020276005379855633, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.18061351776123, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.0522647462785244, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.323259353637695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.03321206569671631, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.52654457092285, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.025653725489974022, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 102.65660095214844, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.1207268089056015} +{"step": 125829120, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 78.2613525390625, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1533873975276947, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.062118530273438, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006123389583081007, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.019736289978027, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02470112219452858, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.107110023498535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02203185297548771, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.577709674835205, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.14341706037521362, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.553047180175781, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1553446650505066, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.018526077270508, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.007666276767849922, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.500574111938477, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.13624204695224762, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.597522735595703, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.15344123542308807, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.97288990020752, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.006172170862555504, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.505858898162842, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01544086541980505, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.511133670806885, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.018410345539450645, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.403059959411621, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.11929198354482651, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.416811943054199, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0796818733215332, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.878570556640625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.005289930384606123, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.239178657531738, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.111038938164711, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.36574935913086, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05920734256505966, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.641565322875977, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.01901111751794815, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 107.41859436035156, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.09446175396442413} +{"step": 146800640, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 79.72206115722656, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.11071759462356567, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.070409774780273, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.00511866994202137, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.100142002105713, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.025590041652321815, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.1934614181518555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.023026639595627785, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.612696647644043, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.11540091782808304, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.586658477783203, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.12160779535770416, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.023832321166992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004102237522602081, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.595293998718262, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.09045690298080444, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.668807983398438, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10221938043832779, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.965648651123047, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0038313197437673807, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.532318592071533, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.010848313570022583, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.531759738922119, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.012374603189527988, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.403212547302246, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.08556545525789261, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.421205997467041, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.061869893223047256, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.87309455871582, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004568798467516899, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.308019638061523, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10096898674964905, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.412480354309082, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.051755812019109726, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.764677047729492, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.022108783945441246, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 111.91532135009766, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07244172692298889} +{"step": 167772160, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 81.1129379272461, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1796851009130478, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.0781192779541, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005429583135992289, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.170097827911377, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04483795538544655, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.267185688018799, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04343349486589432, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.639550685882568, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.13748100399971008, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.614415168762207, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.15817059576511383, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.024372100830078, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005865232553333044, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.6838960647583, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.11245613545179367, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.737776756286621, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09309174865484238, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.963200569152832, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003122922731563449, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.58037805557251, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.0162676814943552, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.572727203369141, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.024125099182128906, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.404136657714844, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.0779220387339592, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.427477836608887, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.05740904062986374, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.870956420898438, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0028062262572348118, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.386609077453613, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.056244973093271255, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.468461990356445, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.036231763660907745, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.89800262451172, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.03039810247719288, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 116.54136657714844, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.11136391013860703} +{"step": 188743680, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 82.40432739257812, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.20313775539398193, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.087766647338867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006288954988121986, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.233892440795898, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04738529771566391, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.335125923156738, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.05019400641322136, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.661988258361816, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.15726734697818756, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.637311935424805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.16753843426704407, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.02113151550293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004120918922126293, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.753386497497559, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.11086106300354004, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.791796684265137, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09202718734741211, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.962051391601562, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.002291247481480241, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.618758201599121, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.017351452261209488, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.608297824859619, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.023433219641447067, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.407563209533691, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.06660616397857666, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.4361653327941895, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.06274056434631348, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.87038803100586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004131942521780729, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.462454795837402, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.09324252605438232, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.52087116241455, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.04682634398341179, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.03158950805664, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.02452894113957882, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 121.200439453125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.09074628353118896} +{"step": 209715200, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 83.5911865234375, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1453482061624527, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.096412658691406, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007312366738915443, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.288326740264893, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.038505759090185165, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.391077041625977, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.036430198699235916, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.6792426109313965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1797201931476593, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.654862880706787, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1823611557483673, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.017803192138672, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005555106792598963, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.815157890319824, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.10936065763235092, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.835993766784668, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10063432157039642, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.958380699157715, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0035226098261773586, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.645070552825928, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.015789225697517395, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.633759021759033, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.02194344997406006, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4094953536987305, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.08977562189102173, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.441890239715576, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.085102379322052, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.874677658081055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004627760034054518, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.550374031066895, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.09385762363672256, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.57552719116211, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0534711517393589, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.163379669189453, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.019790584221482277, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 125.79393005371094, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07696204632520676} +{"step": 230686720, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 84.6982421875, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.24278350174427032, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.103723526000977, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.008567415177822113, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.339325428009033, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.050840314477682114, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.4431328773498535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.05066298693418503, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.690237522125244, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2569338381290436, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.66634464263916, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2869552671909332, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.01541519165039, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.009857721626758575, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.874824523925781, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.17908364534378052, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.874707221984863, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.19347873330116272, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.95538330078125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.008452638052403927, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.671886444091797, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.04281783476471901, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.658700466156006, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.06399498879909515, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.412990093231201, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.21892395615577698, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.448991775512695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.1554325670003891, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.879461288452148, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.010708083398640156, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.636224746704102, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.25918641686439514, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.627909660339355, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.13216489553451538, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.296672821044922, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.014954748563468456, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 130.3278350830078, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.1297391802072525} +{"step": 251658240, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 85.73194885253906, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.25144127011299133, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.114240646362305, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007921196520328522, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.396145343780518, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.061982233077287674, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.4991631507873535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.06626999378204346, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.70429801940918, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.22427751123905182, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.680352687835693, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.22564180195331573, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.013357162475586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.006179012358188629, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.926775932312012, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.13036735355854034, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.904118537902832, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10914716124534607, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.953721046447754, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004064115695655346, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.700965404510498, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02171570621430874, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.68501091003418, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03485000878572464, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.416916370391846, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.11508917063474655, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.454395294189453, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0851665511727333, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.883078575134277, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.00388674926944077, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.710719108581543, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.0944691002368927, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.668289184570312, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.054243650287389755, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.426307678222656, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.02101185731589794, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 134.70834350585938, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07191072404384613} +{"step": 272629760, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 86.70880126953125, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1455005258321762, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.121713638305664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005037820432335138, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.4493794441223145, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.027906179428100586, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.550784587860107, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.029100481420755386, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.7121405601501465, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1604865938425064, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.689065933227539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1711036115884781, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.009763717651367, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004334672819823027, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.971251487731934, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.10789415240287781, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.927563667297363, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09312722831964493, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.95372200012207, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.002948329783976078, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.739963054656982, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.014666754752397537, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.719630718231201, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.020532013848423958, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.420926094055176, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.09444110095500946, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.4598002433776855, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.06923374533653259, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.892605781555176, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0038320834282785654, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.79763412475586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.07700145244598389, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.71834659576416, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05152285844087601, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.560956954956055, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.02319270931184292, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 139.0189971923828, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06287340819835663} +{"step": 293601280, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 87.63658142089844, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.22617410123348236, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.130027770996094, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.00787591002881527, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.505527019500732, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.048333484679460526, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.604154109954834, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.062389228492975235, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.718601226806641, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.21073172986507416, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.696670055389404, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.22032512724399567, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.007959365844727, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.007029002998024225, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.01451587677002, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14386828243732452, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.95072078704834, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12195581942796707, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.954179763793945, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004385112784802914, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.77263879776001, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.021794306114315987, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.7480268478393555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.026114700362086296, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.426813125610352, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1299847662448883, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.465837001800537, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09726642817258835, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.899656295776367, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004842208698391914, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.875143051147461, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.11108451336622238, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.76108169555664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0743962973356247, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.692630767822266, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.026970671489834785, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 143.24365234375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.09441133588552475} +{"step": 314572800, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 88.52363586425781, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.14518576860427856, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.13727569580078, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005504499189555645, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.559789657592773, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02844421938061714, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.6528706550598145, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03473065048456192, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.721526145935059, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.15911899507045746, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.701378345489502, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1772950440645218, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.004396438598633, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004317618906497955, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.05009937286377, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1160016730427742, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.968083381652832, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09560858458280563, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.956204414367676, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0043249716982245445, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.809760093688965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.024099618196487427, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.779805660247803, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04022610932588577, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.431701183319092, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.11262813210487366, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.47166633605957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07552766054868698, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.908373832702637, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0030005378648638725, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.95205020904541, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.07858195900917053, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.802312850952148, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05145828425884247, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.825944900512695, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.018087036907672882, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 147.37281799316406, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05697784945368767} +{"step": 335544320, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 89.37433624267578, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.28467684984207153, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.143476486206055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.012214032001793385, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.615607261657715, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.051451995968818665, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.703514099121094, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.055474020540714264, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.72066593170166, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.34767815470695496, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.702209949493408, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.3733818531036377, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.001157760620117, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.011805770918726921, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.083739280700684, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.22624018788337708, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.984454154968262, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.238439679145813, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.958950996398926, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.010555370710790157, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.853730201721191, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.059041693806648254, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.8161211013793945, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.08985928446054459, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.435088634490967, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.29597005248069763, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.476139545440674, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.18294748663902283, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.920849800109863, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.013887499459087849, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.034357070922852, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.33550727367401123, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.847965240478516, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.21444472670555115, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.961162567138672, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.012999589554965496, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 151.40452575683594, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.15780587494373322} +{"step": 356515840, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 90.18443298339844, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.14825716614723206, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.15260124206543, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0055515156127512455, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.673725605010986, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03618498519062996, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.7537736892700195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04004652425646782, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.723803997039795, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1770779937505722, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.70619535446167, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1781948059797287, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.998382568359375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0037475263234227896, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.1156587600708, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.11085280776023865, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.999588966369629, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09365847706794739, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.962311744689941, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0032168834004551172, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.896274566650391, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.014687265269458294, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.852324485778809, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.02135533094406128, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.439433574676514, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1041664108633995, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.480526924133301, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0737786591053009, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.925668716430664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0025748719926923513, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.096073150634766, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.07248403131961823, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.878215789794922, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.04901432618498802, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.09047508239746, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.01878253184258938, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 155.2642822265625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.062363769859075546} +{"step": 377487360, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 90.9585952758789, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.2328650802373886, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.15746307373047, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.008833866566419601, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.724483013153076, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04813268408179283, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.797143936157227, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.061777375638484955, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.7211761474609375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.24778306484222412, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.704392910003662, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.27277901768684387, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.994621276855469, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0062265037558972836, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.14212417602539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.17500121891498566, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.0114107131958, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.16883039474487305, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.967443466186523, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.007503010798245668, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.948249816894531, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.047980908304452896, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.895962715148926, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.08297993987798691, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.442494869232178, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.22912399470806122, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.484551906585693, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.11802435666322708, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.936209678649902, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.008192856796085835, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.16471004486084, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.2107938528060913, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.91453742980957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.13764220476150513, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.222713470458984, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.013275728560984135, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 158.9774169921875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.12194360047578812} +{"step": 398458880, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 91.69650268554688, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.2201758772134781, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.164318084716797, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0075109489262104034, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.779774188995361, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.05122173950076103, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.844388961791992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.05757281556725502, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.718875885009766, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.24322636425495148, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.70241641998291, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.23457999527454376, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.991876602172852, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0059088910929858685, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.168084144592285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1468929499387741, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.023483276367188, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13976360857486725, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.971948623657227, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005393619649112225, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.997969150543213, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.019937748089432716, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.93637752532959, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.029993396252393723, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.444275856018066, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16872011125087738, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.4868974685668945, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09849688410758972, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.945151329040527, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.005058852024376392, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.223868370056152, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12786762416362762, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.945886611938477, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08843729645013809, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.353605270385742, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.012491824105381966, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 162.53453063964844, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06327365338802338} +{"step": 419430400, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 92.39932250976562, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1680549830198288, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.169025421142578, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006852291990071535, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.830728530883789, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.033888641744852066, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.886474132537842, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03885461390018463, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.714014530181885, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.22505778074264526, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.698148250579834, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.239058256149292, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.988059043884277, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005139353685081005, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.189455032348633, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16138644516468048, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.033190727233887, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1449490189552307, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.978510856628418, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0065771108493208885, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.056424617767334, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.036378730088472366, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.983146667480469, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.05544789135456085, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.446536064147949, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1946994960308075, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.490187168121338, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10826177150011063, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.956576347351074, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.007103135343641043, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.285435676574707, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.18758662045001984, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.978485107421875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.11872214078903198, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.484943389892578, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.011321219615638256, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 165.94651794433594, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.08833885937929153} +{"step": 440401920, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 93.06600952148438, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.2010613977909088, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.175485610961914, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006980763748288155, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.88422155380249, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04449654743075371, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.930418491363525, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.05218619480729103, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.711917877197266, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2103203386068344, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.696521759033203, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20228780806064606, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.98564624786377, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004274958278983831, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.212506294250488, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.13061347603797913, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.043774604797363, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1084963008761406, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.984313011169434, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003685127245262265, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.110601902008057, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.018817763775587082, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.025972843170166, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03080081380903721, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.44865608215332, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.11889065057039261, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.4928436279296875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07778062671422958, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.964630126953125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0031336834654212, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.339262962341309, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.08841850608587265, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.00547981262207, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06114357337355614, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.610069274902344, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.015161347575485706, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 169.17222595214844, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.063273586332798} +{"step": 461373440, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 93.70323181152344, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.16142737865447998, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.17934799194336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006299309432506561, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.930805683135986, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03951982036232948, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.967745304107666, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04422302916646004, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.706854343414307, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1967434287071228, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.692520618438721, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20353470742702484, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.981207847595215, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003911141771823168, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.23015022277832, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.13226190209388733, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.051820755004883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1139034628868103, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.990880012512207, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004099501296877861, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.1686882972717285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.021554479375481606, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.071751117706299, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03163975104689598, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.450058937072754, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1376882940530777, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.494927406311035, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08997339010238647, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.97465991973877, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0038366089574992657, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.392858505249023, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10957576334476471, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.034111976623535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0725448876619339, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.736526489257812, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.015224740840494633, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 172.2420654296875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06378278881311417} +{"step": 482344960, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 94.31307983398438, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.21966829895973206, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.184221267700195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0075301178731024265, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.981439113616943, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04017576947808266, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.009094715118408, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.05636826902627945, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.701688289642334, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.24584785103797913, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.687946319580078, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.24552220106124878, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.977657318115234, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.00653364323079586, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.24791431427002, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15983892977237701, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.060454368591309, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13897688686847687, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.998174667358398, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005133699160069227, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.230320453643799, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.0313861221075058, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.11881685256958, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04820042476058006, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.451635360717773, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.19189713895320892, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.497639179229736, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10846739262342453, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.983004570007324, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.007082642987370491, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.442118644714355, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.18217507004737854, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.0597562789917, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.12372582405805588, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.861595153808594, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.011241644620895386, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 175.1822052001953, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.08617229014635086} +{"step": 503316480, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 94.89573669433594, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1712370067834854, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.188575744628906, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006607190705835819, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.028829097747803, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03527326509356499, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.047359466552734, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.047497935593128204, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.6965508460998535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2028111070394516, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.683154582977295, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20247499644756317, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.97277545928955, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004316946491599083, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.262228012084961, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1328960508108139, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.066876411437988, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10702762752771378, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.005638122558594, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003801940707489848, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.292390823364258, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02974044345319271, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.165397644042969, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04228688031435013, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.452298641204834, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.13455672562122345, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.499255180358887, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08698280155658722, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.991625785827637, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004058151971548796, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.488505363464355, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.11285441368818283, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.084967613220215, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07706204056739807, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.985727310180664, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.01582559011876583, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 177.9745330810547, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0727604329586029} +{"step": 524288000, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 95.45271301269531, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.23732990026474, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.191099166870117, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.00825822725892067, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.073116302490234, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04826023429632187, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.081705570220947, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.07070185244083405, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.689655780792236, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2775220572948456, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.677095890045166, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.29090866446495056, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.96892261505127, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0076284026727080345, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.276958465576172, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.18994402885437012, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.073484420776367, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1759869009256363, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.014497756958008, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.007713458966463804, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.3590593338012695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.04155610129237175, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.214974403381348, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.06394924968481064, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.453242301940918, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.25872695446014404, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.501589298248291, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.15370085835456848, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.999932289123535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.009496084414422512, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.533647537231445, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.24939294159412384, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.108856201171875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.18772470951080322, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.109777450561523, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.009036057628691196, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 180.6318359375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.11212196201086044} +{"step": 545259520, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 95.98651123046875, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.20254817605018616, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.19583511352539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006716694217175245, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.122664451599121, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04492424428462982, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.1211981773376465, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.052827537059783936, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.6850104331970215, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.21279770135879517, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.672711372375488, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.23539121448993683, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.964903831481934, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004528492223471403, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.29043960571289, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16126446425914764, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.079767227172852, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1313309371471405, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.02295684814453, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0048791044391691685, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.420499801635742, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.03336570784449577, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.258930206298828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.05470510944724083, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.455070972442627, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.17101222276687622, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.503600120544434, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10420992225408554, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.005474090576172, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.00420422712340951, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.569985389709473, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12240609526634216, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.127154350280762, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07970206439495087, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.228233337402344, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.01466356497257948, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 183.1226348876953, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.08560989797115326} +{"step": 566231040, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 96.49903106689453, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.13120995461940765, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.1982421875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005079254508018494, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.1641387939453125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.025846628472208977, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.153404712677002, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03033231943845749, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.678544044494629, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1640319973230362, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.666733264923096, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.17138276994228363, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.960182189941406, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0037279766984283924, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.301491737365723, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.11465294659137726, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.084800720214844, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09740757942199707, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.032527923583984, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0034293150529265404, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.488356113433838, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01841432973742485, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.307281494140625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.030126623809337616, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.455514430999756, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.12164194881916046, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.505432605743408, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08276025950908661, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.012414932250977, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0035163930151611567, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.606804847717285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10212044417858124, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.146543502807617, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07980071008205414, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.34813690185547, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.016239671036601067, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 185.50848388671875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06167973577976227} +{"step": 587202560, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 96.99108123779297, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.18574506044387817, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.201723098754883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007745443377643824, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.208592414855957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03852919861674309, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.188491344451904, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04602735862135887, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.672460079193115, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2550320029258728, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.660647392272949, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.24593262374401093, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.956066131591797, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0059293475933372974, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.313240051269531, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1650962084531784, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.089727401733398, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13452410697937012, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.042007446289062, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.00515463063493371, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.555483818054199, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02539820596575737, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.354790687561035, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03896045312285423, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.456128120422363, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1599791795015335, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.506613254547119, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10217062383890152, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.01720428466797, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.005605583544820547, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.638766288757324, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.147275909781456, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.162490844726562, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.11143698543310165, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.466142654418945, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.016151513904333115, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 187.7777099609375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07068943977355957} +{"step": 608174080, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 97.4599609375, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.17528429627418518, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.202938079833984, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006035146303474903, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.2470598220825195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.036417338997125626, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.2176408767700195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.043420713394880295, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.665144920349121, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.20559710264205933, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.6532673835754395, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20689933001995087, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.951339721679688, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0038451727014034986, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.323660850524902, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1408054083585739, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.094762802124023, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1172800064086914, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.052467346191406, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004621406551450491, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.625240802764893, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.027776598930358887, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.403381824493408, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04674037918448448, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.456311225891113, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16201691329479218, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.507902145385742, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09898333996534348, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.022104263305664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.005596274044364691, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.670418739318848, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1525784134864807, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.179323196411133, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09950178861618042, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.58246421813965, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.00944497436285019, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 189.92689514160156, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06719357520341873} +{"step": 629145600, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 97.90754699707031, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.23168328404426575, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.20389175415039, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007820853032171726, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.285237789154053, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.05381527915596962, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.247743129730225, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.06384272128343582, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.657105922698975, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.28905582427978516, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.6454386711120605, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2862836420536041, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.946763038635254, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.006367848254740238, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.332715034484863, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1960151344537735, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.0990571975708, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1596374362707138, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.065670013427734, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.006658770143985748, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.703982353210449, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02735951542854309, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.4577131271362305, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04526437446475029, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.45742654800415, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.20902396738529205, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.511232376098633, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.12738649547100067, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.026891708374023, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.007890776731073856, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.700179100036621, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.21689678728580475, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.194891929626465, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.15271611511707306, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.698524475097656, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008252336643636227, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 191.9667205810547, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.08972136676311493} +{"step": 650117120, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 98.33528137207031, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1776426136493683, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.205472946166992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005951357539743185, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.322971343994141, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.036454953253269196, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.277576923370361, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04310980439186096, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.649623394012451, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.22590728104114532, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.637854099273682, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.22840604186058044, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.94194507598877, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004632596392184496, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.340765953063965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1606854647397995, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.102707862854004, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12634794414043427, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.07936668395996, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004855686333030462, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.786111831665039, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.024119826033711433, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.513904094696045, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04213641211390495, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.458155632019043, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16831597685813904, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.513454914093018, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.1012340635061264, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.030269622802734, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004544335883110762, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.724579811096191, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.13378988206386566, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.20791244506836, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09025018662214279, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.810543060302734, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.01207301951944828, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 193.8886260986328, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06658131629228592} +{"step": 671088640, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 98.74535369873047, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.21220876276493073, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.206472396850586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007842455990612507, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.360204696655273, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.047982487827539444, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.306904315948486, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.05394575372338295, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.642663478851318, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2386706918478012, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.6307454109191895, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.23097094893455505, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.937173843383789, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003964495845139027, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.348780632019043, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14960020780563354, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.106695175170898, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12372305989265442, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.092185974121094, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004595243837684393, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.864194393157959, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02176246978342533, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.566206455230713, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.038610927760601044, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4593329429626465, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1545010209083557, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.515524864196777, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09666804224252701, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.033048629760742, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004512446466833353, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.747512817382812, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.13066284358501434, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.219199180603027, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0924314633011818, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.919540405273438, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.011920412071049213, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 195.7066650390625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0758344829082489} +{"step": 692060160, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 99.13819122314453, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.16480866074562073, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.205385208129883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005713857710361481, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.389603137969971, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.039165619760751724, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.329729080200195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.0483960323035717, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.6344733238220215, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.20255163311958313, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.622137069702148, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.21132907271385193, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.932127952575684, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003926005680114031, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.35527229309082, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1409759372472763, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.109402656555176, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10845402628183365, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.10645866394043, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004115222953259945, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.946592330932617, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.019540106877684593, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.621492862701416, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03342060372233391, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.459921360015869, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.14696213603019714, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.517569065093994, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09254217147827148, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.03496551513672, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004483784083276987, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.766534805297852, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.11876687407493591, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.229175567626953, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07887900620698929, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.028221130371094, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.01205113809555769, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 197.4371795654297, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05460026487708092} +{"step": 713031680, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 99.51319885253906, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.18156005442142487, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.2039737701416, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007217529229819775, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.417907238006592, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04034535959362984, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.3519606590271, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.05677058920264244, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.626208782196045, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2515285313129425, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.613320350646973, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.23776087164878845, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.927010536193848, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005202191881835461, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.361067771911621, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16774038970470428, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.11195182800293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13023710250854492, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.121057510375977, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004251493606716394, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.030284881591797, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02425825409591198, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.677682399749756, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03510954603552818, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.460475921630859, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1628333479166031, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.519660949707031, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10466677695512772, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.037330627441406, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.005620517767965794, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.785842895507812, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.14863984286785126, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.2393798828125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.11219428479671478, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.134916305541992, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.012791263870894909, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 199.0831756591797, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06641265004873276} +{"step": 734003200, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 99.87018585205078, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.15414318442344666, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.202232360839844, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005236152559518814, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.446627140045166, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.034370120614767075, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.374228000640869, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04010559991002083, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.617194175720215, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1951070874929428, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.603790760040283, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20991767942905426, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.922125816345215, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.00451241061091423, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.366265296936035, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15145112574100494, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.114112854003906, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12586161494255066, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.13711166381836, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005033711437135935, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.1160173416137695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.029087111353874207, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.73490571975708, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.05205628275871277, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.461193084716797, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.18206383287906647, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.522063255310059, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10155154764652252, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.039077758789062, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0065499660558998585, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.80341625213623, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.16883361339569092, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.249201774597168, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.12374221533536911, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.24036407470703, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007326742634177208, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 200.64645385742188, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06610972434282303} +{"step": 754974720, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 100.21111297607422, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1994282752275467, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.202030181884766, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007403017021715641, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.4774699211120605, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04116962105035782, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.398448467254639, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04862511530518532, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.609851360321045, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2531968951225281, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.595785617828369, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2596298158168793, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.917258262634277, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005414800252765417, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.371621131896973, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1845332533121109, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.116500854492188, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.14304229617118835, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.15176010131836, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0052700042724609375, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.196414470672607, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.024536311626434326, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.788259029388428, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04325935244560242, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.462017059326172, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.198526069521904, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.524219512939453, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.11686907708644867, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.038528442382812, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004949215333908796, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.816523551940918, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.13513469696044922, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.255069732666016, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0891907662153244, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.339950561523438, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.011903032660484314, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 202.1190185546875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07267915457487106} +{"step": 775946240, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 100.53730010986328, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.18191273510456085, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.199987411499023, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0061425319872796535, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.503777027130127, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03678392618894577, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.4182448387146, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04606831073760986, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.6021246910095215, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2266475260257721, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.587444305419922, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.24288441240787506, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.911797523498535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005294105038046837, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.374917030334473, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.17180593311786652, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.117588996887207, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13436844944953918, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.167869567871094, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004653654992580414, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.27996301651001, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.026224210858345032, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.8435587882995605, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04749156907200813, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.463163375854492, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.17080333828926086, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.527059555053711, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10656115412712097, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.03856086730957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004303749185055494, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.829835891723633, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1288864016532898, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.261215209960938, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09293518960475922, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.438434600830078, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.01088557206094265, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 203.5233154296875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0541360042989254} +{"step": 796917760, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 100.84901428222656, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.22234544157981873, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.198169708251953, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007197101600468159, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.53064489364624, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04752286151051521, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.439026832580566, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.06545405089855194, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.593440532684326, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.23343104124069214, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.577912330627441, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.23950088024139404, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.90588665008545, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004504786804318428, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.37661361694336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16068947315216064, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.118048667907715, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12627695500850677, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.18511962890625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005279108416289091, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.3682684898376465, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.0421442836523056, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.902781009674072, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.06436177343130112, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.464034557342529, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1843179613351822, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.529521465301514, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.11687987297773361, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.039731979370117, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004411763045936823, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.843725204467773, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1388808935880661, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.268006324768066, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.10408314317464828, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.537382125854492, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.013428233563899994, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 204.86802673339844, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06402511149644852} +{"step": 817889280, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 101.14525604248047, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1992584466934204, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.19477081298828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006387530360370874, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.553878307342529, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.05031290650367737, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.456765174865723, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.05592066049575806, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.584022521972656, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.22543981671333313, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.567984580993652, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.24309687316417694, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.901103973388672, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005498434416949749, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.380335807800293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16845422983169556, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.119680404663086, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1252947300672531, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.202411651611328, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005202039610594511, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.454914569854736, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02424369752407074, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.960832595825195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.047481708228588104, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4649457931518555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.18174861371517181, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.531742572784424, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09331729263067245, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.03990936279297, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004023337736725807, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.85560131072998, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12000054866075516, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.27381420135498, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08019442856311798, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.63381004333496, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.010385950095951557, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 206.1508331298828, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06657548993825912} +{"step": 838860800, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 101.42556762695312, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1618088185787201, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.190109252929688, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005978195462375879, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.5727338790893555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.037526462227106094, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.47122859954834, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04670233651995659, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.574620246887207, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.21377038955688477, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.557860851287842, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.21726453304290771, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.895434379577637, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0042589278891682625, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.38162612915039, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15382103621959686, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.120190620422363, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11992569267749786, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.22076416015625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005119440145790577, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.545126438140869, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.025165537372231483, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.020203590393066, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04395734518766403, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.466522693634033, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.17170508205890656, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.534902095794678, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10213257372379303, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.040090560913086, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004042539279907942, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.866397857666016, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.11835458874702454, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.27926254272461, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08334948867559433, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.72886848449707, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.011318616569042206, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 207.36668395996094, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06178222596645355} +{"step": 859832320, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 101.69270324707031, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.17803464829921722, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.18671989440918, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007065081037580967, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.595113277435303, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03640790656208992, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.487961292266846, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04753084480762482, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.565410614013672, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.23360566794872284, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.5477447509765625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.24192413687705994, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.890655517578125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005586179904639721, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.384032249450684, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.17019744217395782, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.121166229248047, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13272099196910858, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.23860740661621, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005592389963567257, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.634299278259277, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.024011045694351196, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.078557014465332, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.041660625487565994, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.467334747314453, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.18646465241909027, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.537023067474365, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10987342894077301, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.038558959960938, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0052468073554337025, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.873376846313477, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.14815755188465118, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.283266067504883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.10451207309961319, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.821975708007812, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.012729720212519169, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 208.52159118652344, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06683726608753204} +{"step": 880803840, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 101.94566345214844, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.17148582637310028, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.181699752807617, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005034768022596836, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.612502098083496, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.036041345447301865, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.5008063316345215, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.042978063225746155, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.556115627288818, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.21837274730205536, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.537447929382324, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2310246080160141, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.885574340820312, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004788029007613659, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.385499954223633, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1705721765756607, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.121695518493652, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13514713943004608, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.2573184967041, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005185150075703859, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.726024627685547, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.030202753841876984, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.138467311859131, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.05814516916871071, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4681315422058105, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.19129475951194763, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.53985595703125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10376802086830139, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.037891387939453, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.00719391880556941, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.881969451904297, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.17062407732009888, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.28762435913086, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.1325332373380661, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.912134170532227, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007630117703229189, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 209.62448120117188, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07069630175828934} +{"step": 901775360, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 102.18523406982422, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.14616721868515015, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.17624855041504, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005950727965682745, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.6287455558776855, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03201168403029442, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.512566089630127, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.038528699427843094, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.547257900238037, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2037021815776825, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.527703285217285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.21330244839191437, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.8806791305542, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003990343306213617, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.386221885681152, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1521722674369812, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.121869087219238, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11724676191806793, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.27541732788086, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004244672600179911, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.814859867095947, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.021446209400892258, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.195769309997559, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03439995273947716, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.468417644500732, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.15713351964950562, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.542151927947998, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09732574224472046, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.03639793395996, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004190485924482346, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.887951850891113, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.11503174901008606, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.290661811828613, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08129554986953735, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.999858856201172, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.010054546408355236, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 210.66513061523438, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.055771008133888245} +{"step": 922746880, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 102.41321563720703, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.20159848034381866, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.171186447143555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.00654226029291749, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.647594928741455, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.043304916471242905, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.526176929473877, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.051734477281570435, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.538196563720703, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.24206972122192383, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.5180230140686035, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.23705808818340302, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.8746976852417, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004654767923057079, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.385666847229004, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15711326897144318, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.121122360229492, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12160080671310425, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.294466018676758, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004881387576460838, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.905802249908447, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02641061320900917, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.254460334777832, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04303108900785446, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.468944072723389, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1805441677570343, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.544395923614502, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0958058163523674, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.035293579101562, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.00451255589723587, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.894001960754395, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12083552777767181, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.29396915435791, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08566392958164215, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.086444854736328, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.010413268581032753, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 211.66525268554688, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04639901965856552} +{"step": 943718400, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 102.62932586669922, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.12783892452716827, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.1661319732666, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005790275055915117, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.665799140930176, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03181671351194382, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.539743900299072, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.038688067346811295, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.528384685516357, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.20084883272647858, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.507668972015381, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.21146811544895172, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.869917869567871, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004352687392383814, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.386152267456055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15656299889087677, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.121134757995605, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11731177568435669, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.3134765625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004508820362389088, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.994246006011963, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.021218178793787956, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.311313152313232, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.0385403148829937, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.469191074371338, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16056150197982788, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.546472072601318, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09963307529687881, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.033039093017578, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004741961602121592, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.897520065307617, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.13437621295452118, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.295130729675293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08767924457788467, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.170745849609375, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008474631235003471, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 212.61338806152344, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04943064972758293} +{"step": 964689920, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 102.83467102050781, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.15505686402320862, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.160465240478516, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005748676136136055, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.680539608001709, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.0377473384141922, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.550192356109619, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04405076801776886, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.51878547668457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2128905951976776, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.497228622436523, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2236844301223755, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.864997863769531, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0042264717631042, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.386139869689941, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16240070760250092, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.120826721191406, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12551476061344147, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.331953048706055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0047105285339057446, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.081048965454102, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02017843723297119, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.367440700531006, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03956109285354614, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.468840599060059, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16900712251663208, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.5474371910095215, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09448770433664322, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.031232833862305, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0035020275972783566, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.901659965515137, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10679303109645844, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.296720504760742, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07609548419713974, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.252513885498047, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.00999253150075674, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 213.52249145507812, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05698583647608757} +{"step": 985661440, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 103.02681732177734, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1875298172235489, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.154129028320312, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007581770420074463, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.6937174797058105, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03659927845001221, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.559556484222412, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04265153408050537, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.509443759918213, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.28663134574890137, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.487035274505615, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.30069637298583984, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.859521865844727, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.007112159859389067, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.38453483581543, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.2160167098045349, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.119847297668457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.16946014761924744, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.350675582885742, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.00699105579406023, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.16931438446045, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02159787155687809, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.424619197845459, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04477176070213318, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.469488620758057, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.2464676946401596, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.549327850341797, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.12839846312999725, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.02963638305664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.005915816407650709, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.90610122680664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.15702921152114868, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.298893928527832, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.11980563402175903, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.33165740966797, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.006284765899181366, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 214.3814239501953, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0706809014081955} +{"step": 1006632960, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 103.20845794677734, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.13009095191955566, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.147653579711914, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005505772307515144, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.705916881561279, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.028851762413978577, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.568160533905029, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.033731576055288315, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.50101900100708, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.19438450038433075, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.4776787757873535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20919181406497955, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.85429573059082, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.00439981184899807, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.38281536102295, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15046612918376923, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.118772506713867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11690819263458252, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.369050979614258, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004226426128298044, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.256143569946289, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01739308424293995, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.479678153991699, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03189969062805176, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.469901084899902, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.158890500664711, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.551083087921143, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09127039462327957, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.027677536010742, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0037571704015135765, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.908867835998535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.11994539201259613, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.299925804138184, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08525312691926956, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.408252716064453, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007354146800935268, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 215.19264221191406, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.053593892604112625} +{"step": 1027604480, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 103.37854766845703, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1432420164346695, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.14046859741211, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005006761290132999, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.716551303863525, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.030995206907391548, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.575809955596924, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.036083683371543884, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.490998268127441, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.18937189877033234, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.4666948318481445, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2057884782552719, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.849286079406738, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.00406738743185997, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.381149291992188, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15092603862285614, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.117634773254395, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1176677718758583, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.38977813720703, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005170173477381468, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.349451065063477, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.021862272173166275, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.538797378540039, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04518062248826027, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.470827102661133, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16709111630916595, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.553902626037598, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09719037264585495, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.02503776550293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.00500113470479846, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.909818649291992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12885573506355286, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.300446510314941, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09745315462350845, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.485111236572266, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.01052807830274105, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 215.97230529785156, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0595334991812706} +{"step": 1048576000, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 103.53919219970703, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.15218335390090942, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.133499145507812, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005849822890013456, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.725360870361328, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.031783346086740494, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.582202434539795, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.036899007856845856, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.48211145401001, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2130020558834076, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.457045555114746, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.22109737992286682, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.84537410736084, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004559183958917856, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.380773544311523, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1629570871591568, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.117215156555176, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12116105854511261, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.408309936523438, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004558270797133446, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.432190895080566, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.017104145139455795, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.590373516082764, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03319040313363075, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.471346855163574, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16228410601615906, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.5555806159973145, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09331560134887695, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.02215003967285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.003608575789257884, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.909863471984863, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1034451499581337, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.300230979919434, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07503575831651688, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.556617736816406, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.010822311975061893, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 216.7145538330078, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04662619158625603} +{"step": 1069547520, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 103.68988037109375, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.12915168702602386, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.126670837402344, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004544700495898724, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.735968112945557, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02976476587355137, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.5895161628723145, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.034438010305166245, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.473182678222656, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.17953996360301971, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.447375297546387, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2020108550786972, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.839967727661133, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003899507224559784, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.377888679504395, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14643864333629608, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.115545272827148, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11390834301710129, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.42764663696289, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004521127324551344, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.517373085021973, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.018254555761814117, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.6433515548706055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03052612394094467, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.471635818481445, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1397060751914978, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.557500839233398, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0884394645690918, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.02001953125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004380775149911642, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.910871505737305, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1155758947134018, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.300888061523438, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09710554033517838, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.62801170349121, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.009112004190683365, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 217.42367553710938, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04993504658341408} +{"step": 1090519040, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 103.82987976074219, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1530313938856125, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.11773681640625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.00552500132471323, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.7429070472717285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.0415622740983963, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.593441963195801, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04591192305088043, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.463537216186523, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.21034522354602814, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.43678092956543, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.22408296167850494, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.835615158081055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004557598847895861, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.376213073730469, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16105341911315918, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.114715576171875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12230727076530457, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.447463989257812, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004424763843417168, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.604172706604004, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02105460688471794, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.697543621063232, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03988337889313698, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.472085475921631, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1767890602350235, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.559545993804932, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09442353993654251, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.017826080322266, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0036407611332833767, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.911918640136719, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.11736823618412018, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.301468849182129, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08901986479759216, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.698122024536133, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008917471393942833, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 218.0973663330078, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.041790671646595} +{"step": 1111490560, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 103.96186828613281, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.16542930901050568, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.10944175720215, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0062004635110497475, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.74943208694458, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03630764037370682, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.597342491149902, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04516042396426201, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.45381498336792, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.23553799092769623, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.42604398727417, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2512286603450775, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.830854415893555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005871511530131102, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.373697280883789, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.18870291113853455, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.113398551940918, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.14485271275043488, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.467819213867188, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005831207614392042, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.692286491394043, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.023253347724676132, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.752353668212891, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04600203409790993, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.472576141357422, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.2353491634130478, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.561399459838867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.11908847093582153, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.015531539916992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.006268978118896484, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.912482261657715, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1760002076625824, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.301706314086914, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.1312415599822998, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.76629638671875, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007128425873816013, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 218.7415313720703, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06279528886079788} +{"step": 1132462080, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.0859375, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1442362517118454, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.100858688354492, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0050704386085271835, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.754452705383301, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02534906379878521, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.600085735321045, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03195352852344513, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.444334983825684, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.19692841172218323, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.415995121002197, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2371620088815689, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.826730728149414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005079666152596474, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.371909141540527, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.17588219046592712, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.112472534179688, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13594749569892883, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.487974166870117, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0056479391641914845, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.77773666381836, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02416338585317135, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.80449914932251, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04992740973830223, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.473561763763428, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.19682124257087708, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.563588619232178, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10377464443445206, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.012460708618164, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004541425500065088, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.910974502563477, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12909109890460968, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.300336837768555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.10206925123929977, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.830907821655273, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.006655599921941757, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 219.3548583984375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05856548994779587} +{"step": 1153433600, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.20063781738281, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.16996215283870697, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.093446731567383, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005566649604588747, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.762857913970947, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.043905530124902725, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.606022357940674, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.05552699416875839, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.43524169921875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.21992076933383942, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.406187057495117, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.23500517010688782, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.822336196899414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004980229772627354, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.369444847106934, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.17485591769218445, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.111077308654785, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13284239172935486, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.508596420288086, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005110130645334721, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.86392879486084, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.024223314598202705, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.857348918914795, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.05114417523145676, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.474167346954346, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.19483177363872528, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.565274715423584, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09907432645559311, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.009979248046875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.00414605438709259, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.909881591796875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1275695115327835, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.29928207397461, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09543931484222412, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.89330291748047, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.009982266463339329, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 219.943603515625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0683363676071167} +{"step": 1174405120, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.30704498291016, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1389496773481369, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.085079193115234, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005430834833532572, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.7680535316467285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.033222343772649765, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.609122276306152, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04181476682424545, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.425930976867676, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.18500754237174988, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.3960041999816895, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1995944380760193, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.817855834960938, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004203124903142452, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.366514205932617, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14942125976085663, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.1091890335083, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11104469001293182, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.528867721557617, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004140734672546387, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.949090003967285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.022509245201945305, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.909248352050781, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03960441052913666, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.474589824676514, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.14872242510318756, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.56679630279541, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09464430063962936, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.00760269165039, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0031652648467570543, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.908462524414062, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10216627269983292, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.298416137695312, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07993075251579285, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.954477310180664, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.009317560121417046, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 220.5013885498047, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.043171804398298264} +{"step": 1195376640, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.4041976928711, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.13972240686416626, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.076078414916992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005051028449088335, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.772785186767578, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.036537788808345795, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.6112236976623535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.038947150111198425, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.416506767272949, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.18183335661888123, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.385924816131592, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20247730612754822, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.813175201416016, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003937047906219959, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.362714767456055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1494070291519165, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.106942176818848, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11975661665201187, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.550107955932617, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004601023159921169, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.036116600036621, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01743357814848423, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.9631733894348145, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.036615584045648575, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.475347995758057, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.14968904852867126, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.569120407104492, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10008666664361954, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.005712509155273, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004482694901525974, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.907954216003418, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12146040052175522, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.298239707946777, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.10372648388147354, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.013643264770508, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.006812941282987595, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 221.030029296875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0508958138525486} +{"step": 1216348160, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.49468231201172, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1232905462384224, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.066572189331055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005103060510009527, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.774097919464111, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03037281706929207, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.610956192016602, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.036481354385614395, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.406851768493652, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.19349238276481628, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.375216484069824, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.21526174247264862, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.808646202087402, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004011508543044329, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.358993530273438, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15961575508117676, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.105351448059082, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12075138092041016, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.57200050354004, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004219215363264084, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.124750137329102, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.016341272741556168, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.017531394958496, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.032558899372816086, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.476412296295166, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.15479512512683868, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.5714430809021, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08796869218349457, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.0034236907959, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0038175382651388645, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.906782150268555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.11050175875425339, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.2976655960083, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0836557000875473, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.070890426635742, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.006613453384488821, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 221.5337371826172, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03888819366693497} +{"step": 1237319680, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.57926177978516, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.10501911491155624, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.05712127685547, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.003874940099194646, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.774757385253906, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02808493562042713, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.6102681159973145, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03531279414892197, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.3975749015808105, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.14714673161506653, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.364948749542236, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.17074516415596008, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.804352760314941, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0035867751576006413, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.355572700500488, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.13232167065143585, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.103711128234863, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10204103589057922, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.593677520751953, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003745436668395996, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.211527824401855, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.014538493007421494, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.07089900970459, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.028056709095835686, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.477374076843262, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.11965387314558029, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.573867321014404, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07502929866313934, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.000612258911133, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0024648394901305437, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.904387474060059, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.08506294339895248, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.296608924865723, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06314639747142792, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.12614631652832, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007235766854137182, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.0148162841797, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04230634495615959} +{"step": 1258291200, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.6566162109375, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1725146323442459, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.047584533691406, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006448633503168821, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.777064800262451, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.037327997386455536, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.611098289489746, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.041130710393190384, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.38805627822876, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.27024200558662415, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.354701042175293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.30113697052001953, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.801176071166992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005366876721382141, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.353483200073242, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.23018114268779755, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.102673530578613, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.17133069038391113, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.614355087280273, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0065747699700295925, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.29455852508545, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02583852782845497, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.122032165527344, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.058954790234565735, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.477745056152344, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.270285964012146, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.575252056121826, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.12541715800762177, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.998303413391113, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.006387461442500353, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.90280818939209, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.18075355887413025, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.295915603637695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.127655029296875, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.178985595703125, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.009582683444023132, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.47044372558594, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.08376245945692062} +{"step": 1279262720, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.72930908203125, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.13482117652893066, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.038511276245117, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004756717011332512, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.7758283615112305, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03202012926340103, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.608874320983887, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03829754516482353, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.379901885986328, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.18986955285072327, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.346011161804199, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2168307602405548, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.797747611999512, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004739533644169569, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.350686073303223, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.17291440069675446, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.10099983215332, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12781080603599548, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.632774353027344, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0056792558170855045, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.368534088134766, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.021291358396410942, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.166472434997559, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04092143848538399, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.478687286376953, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1996115893125534, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.577221870422363, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10304136574268341, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.99496841430664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004523320123553276, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.899304389953613, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12801167368888855, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.293856620788574, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.10302167385816574, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.227643966674805, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.010495496913790703, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.9008331298828, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.051628656685352325} +{"step": 1300234240, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.79573822021484, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.13634543120861053, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.02988052368164, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004795670974999666, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.777631759643555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.038321565836668015, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.609117031097412, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.050008777529001236, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.371673583984375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.18639540672302246, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.337297439575195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1952790468931198, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.794026374816895, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003814281430095434, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.347002983093262, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14539404213428497, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.098892211914062, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1119837537407875, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.65208625793457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004373121540993452, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.444958686828613, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.016665956005454063, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.212554931640625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.028710724785923958, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.479564189910889, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.13905754685401917, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.579245090484619, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08439965546131134, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.992490768432617, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0031536798924207687, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.896636962890625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.09317166358232498, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.29220962524414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06880170851945877, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.275402069091797, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008142548613250256, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 223.31375122070312, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04104143753647804} +{"step": 1321205760, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.85614013671875, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.11991895735263824, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.020427703857422, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0044539193622767925, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.776998043060303, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02833840437233448, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.607542514801025, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03240608423948288, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.362767219543457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.17022235691547394, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.327981948852539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20566944777965546, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.790857315063477, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004575631115585566, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.344206809997559, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16590331494808197, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.09754467010498, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12206763029098511, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.67185401916504, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004196248948574066, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.522153854370117, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01735599897801876, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.259506225585938, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03796315938234329, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.480027198791504, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.14469552040100098, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.580893039703369, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08200860768556595, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.990357398986816, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0028597188647836447, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.894331932067871, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.0897209495306015, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.290878295898438, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07082445174455643, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.32174301147461, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008130298927426338, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 223.70407104492188, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.035067733377218246} +{"step": 1342177280, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.91000366210938, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.12090231478214264, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.011343002319336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004405006766319275, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.775251388549805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.028459632769227028, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.605231285095215, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03396517410874367, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.354652404785156, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.17536211013793945, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.319432258605957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2077334076166153, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.787368774414062, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0042184386402368546, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.340785026550293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1696757823228836, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.095841407775879, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1312309205532074, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.6910457611084, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004986213985830545, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.597824096679688, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01915767416357994, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.305459976196289, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.039950910955667496, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.480681896209717, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.164734348654747, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.582650184631348, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09983896464109421, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.987604141235352, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.005307014100253582, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.891193389892578, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1360432654619217, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.289443016052246, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.11258317530155182, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.366439819335938, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.009820165112614632, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 224.0704345703125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04797147959470749} +{"step": 1363148800, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.95914459228516, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.12266255170106888, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.002254486083984, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004343907814472914, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.77362585067749, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03909745812416077, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.603142738342285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04924899339675903, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.346553802490234, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.16545867919921875, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.310697555541992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20039768517017365, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.784173965454102, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0039814976043999195, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.33757209777832, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15900801122188568, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.094215393066406, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11910692602396011, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.709867477416992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0046219900250434875, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.671378135681152, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.016842065379023552, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.350214004516602, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.0317123718559742, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.481405735015869, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.14977602660655975, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.5844221115112305, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08662792295217514, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.985075950622559, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0033517160918563604, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.888386726379395, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10169888287782669, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.288142204284668, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0800425186753273, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.409423828125, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007079108152538538, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 224.42140197753906, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.036449797451496124} +{"step": 1384120320, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.00325775146484, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.13937658071517944, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.993196487426758, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005096374545246363, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.773324012756348, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.043791238218545914, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.601828575134277, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.056824665516614914, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.3382487297058105, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.18847757577896118, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.302061080932617, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.23824858665466309, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.781389236450195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0052929711528122425, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.334988594055176, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.19357268512248993, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.09282398223877, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.14011070132255554, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.728116989135742, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005314511712640524, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.741361618041992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.021390244364738464, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.39327335357666, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04442291334271431, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.481631278991699, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.20855021476745605, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.585600852966309, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.11031462252140045, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.983283996582031, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004948679357767105, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.886673927307129, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.147866889834404, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.287618637084961, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.1103755384683609, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.45058822631836, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.009153001010417938, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 224.751220703125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.048531949520111084} +{"step": 1405091840, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.04375457763672, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.10768384486436844, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.984864234924316, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004268310498446226, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.771647930145264, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02685684710741043, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.599595546722412, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.029620308429002762, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.33049201965332, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.16580967605113983, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.294051170349121, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20322848856449127, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.778889656066895, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004096508026123047, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.3326416015625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15958058834075928, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.091608047485352, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12082671374082565, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.74541473388672, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004673193208873272, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.807977676391602, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.015238541178405285, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.43389892578125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.029598062857985497, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.481860637664795, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16055616736412048, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.586698055267334, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08673058450222015, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.981123924255371, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.00274286069907248, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.884284973144531, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.08789033442735672, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.286653518676758, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07138265669345856, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.488908767700195, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008389650844037533, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 225.0595245361328, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03842083737254143} +{"step": 1426063360, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.07978057861328, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.11424682289361954, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.97716236114502, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.00437437929213047, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.771548748016357, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.029524972662329674, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.598507881164551, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.034268077462911606, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.323174953460693, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.16994258761405945, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.286682605743408, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20146530866622925, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.776590347290039, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004186541773378849, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.330121040344238, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16378231346607208, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.090413093566895, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12444426864385605, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.76246452331543, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004725117702037096, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.872718811035156, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.016842201352119446, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.47314453125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03452730178833008, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.482134819030762, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1788577139377594, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.5878005027771, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.091860331594944, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.979096412658691, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004426164552569389, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.881898880004883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12158317863941193, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.285569190979004, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09832966327667236, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.525497436523438, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.006380075588822365, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 225.35157775878906, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.036656394600868225} +{"step": 1447034880, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.11185455322266, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.09643556922674179, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.968815803527832, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.00353591563180089, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.769958972930908, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.024502743035554886, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.596492767333984, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.028840720653533936, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.315727710723877, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.14153122901916504, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.279090881347656, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.16785170137882233, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.774130821228027, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0036296453326940536, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.327528953552246, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1333700716495514, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.08899974822998, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10298291593790054, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.77922821044922, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004024517722427845, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.935436248779297, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.012895695865154266, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.511449813842773, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.023904750123620033, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.482416152954102, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1264173686504364, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.588911533355713, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07400276511907578, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.977775573730469, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0025008204393088818, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.88018798828125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.07650656998157501, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.285087585449219, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05659816414117813, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.560285568237305, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.00817380752414465, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 225.6243133544922, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.031111562624573708} +{"step": 1468006400, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.1401138305664, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.10515349358320236, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.960914611816406, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.003772149793803692, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.76959228515625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.028046993538737297, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.595592021942139, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03214746341109276, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.308679580688477, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.15451858937740326, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.2718400955200195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1929660588502884, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.771769523620605, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004269892815500498, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.324880599975586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16194608807563782, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.087786674499512, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11877299845218658, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.79593276977539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004317194223403931, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.996491432189941, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.013264956884086132, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.54904556274414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.028858348727226257, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.482839584350586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.14086173474788666, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.590094089508057, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0801030844449997, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.976205825805664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0023968550376594067, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.878026962280273, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.08035176247358322, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.284153938293457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06039009988307953, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.593238830566406, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007782191038131714, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 225.8785400390625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03483430668711662} +{"step": 1488977920, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.16570281982422, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.09484777599573135, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.953831672668457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0033092147205024958, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.769753932952881, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02589098922908306, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.5948805809021, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.028868084773421288, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.301962852478027, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.13393336534500122, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.265114784240723, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1718757152557373, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.769940376281738, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003635979723185301, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.322879791259766, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14238610863685608, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.087044715881348, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10661575198173523, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.811513900756836, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003808810841292143, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.053820610046387, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01486620306968689, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.584199905395508, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03156864643096924, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.482931613922119, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.12646622955799103, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.590776443481445, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07536423206329346, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.974522590637207, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0025194394402205944, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.87574291229248, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.0765608623623848, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.283244132995605, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.057897601276636124, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.62434196472168, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007869213819503784, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 226.1183319091797, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03559652343392372} +{"step": 1509949440, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.18783569335938, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.09895095229148865, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.946351051330566, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.003855254966765642, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.769243240356445, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.026114264503121376, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.5938215255737305, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03040222078561783, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.295124053955078, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.15116529166698456, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.258347034454346, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.18604502081871033, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.768428802490234, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003968312405049801, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.321086883544922, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16751302778720856, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.086368560791016, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12204090505838394, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.826766967773438, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004744339268654585, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.109901428222656, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.012921188957989216, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.61860466003418, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.02957218512892723, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.483029842376709, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.12832176685333252, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.591396808624268, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07603254169225693, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.9732084274292, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.003120825393125415, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.874069213867188, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.07983042299747467, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.282768249511719, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06323586404323578, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.653589248657227, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008569438941776752, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 226.34408569335938, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03422527387738228} +{"step": 1530920960, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.20722198486328, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.12200627475976944, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.938793182373047, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.003869435051456094, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.766376495361328, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.030856937170028687, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.590757369995117, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04032910615205765, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.288541793823242, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1635395586490631, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.251768112182617, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20328690111637115, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.766850471496582, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.00417755963280797, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.319079399108887, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16557273268699646, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.085467338562012, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1235683336853981, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.841588973999023, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004672287497669458, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.164185523986816, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01630205474793911, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.652037620544434, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03872593864798546, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.483433246612549, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.15277989208698273, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.592474460601807, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08645153045654297, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.971979141235352, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.003018005285412073, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.87232780456543, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.09333217144012451, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.282090187072754, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07002276927232742, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.6810245513916, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007029588799923658, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 226.55357360839844, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04109768941998482} +{"step": 1551892480, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.22450256347656, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.09521835297346115, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.932168006896973, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.003269975772127509, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.765372276306152, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02851620875298977, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.589493751525879, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.033120330423116684, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.282537460327148, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1374092847108841, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.245760440826416, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.16277961432933807, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.765563011169434, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0035057298373430967, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.317466735839844, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.13432134687900543, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.084769248962402, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10213226079940796, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.855350494384766, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004102506209164858, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.214495658874512, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.013715768232941628, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.682940483093262, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.026802560314536095, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.483610153198242, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1213112324476242, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.593172073364258, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07451913505792618, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.970646858215332, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0023068799637258053, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.87031078338623, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.06931980699300766, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.2813720703125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.053345371037721634, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.706703186035156, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007975160144269466, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 226.74781799316406, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03034340776503086} +{"step": 1572864000, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.23927307128906, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.09950844943523407, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.925477981567383, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.003186449408531189, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.763619422912598, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.026082253083586693, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.587586402893066, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.029406163841485977, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.2766804695129395, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1308256983757019, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.240030288696289, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.16535447537899017, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.76389217376709, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0035416276659816504, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.31530475616455, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.13898152112960815, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.083916664123535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10632926225662231, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.868274688720703, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004488114267587662, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.261492729187012, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01393006183207035, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.711846351623535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.0286336038261652, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.483821868896484, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.11847735941410065, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.593992233276367, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07393816858530045, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.969770431518555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.002653158036991954, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.868964195251465, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.07967409491539001, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.280881881713867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06204644590616226, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.730693817138672, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008459383621811867, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 226.9296112060547, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.035071901977062225} +{"step": 1593835520, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.25202941894531, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.09232856333255768, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.919739723205566, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.003452804870903492, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.763603210449219, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.028786547482013702, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.5869975090026855, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.0340130478143692, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.270975112915039, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1398683786392212, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.2344536781311035, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.18017733097076416, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.762446403503418, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.00440597627311945, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.313411712646484, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16703595221042633, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.083142280578613, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11935721337795258, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.88054847717285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004721054807305336, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.305337905883789, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01333614531904459, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.739178657531738, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.02710065059363842, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484228134155273, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.12652982771396637, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.594968795776367, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07731086015701294, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.968432426452637, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0024864934384822845, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.866867065429688, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.08377070724964142, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.280138969421387, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06299881637096405, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.753089904785156, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.0073500704020261765, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.0967254638672, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03179004788398743} +{"step": 1614807040, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.26274108886719, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.08658438175916672, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.914325714111328, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.003183323424309492, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.763487815856934, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.023695724084973335, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.586431503295898, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02624283917248249, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.265836238861084, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1210898756980896, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.229589462280273, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.15328633785247803, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.76167106628418, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003346254350617528, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.312457084655762, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.135298952460289, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.082828521728516, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1005110815167427, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.89177131652832, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0038651577197015285, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.345854759216309, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.012254386208951473, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.764101028442383, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.023382646963000298, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.48420524597168, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.10895175486803055, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.5953168869018555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.06721123307943344, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.967546463012695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.002697671065106988, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.865452766418457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.07139666378498077, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.279678344726562, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05442569777369499, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.7736873626709, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.00811512116342783, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.25064086914062, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.031110865995287895} +{"step": 1635778560, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.2715072631836, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.0875350683927536, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.909456253051758, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0031282473355531693, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.7632880210876465, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.024403801187872887, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.585844993591309, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.026675814762711525, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.2612175941467285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1276482194662094, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.2251129150390625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1673377901315689, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.7608642578125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0036077320110052824, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.311263084411621, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14436137676239014, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.082265853881836, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10651878267526627, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.90247917175293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004144931677728891, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.384313583374023, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.012342647649347782, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.788177490234375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.026689814403653145, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484062671661377, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.12570792436599731, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.595541000366211, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.06956897675991058, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.966672897338867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0018918202258646488, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.864063262939453, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.07081413269042969, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.279229164123535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05379701405763626, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.792659759521484, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007124023977667093, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.39178466796875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.029650745913386345} +{"step": 1656750080, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.27922058105469, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.09254438430070877, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.904539108276367, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.003336805384606123, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.762028217315674, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.027185305953025818, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.5843682289123535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03481695055961609, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.256831645965576, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.12190473079681396, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.220854759216309, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.15241697430610657, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.759993553161621, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0037048859521746635, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.309988021850586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.12559665739536285, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.081741333007812, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09545998275279999, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.912424087524414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.00396141828969121, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.419631958007812, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.011592473834753036, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.8102445602417, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.022709235548973083, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.483968257904053, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.09833499789237976, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.595712661743164, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.06581036001443863, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.966239929199219, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0020239863079041243, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.863275527954102, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.06943506747484207, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.279192924499512, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.057496801018714905, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.810047149658203, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007932266220450401, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.5195770263672, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.027520110830664635} +{"step": 1677721600, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.28539276123047, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.08636458963155746, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.899662971496582, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.003126367926597595, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.760288238525391, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.024855680763721466, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.582627296447754, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.027737988159060478, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.252439975738525, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.12269458174705505, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.216618537902832, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.16236846148967743, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.75913143157959, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0033640353940427303, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.308691024780273, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.13228566944599152, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.081232070922852, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.0991339161992073, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.921649932861328, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004153966438025236, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.451708793640137, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01168576255440712, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.830418586730957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.02336796186864376, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484172821044922, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.10992501676082611, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.596226215362549, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.06737766414880753, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.96570110321045, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0022445989307016134, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.862350463867188, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.07366536557674408, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.279023170471191, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05659611150622368, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.826017379760742, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.006904190871864557, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.6360321044922, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02801278606057167} +{"step": 1698693120, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.29056549072266, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.08222708851099014, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.895790100097656, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.00249351910315454, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.759908199310303, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.023136410862207413, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.582042217254639, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.023873064666986465, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.2487077713012695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.11523997038602829, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.213020324707031, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.14642122387886047, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.758261680603027, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.00312162097543478, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.30738639831543, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1208762526512146, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.080615043640137, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.0938158929347992, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.930072784423828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003327560843899846, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.481230735778809, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.010941431857645512, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.849016189575195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.02035794034600258, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484273433685303, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.10035126656293869, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.596695423126221, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.06405512988567352, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.965225219726562, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.001969854813069105, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.861434936523438, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.06273269653320312, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.278889656066895, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.04952383041381836, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.84054946899414, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007304046303033829, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.74229431152344, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02697214111685753} +{"step": 1719664640, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.29474639892578, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.08053798973560333, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.891907691955566, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002470981329679489, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.759742259979248, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.023749781772494316, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.581693172454834, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.025702321901917458, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.2449798583984375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1044728085398674, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.209481239318848, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.13520775735378265, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.757659912109375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.002936718286946416, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.306357383728027, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.11352123320102692, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.080192565917969, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.08533827215433121, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.937862396240234, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.00327911414206028, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.508423805236816, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.010717427358031273, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.866223335266113, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.022361518815159798, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484370708465576, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.09447772800922394, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.597064971923828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.06116807833313942, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.964820861816406, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0021805083379149437, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.86057186126709, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.06694060564041138, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.278719902038574, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05399807170033455, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.85369873046875, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.006996613927185535, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.83810424804688, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02577265165746212} +{"step": 1740636160, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.29800415039062, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.081877201795578, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.888463973999023, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002999324584379792, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.759363174438477, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02293705940246582, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.5811686515808105, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.024795982986688614, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.241754055023193, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.11824977397918701, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.206352710723877, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.14239290356636047, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.756993293762207, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0031191955786198378, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.305212020874023, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.12769387662410736, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.079758644104004, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09314040094614029, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.945098876953125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0038116585928946733, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.533265113830566, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.010478634387254715, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.882036209106445, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.020312123000621796, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.48447322845459, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.0976739302277565, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.597483158111572, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.060745541006326675, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.964506149291992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.002410557819530368, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.859861373901367, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.06309107691049576, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.278739929199219, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.048804331570863724, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.865558624267578, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007771092001348734, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.9233856201172, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.024335267022252083} +{"step": 1761607680, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30035400390625, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.08262743800878525, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.885125160217285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0028705678414553404, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.758144378662109, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.025603797286748886, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.579853534698486, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.026411747559905052, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.238664627075195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.11193875223398209, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.2034010887146, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.14460991322994232, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.756580352783203, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0031800272408872843, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.304454803466797, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.12638451159000397, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.079514503479004, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09481709450483322, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.951675415039062, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.00387512962333858, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.55579948425293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.011679599992930889, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.896529197692871, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.022278184071183205, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484588623046875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.10039163380861282, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.597900867462158, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.06081024557352066, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.964058876037598, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.002134174108505249, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.858975410461426, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.06369268894195557, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.278526306152344, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.050261903554201126, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.876115798950195, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.00790554191917181, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.9988250732422, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.027416495606303215} +{"step": 1782579200, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.3022232055664, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.07878340035676956, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.882458686828613, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0026580956764519215, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.757852077484131, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.024758415296673775, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.579380035400391, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.026024416089057922, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.235978603363037, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.09764336794614792, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.200839042663574, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.12805365025997162, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.756014823913574, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0028751695062965155, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.303569793701172, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.11211875826120377, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.079211235046387, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.08606603741645813, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.957571029663086, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.00339848967269063, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.575996398925781, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01123578380793333, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.909613609313965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.021218450739979744, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484692096710205, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.0901428684592247, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.598194122314453, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.05997391417622566, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.96374797821045, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0017485524294897914, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.85832691192627, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.058948058634996414, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.278374671936035, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.045911990106105804, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.88545036315918, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007743470370769501, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.06591796875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02843712456524372} +{"step": 1803550720, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30377960205078, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.07831688970327377, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.879840850830078, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002502014860510826, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.757473468780518, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02166070230305195, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.578831195831299, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.023803748190402985, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.233503818511963, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.09940718114376068, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.198501110076904, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.12493475526571274, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.755669593811035, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.002736730268225074, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.302896499633789, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.10707197338342667, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.078957557678223, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.08011666685342789, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.962697982788086, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0034294608049094677, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.59328556060791, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.009824363514780998, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.92090892791748, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.019727937877178192, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484716415405273, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.0828806459903717, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.598348140716553, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.05447838082909584, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.963591575622559, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0016649888129904866, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.857891082763672, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.055146828293800354, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.278375625610352, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.042283881455659866, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.89365577697754, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007827048189938068, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.12445068359375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02535591460764408} +{"step": 1824522240, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.3048095703125, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.07851530611515045, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.877779006958008, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0026471782475709915, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.75724983215332, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.026824600994586945, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.578531265258789, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.032215796411037445, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.2314372062683105, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.09744122624397278, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.1965556144714355, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.12569354474544525, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.755425453186035, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.002450918545946479, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.302347183227539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.10444208234548569, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.078822135925293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.08043142408132553, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.967266082763672, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0033551061060279608, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.608429908752441, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.010893681086599827, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.930834770202637, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.0196553822606802, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484781265258789, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.079957515001297, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.598560333251953, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.05540608614683151, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.963278770446777, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0019197971560060978, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.857274055480957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.05586980655789375, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.278229713439941, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.04382964223623276, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.90082550048828, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007744465954601765, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.17526245117188, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.025466160848736763} +{"step": 1845493760, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.3055191040039, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.08312857151031494, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.875743865966797, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002425978658720851, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.756936550140381, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.024503257125616074, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.578189373016357, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.027739034965634346, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.229462623596191, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.09918131679296494, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.194693088531494, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.13067613542079926, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.755264282226562, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.002749767154455185, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.301908493041992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.11620441824197769, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.078758239746094, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.08527550846338272, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.971351623535156, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0038126367144286633, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.621872901916504, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.012455493211746216, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.939682006835938, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.027869809418916702, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484829425811768, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.09697607159614563, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.59871768951416, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.05745967477560043, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.962936401367188, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0019880745094269514, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.856589317321777, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.06177029758691788, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.2780179977417, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.04937376081943512, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.9069881439209, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.0067091165110468864, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.21864318847656, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.027007214725017548} +{"step": 1866465280, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.3060073852539, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.07229671627283096, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.87409782409668, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002802190138027072, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.756860256195068, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02230265364050865, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.578115463256836, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.0255195964127779, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.227826118469238, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.09386494010686874, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.193133354187012, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.12417662888765335, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.755011558532715, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0027267930563539267, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.30142879486084, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.11084408313035965, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.07857608795166, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.08197030425071716, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.974937438964844, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003290374530479312, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.633666038513184, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.0098643247038126, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.947480201721191, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.019475556910037994, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484907150268555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.08682455122470856, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.598877906799316, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.057562194764614105, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.962820053100586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0019881045445799828, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.856203079223633, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.062115494161844254, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.278019905090332, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.04690086469054222, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.9122314453125, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.006404906511306763, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.2553253173828, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.024462366476655006} +{"step": 1887436800, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30631256103516, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.07300835847854614, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.872553825378418, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0025588776916265488, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.756596088409424, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.022003786638379097, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.5778093338012695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02348657324910164, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.226372718811035, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.09442364424467087, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.191746234893799, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.12190824747085571, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.754941940307617, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.002718583447858691, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.301188468933105, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.11032214760780334, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.078518867492676, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.08091019093990326, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.977943420410156, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003409226192161441, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.643482208251953, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01071534026414156, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.953962326049805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.020809590816497803, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484946250915527, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.07750813663005829, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.59900426864624, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.05501068755984306, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.962650299072266, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0014580566203221679, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.855767250061035, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.053613703697919846, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.277923583984375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.04218744859099388, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.916606903076172, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007581314537674189, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.2860870361328, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.024909593164920807} +{"step": 1908408320, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30642700195312, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.07813630998134613, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.87141227722168, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002712781075388193, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.756415367126465, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02107904851436615, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.577613830566406, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.023881910368800163, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.2252421379089355, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1019112840294838, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.190654754638672, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.12356527894735336, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.75479793548584, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.00257381284609437, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.300873756408691, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.10082991421222687, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.078407287597656, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.07776675373315811, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.98050308227539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0031012576073408127, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.651812553405762, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.009856068529188633, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.959497451782227, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.017461923882365227, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4850358963012695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.08590149134397507, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.599184989929199, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0544663704931736, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.962437629699707, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0015302525134757161, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.85533332824707, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.050285402685403824, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.277786254882812, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.03906504064798355, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.920209884643555, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007248177193105221, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.3114776611328, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02320844680070877} +{"step": 1929379840, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30650329589844, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.07277852296829224, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.870351791381836, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0024970327503979206, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.756181240081787, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.023260075598955154, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.577356815338135, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02378997579216957, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.224234104156494, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.08632039278745651, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.189698696136475, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.11263101547956467, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.754719734191895, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0023791345302015543, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.300680160522461, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.09842508286237717, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.078360557556152, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.07363726943731308, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.982534408569336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0032222559675574303, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.658417701721191, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.009104901924729347, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.963911056518555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.017363008111715317, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485048770904541, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.06896000355482101, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.599253177642822, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.05146187171339989, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.962347984313965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0013808069052174687, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.855083465576172, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.050734587013721466, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.277729034423828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.039501190185546875, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.92310333251953, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007040908560156822, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.331787109375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0234835147857666} +{"step": 1950351360, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.3065185546875, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.07062280178070068, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.869465827941895, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002152391243726015, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.755838394165039, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02298651449382305, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.5770134925842285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.025092096999287605, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.223417282104492, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.08398991078138351, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.188920021057129, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.10388249903917313, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.754668235778809, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0021027985494583845, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.300535202026367, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.09221577644348145, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.078324317932129, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.07049643248319626, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.98409652709961, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0029708293732255697, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.6634521484375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.009111653082072735, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.967288970947266, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.017464138567447662, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485046863555908, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.06993067264556885, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.599282264709473, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.04923270642757416, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.962360382080078, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0014208569191396236, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.85496997833252, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.049721598625183105, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.277751922607422, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.03688906133174896, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.92535400390625, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007262997329235077, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.34742736816406, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.023838866502046585} +{"step": 1971322880, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30650329589844, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.06931479275226593, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.868871688842773, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0022191177122294903, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.755641937255859, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.021981528028845787, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.576808929443359, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02310802973806858, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.222832679748535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.0874566063284874, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.188366413116455, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.10981310904026031, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.754595756530762, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0022246893495321274, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.300374031066895, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.08949960023164749, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.07827377319336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.06940118968486786, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.98531723022461, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003293000627309084, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.667318344116211, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.00915166549384594, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.969895362854004, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.01605633646249771, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485080242156982, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.07326427102088928, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.599356174468994, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.05126631632447243, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.962315559387207, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0014689292293041945, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.854818344116211, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.048536963760852814, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.277735710144043, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.037022680044174194, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.92705535888672, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008121767081320286, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.3592071533203, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02403034083545208} +{"step": 1992294400, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30643463134766, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.07331487536430359, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.868467330932617, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002402650658041239, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.7555952072143555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.023821616545319557, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.576748371124268, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02778724953532219, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.222424507141113, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.08504357188940048, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.187974452972412, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.10530304908752441, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.754514694213867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0020479520317167044, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.300212860107422, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.09413424134254456, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.078210830688477, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.06940726935863495, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.986215591430664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0033154315315186977, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.670133590698242, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.009239386767148972, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.971803665161133, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.017198024317622185, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485109329223633, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.06561122089624405, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.5994157791137695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0472404919564724, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.962270736694336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0011963029392063618, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.854696273803711, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.04639764875173569, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.277726173400879, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.035126861184835434, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.928285598754883, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007185902446508408, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.36767578125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02265845239162445} +{"step": 2013265920, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30640411376953, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.06888440251350403, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.868151664733887, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0024223162326961756, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.755580902099609, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02356106787919998, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.576724529266357, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02332456223666668, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.222118377685547, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.08130747824907303, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.187692642211914, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.10179462283849716, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.75448226928711, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0021927712950855494, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.300126075744629, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.09009759873151779, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.078184127807617, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.06720911711454391, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.986820220947266, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.002971093403175473, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.672033309936523, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.008968408219516277, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.973082542419434, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.016696641221642494, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485131740570068, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.06142694130539894, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.599458694458008, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.04639425128698349, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.962223052978516, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.001166749862022698, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.854583740234375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.04609224200248718, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.277700424194336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.03625890240073204, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.929109573364258, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.00717343483120203, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.3733367919922, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.022993512451648712} +{"step": 2034237440, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.3063735961914, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.07084300369024277, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.867961883544922, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0022411069367080927, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.755569934844971, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.022764230147004128, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.5767059326171875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.024176178500056267, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.221927165985107, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.07990248501300812, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.1875176429748535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.10108041763305664, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.754472732543945, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0021007563918828964, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.300093650817871, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.08944126963615417, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.078178405761719, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.0671757161617279, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.98719024658203, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0033461106941103935, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.67318058013916, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.009400050155818462, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.973857879638672, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.016913022845983505, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485141754150391, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.0642218217253685, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.599478244781494, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.046929579228162766, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.9622163772583, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0011202897876501083, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.854545593261719, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.045008737593889236, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.277700424194336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.034035418182611465, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.929615020751953, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007590645924210548, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.37681579589844, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02300846204161644} +{"step": 2055208960, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30635070800781, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.06830166280269623, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.867866516113281, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002308495808392763, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.755560874938965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.022658104076981544, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.576695442199707, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02350487746298313, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.221829414367676, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.08114005625247955, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.187427043914795, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.0978272408246994, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.75446605682373, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.002138023264706135, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.30007266998291, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.08474022150039673, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.078170776367188, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.06427700072526932, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.987384796142578, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.002977714641019702, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.673791885375977, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.009160446003079414, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.974275588989258, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.016123000532388687, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485145568847656, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.06108659878373146, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.599485397338867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.04569607600569725, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.96220588684082, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0012808861210942268, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.854514122009277, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.04590999335050583, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.277693748474121, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.035198014229536057, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.929874420166016, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007705077063292265, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.37863159179688, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02321181818842888} diff --git a/metrics/jsonlines/resume.jsonl b/metrics/jsonlines/resume.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..92733e5e5f3807c0d022c001b1110b1f10275c5c --- /dev/null +++ b/metrics/jsonlines/resume.jsonl @@ -0,0 +1 @@ +{"step": 0, "resume/resume_step": 0} diff --git a/metrics/jsonlines/throughput.jsonl b/metrics/jsonlines/throughput.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..78b8d90edb5ba7d6edaa107bdc38a476de67e82d --- /dev/null +++ b/metrics/jsonlines/throughput.jsonl @@ -0,0 +1,98 @@ +{"step": 20971520, "throughput/token_count": 20971520, "throughput/batch_count": 10, "throughput/flop_count": 0, "throughput/total_time": 59.51673407689668, "throughput/update_time": 59.329752838937566, "throughput/token_count_per_second_total_recent": 374995.30176031415, "throughput/token_count_per_second_total_cum": 352363.4205617604, "throughput/token_count_per_second_update_recent": 375959.60157094675, "throughput/token_count_per_second_update_cum": 353473.9147983874, "throughput/batch_count_per_second_total_recent": 0.17881169403091152, "throughput/batch_count_per_second_total_cum": 0.1680199721154024, "throughput/batch_count_per_second_update_recent": 0.17927150801226938, "throughput/batch_count_per_second_update_cum": 0.16854949703139657, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 41943040, "throughput/token_count": 41943040, "throughput/batch_count": 20, "throughput/flop_count": 0, "throughput/total_time": 115.38901872490533, "throughput/update_time": 115.08477980294265, "throughput/token_count_per_second_total_recent": 375180.5645944244, "throughput/token_count_per_second_total_cum": 363492.4749641458, "throughput/token_count_per_second_update_recent": 376052.867370802, "throughput/token_count_per_second_update_cum": 364453.4061916634, "throughput/batch_count_per_second_total_recent": 0.1789000342342493, "throughput/batch_count_per_second_total_cum": 0.17332671879012385, "throughput/batch_count_per_second_update_recent": 0.17931598061122989, "throughput/batch_count_per_second_update_cum": 0.17378492650588198, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 62914560, "throughput/token_count": 62914560, "throughput/batch_count": 30, "throughput/flop_count": 0, "throughput/total_time": 262.1596394549124, "throughput/update_time": 170.83132880181074, "throughput/token_count_per_second_total_recent": 240408.59283058238, "throughput/token_count_per_second_total_cum": 239985.682505565, "throughput/token_count_per_second_update_recent": 376101.53880809934, "throughput/token_count_per_second_update_cum": 368284.67261406174, "throughput/batch_count_per_second_total_recent": 0.11463575021294707, "throughput/batch_count_per_second_total_cum": 0.11443409085539102, "throughput/batch_count_per_second_update_recent": 0.17933918896107642, "throughput/batch_count_per_second_update_cum": 0.17561181669905745, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 83886080, "throughput/token_count": 83886080, "throughput/batch_count": 40, "throughput/flop_count": 0, "throughput/total_time": 318.01136298198253, "throughput/update_time": 226.56994250579737, "throughput/token_count_per_second_total_recent": 264837.4413424087, "throughput/token_count_per_second_total_cum": 263783.27872753626, "throughput/token_count_per_second_update_recent": 376138.9812689484, "throughput/token_count_per_second_update_cum": 370243.6389939657, "throughput/batch_count_per_second_total_recent": 0.12628433291550098, "throughput/batch_count_per_second_total_cum": 0.12578166900994123, "throughput/batch_count_per_second_update_recent": 0.17935704291770382, "throughput/batch_count_per_second_update_cum": 0.17654592466066632, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 104857600, "throughput/token_count": 104857600, "throughput/batch_count": 50, "throughput/flop_count": 0, "throughput/total_time": 465.39904637495056, "throughput/update_time": 282.29461103421636, "throughput/token_count_per_second_total_recent": 225245.86028441804, "throughput/token_count_per_second_total_cum": 225306.86475777836, "throughput/token_count_per_second_update_recent": 376180.34755834244, "throughput/token_count_per_second_update_cum": 371447.40247021726, "throughput/batch_count_per_second_total_recent": 0.10740559591504004, "throughput/batch_count_per_second_total_cum": 0.10743468511475485, "throughput/batch_count_per_second_update_recent": 0.17937676790158388, "throughput/batch_count_per_second_update_cum": 0.17711992381583083, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 125829120, "throughput/token_count": 125829120, "throughput/batch_count": 60, "throughput/flop_count": 0, "throughput/total_time": 521.2494050179375, "throughput/update_time": 338.0269747101702, "throughput/token_count_per_second_total_recent": 241633.3477073923, "throughput/token_count_per_second_total_cum": 241399.06691245027, "throughput/token_count_per_second_update_recent": 376198.8943952416, "throughput/token_count_per_second_update_cum": 372245.7951998888, "throughput/batch_count_per_second_total_recent": 0.11521975884790053, "throughput/batch_count_per_second_total_cum": 0.11510804505941881, "throughput/batch_count_per_second_update_recent": 0.17938561172258452, "throughput/batch_count_per_second_update_cum": 0.17750062713617745, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 146800640, "throughput/token_count": 146800640, "throughput/batch_count": 70, "throughput/flop_count": 0, "throughput/total_time": 667.8293691409053, "throughput/update_time": 393.75072771217674, "throughput/token_count_per_second_total_recent": 219698.78609724584, "throughput/token_count_per_second_total_cum": 219817.58632275206, "throughput/token_count_per_second_update_recent": 376220.48879840254, "throughput/token_count_per_second_update_cum": 372826.3331802858, "throughput/batch_count_per_second_total_recent": 0.10476054482328694, "throughput/batch_count_per_second_total_cum": 0.10481719318521121, "throughput/batch_count_per_second_update_recent": 0.1793959087364209, "throughput/batch_count_per_second_update_cum": 0.17777744921697894, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 167772160, "throughput/token_count": 167772160, "throughput/batch_count": 80, "throughput/flop_count": 0, "throughput/total_time": 723.6805678269593, "throughput/update_time": 449.4893446461065, "throughput/token_count_per_second_total_recent": 231876.7102262385, "throughput/token_count_per_second_total_cum": 231831.7880273888, "throughput/token_count_per_second_update_recent": 376223.9184529116, "throughput/token_count_per_second_update_cum": 373250.58313026524, "throughput/batch_count_per_second_total_recent": 0.11056743155776906, "throughput/batch_count_per_second_total_cum": 0.11054601098412933, "throughput/batch_count_per_second_update_recent": 0.1793975441231306, "throughput/batch_count_per_second_update_cum": 0.17797974735749494, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 188743680, "throughput/token_count": 188743680, "throughput/batch_count": 90, "throughput/flop_count": 0, "throughput/total_time": 871.0406558898976, "throughput/update_time": 505.2065281631658, "throughput/token_count_per_second_total_recent": 216563.42196256324, "throughput/token_count_per_second_total_cum": 216687.56644564457, "throughput/token_count_per_second_update_recent": 376242.83252498537, "throughput/token_count_per_second_update_cum": 373597.07263925485, "throughput/batch_count_per_second_total_recent": 0.10326548669937288, "throughput/batch_count_per_second_total_cum": 0.10332468340189198, "throughput/batch_count_per_second_update_recent": 0.1794065630555083, "throughput/batch_count_per_second_update_cum": 0.17814496643030875, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 209715200, "throughput/token_count": 209715200, "throughput/batch_count": 100, "throughput/flop_count": 0, "throughput/total_time": 926.8863875919487, "throughput/update_time": 560.9345708230976, "throughput/token_count_per_second_total_recent": 226236.90845091513, "throughput/token_count_per_second_total_cum": 226257.71918480773, "throughput/token_count_per_second_update_recent": 376250.52247975284, "throughput/token_count_per_second_update_cum": 373867.48991468037, "throughput/batch_count_per_second_total_recent": 0.10787816450639492, "throughput/batch_count_per_second_total_cum": 0.1078880878376044, "throughput/batch_count_per_second_update_recent": 0.17941022991168634, "throughput/batch_count_per_second_update_cum": 0.17827391143545168, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 230686720, "throughput/token_count": 230686720, "throughput/batch_count": 110, "throughput/flop_count": 0, "throughput/total_time": 1073.8704686219571, "throughput/update_time": 616.6548248290783, "throughput/token_count_per_second_total_recent": 205815.58651052113, "throughput/token_count_per_second_total_cum": 214818.01273111498, "throughput/token_count_per_second_update_recent": 376292.66037302185, "throughput/token_count_per_second_update_cum": 374093.7566878533, "throughput/batch_count_per_second_total_recent": 0.09814051938558632, "throughput/batch_count_per_second_total_cum": 0.1024332107215476, "throughput/batch_count_per_second_update_recent": 0.1794303228249654, "throughput/batch_count_per_second_update_cum": 0.17838180384056726, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 251658240, "throughput/token_count": 251658240, "throughput/batch_count": 120, "throughput/flop_count": 0, "throughput/total_time": 1129.7145008929074, "throughput/update_time": 672.3896088181064, "throughput/token_count_per_second_total_recent": 226206.2112924804, "throughput/token_count_per_second_total_cum": 222762.6889812369, "throughput/token_count_per_second_update_recent": 376303.6444561435, "throughput/token_count_per_second_update_cum": 374274.43360160274, "throughput/batch_count_per_second_total_recent": 0.107863526960602, "throughput/batch_count_per_second_total_cum": 0.10622152756749959, "throughput/batch_count_per_second_update_recent": 0.1794355604439466, "throughput/batch_count_per_second_update_cum": 0.17846795730667245, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 272629760, "throughput/token_count": 272629760, "throughput/batch_count": 130, "throughput/flop_count": 0, "throughput/total_time": 1276.169682028005, "throughput/update_time": 728.117079011281, "throughput/token_count_per_second_total_recent": 205884.25492074105, "throughput/token_count_per_second_total_cum": 213631.27790871408, "throughput/token_count_per_second_update_recent": 376315.8920842829, "throughput/token_count_per_second_update_cum": 374431.21148896444, "throughput/batch_count_per_second_total_recent": 0.09817326303517392, "throughput/batch_count_per_second_total_cum": 0.10186733146129326, "throughput/batch_count_per_second_update_recent": 0.1794414005681433, "throughput/batch_count_per_second_update_cum": 0.1785427148289511, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 293601280, "throughput/token_count": 293601280, "throughput/batch_count": 140, "throughput/flop_count": 0, "throughput/total_time": 1332.0184532779967, "throughput/update_time": 783.8552743501496, "throughput/token_count_per_second_total_recent": 226441.23026558387, "throughput/token_count_per_second_total_cum": 220418.32774723912, "throughput/token_count_per_second_update_recent": 376316.19724369614, "throughput/token_count_per_second_update_cum": 374560.5720946489, "throughput/batch_count_per_second_total_recent": 0.10797559273986047, "throughput/batch_count_per_second_total_cum": 0.10510364901887852, "throughput/batch_count_per_second_update_recent": 0.17944154607949073, "throughput/batch_count_per_second_update_cum": 0.1786043987725491, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 314572800, "throughput/token_count": 314572800, "throughput/batch_count": 150, "throughput/flop_count": 0, "throughput/total_time": 1478.8752941149287, "throughput/update_time": 839.5983157731825, "throughput/token_count_per_second_total_recent": 205994.0604950742, "throughput/token_count_per_second_total_cum": 212710.8358979411, "throughput/token_count_per_second_update_recent": 376304.24777001113, "throughput/token_count_per_second_update_cum": 374670.59436667786, "throughput/batch_count_per_second_total_recent": 0.0982256224131938, "throughput/batch_count_per_second_total_cum": 0.10142843050858551, "throughput/batch_count_per_second_update_recent": 0.17943584812641675, "throughput/batch_count_per_second_update_cum": 0.17865686148008245, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 335544320, "throughput/token_count": 335544320, "throughput/batch_count": 160, "throughput/flop_count": 0, "throughput/total_time": 1534.7253116948996, "throughput/update_time": 895.3389847053913, "throughput/token_count_per_second_total_recent": 226374.77568927762, "throughput/token_count_per_second_total_cum": 218634.77290892924, "throughput/token_count_per_second_update_recent": 376298.01619008137, "throughput/token_count_per_second_update_cum": 374767.90995581396, "throughput/batch_count_per_second_total_recent": 0.10794390472854501, "throughput/batch_count_per_second_total_cum": 0.1042531837982794, "throughput/batch_count_per_second_update_recent": 0.17943287667755192, "throughput/batch_count_per_second_update_cum": 0.17870326516905496, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 356515840, "throughput/token_count": 356515840, "throughput/batch_count": 170, "throughput/flop_count": 0, "throughput/total_time": 1681.113392906962, "throughput/update_time": 951.0781835562084, "throughput/token_count_per_second_total_recent": 206033.34258770983, "throughput/token_count_per_second_total_cum": 212071.26271447813, "throughput/token_count_per_second_update_recent": 376287.2633610761, "throughput/token_count_per_second_update_cum": 374854.39805478416, "throughput/batch_count_per_second_total_recent": 0.09824435357461445, "throughput/batch_count_per_second_total_cum": 0.10112345824932009, "throughput/batch_count_per_second_update_recent": 0.17942774932912642, "throughput/batch_count_per_second_update_cum": 0.1787445059083863, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 377487360, "throughput/token_count": 377487360, "throughput/batch_count": 180, "throughput/flop_count": 0, "throughput/total_time": 1736.965916060959, "throughput/update_time": 1006.818993799272, "throughput/token_count_per_second_total_recent": 226617.46355150297, "throughput/token_count_per_second_total_cum": 217325.71520807667, "throughput/token_count_per_second_update_recent": 376284.52467120043, "throughput/token_count_per_second_update_cum": 374930.70981461747, "throughput/batch_count_per_second_total_recent": 0.10805962731909893, "throughput/batch_count_per_second_total_cum": 0.10362897644428094, "throughput/batch_count_per_second_update_recent": 0.1794264434200289, "throughput/batch_count_per_second_update_cum": 0.1787808941910827, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 398458880, "throughput/token_count": 398458880, "throughput/batch_count": 190, "throughput/flop_count": 0, "throughput/total_time": 1883.3845960129984, "throughput/update_time": 1062.5558878729353, "throughput/token_count_per_second_total_recent": 206226.29389288812, "throughput/token_count_per_second_total_cum": 211565.32810319852, "throughput/token_count_per_second_update_recent": 376272.23472675163, "throughput/token_count_per_second_update_cum": 375000.3972004241, "throughput/batch_count_per_second_total_recent": 0.09833635992664724, "throughput/batch_count_per_second_total_cum": 0.10088220982704092, "throughput/batch_count_per_second_update_recent": 0.17942058311784345, "throughput/batch_count_per_second_update_cum": 0.17881412372609334, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 419430400, "throughput/token_count": 419430400, "throughput/batch_count": 200, "throughput/flop_count": 0, "throughput/total_time": 1939.2464721049182, "throughput/update_time": 1118.2928310850402, "throughput/token_count_per_second_total_recent": 226752.54835490198, "throughput/token_count_per_second_total_cum": 216285.24585878826, "throughput/token_count_per_second_update_recent": 376266.45717900456, "throughput/token_count_per_second_update_cum": 375063.12151982717, "throughput/batch_count_per_second_total_recent": 0.10812404077286815, "throughput/batch_count_per_second_total_cum": 0.1031328419965688, "throughput/batch_count_per_second_update_recent": 0.17941782816839436, "throughput/batch_count_per_second_update_cum": 0.17884403301230772, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 440401920, "throughput/token_count": 440401920, "throughput/batch_count": 210, "throughput/flop_count": 0, "throughput/total_time": 2086.0689364429563, "throughput/update_time": 1174.0314252841054, "throughput/token_count_per_second_total_recent": 206256.4730523848, "throughput/token_count_per_second_total_cum": 211115.70778238412, "throughput/token_count_per_second_update_recent": 376255.55816126487, "throughput/token_count_per_second_update_cum": 375119.36266393086, "throughput/batch_count_per_second_total_recent": 0.0983507504712986, "throughput/batch_count_per_second_total_cum": 0.1006678141509934, "throughput/batch_count_per_second_update_recent": 0.17941263111174816, "throughput/batch_count_per_second_update_cum": 0.17887085087963622, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 461373440, "throughput/token_count": 461373440, "throughput/batch_count": 220, "throughput/flop_count": 0, "throughput/total_time": 2141.922117186943, "throughput/update_time": 1229.7717256471515, "throughput/token_count_per_second_total_recent": 226657.95255491446, "throughput/token_count_per_second_total_cum": 215401.5948095895, "throughput/token_count_per_second_update_recent": 376250.2524249545, "throughput/token_count_per_second_update_cum": 375169.9851102108, "throughput/batch_count_per_second_total_recent": 0.10807893398042415, "throughput/batch_count_per_second_total_cum": 0.10271148434142566, "throughput/batch_count_per_second_update_recent": 0.17941010113952374, "throughput/batch_count_per_second_update_cum": 0.17889498954306163, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 482344960, "throughput/token_count": 482344960, "throughput/batch_count": 230, "throughput/flop_count": 0, "throughput/total_time": 2288.3046315449756, "throughput/update_time": 1285.5049123089993, "throughput/token_count_per_second_total_recent": 206268.38712383743, "throughput/token_count_per_second_total_cum": 210787.0400429768, "throughput/token_count_per_second_update_recent": 376247.43348418304, "throughput/token_count_per_second_update_cum": 375218.29390260455, "throughput/batch_count_per_second_total_recent": 0.0983564315432727, "throughput/batch_count_per_second_total_cum": 0.10051109316014137, "throughput/batch_count_per_second_update_recent": 0.17940875696381714, "throughput/batch_count_per_second_update_cum": 0.1789180249703429, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 503316480, "throughput/token_count": 503316480, "throughput/batch_count": 240, "throughput/flop_count": 0, "throughput/total_time": 2344.147989144898, "throughput/update_time": 1341.2354510270525, "throughput/token_count_per_second_total_recent": 226771.4711686419, "throughput/token_count_per_second_total_cum": 214711.9048501714, "throughput/token_count_per_second_update_recent": 376252.2290719448, "throughput/token_count_per_second_update_cum": 375263.328757516, "throughput/batch_count_per_second_total_recent": 0.1081330638735971, "throughput/batch_count_per_second_total_cum": 0.10238261454113551, "throughput/batch_count_per_second_update_recent": 0.17941104367825736, "throughput/batch_count_per_second_update_cum": 0.17893949926257896, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 524288000, "throughput/token_count": 524288000, "throughput/batch_count": 250, "throughput/flop_count": 0, "throughput/total_time": 2491.0224517869065, "throughput/update_time": 1396.97127348301, "throughput/token_count_per_second_total_recent": 206267.42015399478, "throughput/token_count_per_second_total_cum": 210471.00543951662, "throughput/token_count_per_second_update_recent": 376257.54590563447, "throughput/token_count_per_second_update_cum": 375303.35086477094, "throughput/batch_count_per_second_total_recent": 0.09835597045612086, "throughput/batch_count_per_second_total_cum": 0.10036039611793357, "throughput/batch_count_per_second_update_recent": 0.1794135789421246, "throughput/batch_count_per_second_update_cum": 0.1789585832904677, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 545259520, "throughput/token_count": 545259520, "throughput/batch_count": 260, "throughput/flop_count": 0, "throughput/total_time": 2546.883552026935, "throughput/update_time": 1452.7241989910835, "throughput/token_count_per_second_total_recent": 226652.27236432474, "throughput/token_count_per_second_total_cum": 214088.9086059925, "throughput/token_count_per_second_update_recent": 376248.92370497744, "throughput/token_count_per_second_update_cum": 375335.88301116106, "throughput/batch_count_per_second_total_recent": 0.1080762254544853, "throughput/batch_count_per_second_total_cum": 0.10208554678249002, "throughput/batch_count_per_second_update_recent": 0.17940946755646583, "throughput/batch_count_per_second_update_cum": 0.17897409582670262, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 566231040, "throughput/token_count": 566231040, "throughput/batch_count": 270, "throughput/flop_count": 0, "throughput/total_time": 2693.2529326969525, "throughput/update_time": 1508.4631838970818, "throughput/token_count_per_second_total_recent": 206268.19906819658, "throughput/token_count_per_second_total_cum": 210240.57307272332, "throughput/token_count_per_second_update_recent": 376249.8728227274, "throughput/token_count_per_second_update_cum": 375369.47937778267, "throughput/batch_count_per_second_total_recent": 0.09835634187135534, "throughput/batch_count_per_second_total_cum": 0.10025051740299383, "throughput/batch_count_per_second_update_recent": 0.17940992013107654, "throughput/batch_count_per_second_update_cum": 0.17899011582268842, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 587202560, "throughput/token_count": 587202560, "throughput/batch_count": 280, "throughput/flop_count": 0, "throughput/total_time": 2749.1005825489992, "throughput/update_time": 1564.2029820160242, "throughput/token_count_per_second_total_recent": 226664.19972842748, "throughput/token_count_per_second_total_cum": 213598.06320929105, "throughput/token_count_per_second_update_recent": 376248.671294421, "throughput/token_count_per_second_update_cum": 375400.4862228197, "throughput/batch_count_per_second_total_recent": 0.1080819128648889, "throughput/batch_count_per_second_total_cum": 0.10185149345840981, "throughput/batch_count_per_second_update_recent": 0.1794093471977334, "throughput/batch_count_per_second_update_cum": 0.17900490103856073, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 608174080, "throughput/token_count": 608174080, "throughput/batch_count": 290, "throughput/flop_count": 0, "throughput/total_time": 2895.4641819539247, "throughput/update_time": 1619.943224380957, "throughput/token_count_per_second_total_recent": 206279.91430439826, "throughput/token_count_per_second_total_cum": 210043.72417744444, "throughput/token_count_per_second_update_recent": 376247.36325255316, "throughput/token_count_per_second_update_cum": 375429.2563138482, "throughput/batch_count_per_second_total_recent": 0.09836192813129342, "throughput/batch_count_per_second_total_cum": 0.10015665253517363, "throughput/batch_count_per_second_update_recent": 0.17940872347476633, "throughput/batch_count_per_second_update_cum": 0.179018619687008, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 629145600, "throughput/token_count": 629145600, "throughput/batch_count": 300, "throughput/flop_count": 0, "throughput/total_time": 2951.315687590977, "throughput/update_time": 1675.687991371029, "throughput/token_count_per_second_total_recent": 226778.2005006984, "throughput/token_count_per_second_total_cum": 213174.61993147284, "throughput/token_count_per_second_update_recent": 376242.3645412903, "throughput/token_count_per_second_update_cum": 375455.09858624707, "throughput/batch_count_per_second_total_recent": 0.10813627266917153, "throughput/batch_count_per_second_total_cum": 0.10164957996915476, "throughput/batch_count_per_second_update_recent": 0.17940633990349308, "throughput/batch_count_per_second_update_cum": 0.1790309422427402, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 650117120, "throughput/token_count": 650117120, "throughput/batch_count": 310, "throughput/flop_count": 0, "throughput/total_time": 3098.1222598329186, "throughput/update_time": 1731.4270116091939, "throughput/token_count_per_second_total_recent": 206285.42311403464, "throughput/token_count_per_second_total_cum": 209842.30623457086, "throughput/token_count_per_second_update_recent": 376242.0306674801, "throughput/token_count_per_second_update_cum": 375480.52308354544, "throughput/batch_count_per_second_total_recent": 0.09836455493642551, "throughput/batch_count_per_second_total_cum": 0.10006060897568267, "throughput/batch_count_per_second_update_recent": 0.1794061807000542, "throughput/batch_count_per_second_update_cum": 0.1790430655877807, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 671088640, "throughput/token_count": 671088640, "throughput/batch_count": 320, "throughput/flop_count": 0, "throughput/total_time": 3153.9731954459567, "throughput/update_time": 1787.1771466861246, "throughput/token_count_per_second_total_recent": 226677.4141234897, "throughput/token_count_per_second_total_cum": 212775.63200885456, "throughput/token_count_per_second_update_recent": 376234.5102099859, "throughput/token_count_per_second_update_cum": 375502.02633486385, "throughput/batch_count_per_second_total_recent": 0.10808821397947774, "throughput/batch_count_per_second_total_cum": 0.10145932770197609, "throughput/batch_count_per_second_update_recent": 0.1794025946664743, "throughput/batch_count_per_second_update_cum": 0.17905331913703149, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 692060160, "throughput/token_count": 692060160, "throughput/batch_count": 330, "throughput/flop_count": 0, "throughput/total_time": 3300.597545653931, "throughput/update_time": 1842.9211868423736, "throughput/token_count_per_second_total_recent": 206236.43392945835, "throughput/token_count_per_second_total_cum": 209677.23281236508, "throughput/token_count_per_second_update_recent": 376227.0404462016, "throughput/token_count_per_second_update_cum": 375523.4705319997, "throughput/batch_count_per_second_total_recent": 0.09834119507286947, "throughput/batch_count_per_second_total_cum": 0.0999818958341432, "throughput/batch_count_per_second_update_recent": 0.17939903280553893, "throughput/batch_count_per_second_update_cum": 0.1790635445270537, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 713031680, "throughput/token_count": 713031680, "throughput/batch_count": 340, "throughput/flop_count": 0, "throughput/total_time": 3356.450910591986, "throughput/update_time": 1898.6706604874926, "throughput/token_count_per_second_total_recent": 226734.7838172686, "throughput/token_count_per_second_total_cum": 212436.20091385182, "throughput/token_count_per_second_update_recent": 376213.8280861725, "throughput/token_count_per_second_update_cum": 375542.58083754545, "throughput/batch_count_per_second_total_recent": 0.10811556998122625, "throughput/batch_count_per_second_total_cum": 0.10129747434322921, "throughput/batch_count_per_second_update_recent": 0.17939273266132952, "throughput/batch_count_per_second_update_cum": 0.17907265703084252, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 734003200, "throughput/token_count": 734003200, "throughput/batch_count": 350, "throughput/flop_count": 0, "throughput/total_time": 3502.8775023539783, "throughput/update_time": 1954.4126074366504, "throughput/token_count_per_second_total_recent": 206325.84718120762, "throughput/token_count_per_second_total_cum": 209542.92563948937, "throughput/token_count_per_second_update_recent": 376209.6145092638, "throughput/token_count_per_second_update_cum": 375562.04723970586, "throughput/batch_count_per_second_total_recent": 0.09838383063373929, "throughput/batch_count_per_second_total_cum": 0.09991785318350285, "throughput/batch_count_per_second_update_recent": 0.17939072347129048, "throughput/batch_count_per_second_update_cum": 0.17908193933472913, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 754974720, "throughput/token_count": 754974720, "throughput/batch_count": 360, "throughput/flop_count": 0, "throughput/total_time": 3558.7265775619308, "throughput/update_time": 2010.1547237539198, "throughput/token_count_per_second_total_recent": 226723.79593391906, "throughput/token_count_per_second_total_cum": 212147.43632179525, "throughput/token_count_per_second_update_recent": 376217.9635561935, "throughput/token_count_per_second_update_cum": 375580.40238320624, "throughput/batch_count_per_second_total_recent": 0.10811033055015519, "throughput/batch_count_per_second_total_cum": 0.10115978065576327, "throughput/batch_count_per_second_update_recent": 0.17939470460710216, "throughput/batch_count_per_second_update_cum": 0.17909069174919426, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 775946240, "throughput/token_count": 775946240, "throughput/batch_count": 370, "throughput/flop_count": 0, "throughput/total_time": 3705.2644805479795, "throughput/update_time": 2065.915084464941, "throughput/token_count_per_second_total_recent": 206294.49293474862, "throughput/token_count_per_second_total_cum": 209417.2343360611, "throughput/token_count_per_second_update_recent": 376203.33859298454, "throughput/token_count_per_second_update_cum": 375594.45005018933, "throughput/batch_count_per_second_total_recent": 0.09836887976396018, "throughput/batch_count_per_second_total_cum": 0.09985791889956527, "throughput/batch_count_per_second_update_recent": 0.17938773088120677, "throughput/batch_count_per_second_update_cum": 0.17909739019879786, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 796917760, "throughput/token_count": 796917760, "throughput/batch_count": 380, "throughput/flop_count": 0, "throughput/total_time": 3761.109741097898, "throughput/update_time": 2121.6608164007775, "throughput/token_count_per_second_total_recent": 226682.6096218226, "throughput/token_count_per_second_total_cum": 211883.67658939227, "throughput/token_count_per_second_update_recent": 376197.91620934306, "throughput/token_count_per_second_update_cum": 375610.3491376653, "throughput/batch_count_per_second_total_recent": 0.10809069138613825, "throughput/batch_count_per_second_total_cum": 0.10103401021451582, "throughput/batch_count_per_second_update_recent": 0.17938514528720048, "throughput/batch_count_per_second_update_cum": 0.17910497147448792, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 817889280, "throughput/token_count": 817889280, "throughput/batch_count": 390, "throughput/flop_count": 0, "throughput/total_time": 3907.6489149519475, "throughput/update_time": 2177.411766545032, "throughput/token_count_per_second_total_recent": 206257.8924563716, "throughput/token_count_per_second_total_cum": 209304.699014921, "throughput/token_count_per_second_update_recent": 376191.68774762284, "throughput/token_count_per_second_update_cum": 375624.533938186, "throughput/batch_count_per_second_total_recent": 0.098351427295862, "throughput/batch_count_per_second_total_cum": 0.09980425787683535, "throughput/batch_count_per_second_update_recent": 0.17938217532521383, "throughput/batch_count_per_second_update_cum": 0.1791117353144579, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 838860800, "throughput/token_count": 838860800, "throughput/batch_count": 400, "throughput/flop_count": 0, "throughput/total_time": 3963.4901722619543, "throughput/update_time": 2233.1504810712067, "throughput/token_count_per_second_total_recent": 226752.24296382652, "throughput/token_count_per_second_total_cum": 211646.99886748154, "throughput/token_count_per_second_update_recent": 376196.6492027777, "throughput/token_count_per_second_update_cum": 375640.06864311796, "throughput/batch_count_per_second_total_recent": 0.10812389515105558, "throughput/batch_count_per_second_total_cum": 0.10092115348218991, "throughput/batch_count_per_second_update_recent": 0.17938454113139043, "throughput/batch_count_per_second_update_cum": 0.17911914283901118, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 859832320, "throughput/token_count": 859832320, "throughput/batch_count": 410, "throughput/flop_count": 0, "throughput/total_time": 4110.402928694966, "throughput/update_time": 2288.885604837211, "throughput/token_count_per_second_total_recent": 206238.01766090124, "throughput/token_count_per_second_total_cum": 209184.43639611572, "throughput/token_count_per_second_update_recent": 376199.3162705958, "throughput/token_count_per_second_update_cum": 375655.43607023236, "throughput/batch_count_per_second_total_recent": 0.09834195025487005, "throughput/batch_count_per_second_total_cum": 0.09974691219144617, "throughput/batch_count_per_second_update_recent": 0.17938581288842956, "throughput/batch_count_per_second_update_cum": 0.1791264705992853, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 880803840, "throughput/token_count": 880803840, "throughput/batch_count": 420, "throughput/flop_count": 0, "throughput/total_time": 4166.242571576964, "throughput/update_time": 2344.6231456701644, "throughput/token_count_per_second_total_recent": 226680.20409541504, "throughput/token_count_per_second_total_cum": 211414.43995821083, "throughput/token_count_per_second_update_recent": 376207.0136791428, "throughput/token_count_per_second_update_cum": 375669.68560665625, "throughput/batch_count_per_second_total_recent": 0.10808954434176209, "throughput/batch_count_per_second_total_cum": 0.10081026075277845, "throughput/batch_count_per_second_update_recent": 0.17938948329884663, "throughput/batch_count_per_second_update_cum": 0.17913326530773938, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 901775360, "throughput/token_count": 901775360, "throughput/batch_count": 430, "throughput/flop_count": 0, "throughput/total_time": 4312.675729307928, "throughput/update_time": 2400.370607822202, "throughput/token_count_per_second_total_recent": 206280.43980718192, "throughput/token_count_per_second_total_cum": 209098.809324278, "throughput/token_count_per_second_update_recent": 376205.3386927121, "throughput/token_count_per_second_update_cum": 375681.7205898713, "throughput/batch_count_per_second_total_recent": 0.09836217871054741, "throughput/batch_count_per_second_total_cum": 0.09970608202184582, "throughput/batch_count_per_second_update_recent": 0.17938868460307697, "throughput/batch_count_per_second_update_cum": 0.17913900403493466, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 922746880, "throughput/token_count": 922746880, "throughput/batch_count": 440, "throughput/flop_count": 0, "throughput/total_time": 4368.526084997924, "throughput/update_time": 2456.1160258102464, "throughput/token_count_per_second_total_recent": 226681.93385464879, "throughput/token_count_per_second_total_cum": 211226.1348670506, "throughput/token_count_per_second_update_recent": 376208.93704375444, "throughput/token_count_per_second_update_cum": 375693.52192781516, "throughput/batch_count_per_second_total_recent": 0.10809036915523948, "throughput/batch_count_per_second_total_cum": 0.10072046988823442, "throughput/batch_count_per_second_update_recent": 0.17939040043056223, "throughput/batch_count_per_second_update_cum": 0.1791446313513828, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 943718400, "throughput/token_count": 943718400, "throughput/batch_count": 450, "throughput/flop_count": 0, "throughput/total_time": 4515.121015235898, "throughput/update_time": 2511.850276188343, "throughput/token_count_per_second_total_recent": 206247.1101380335, "throughput/token_count_per_second_total_cum": 209012.86960316263, "throughput/token_count_per_second_update_recent": 376213.2760660889, "throughput/token_count_per_second_update_cum": 375706.4698267224, "throughput/batch_count_per_second_total_recent": 0.09834628588582682, "throughput/batch_count_per_second_total_cum": 0.0996651027694524, "throughput/batch_count_per_second_update_recent": 0.17939246943764156, "throughput/batch_count_per_second_update_cum": 0.17915080539070244, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 964689920, "throughput/token_count": 964689920, "throughput/batch_count": 460, "throughput/flop_count": 0, "throughput/total_time": 4570.982028090977, "throughput/update_time": 2567.596293253242, "throughput/token_count_per_second_total_recent": 226662.3018127446, "throughput/token_count_per_second_total_cum": 211046.53531155814, "throughput/token_count_per_second_update_recent": 376211.86857260595, "throughput/token_count_per_second_update_cum": 375717.13377795124, "throughput/batch_count_per_second_total_recent": 0.10808100786816817, "throughput/batch_count_per_second_total_cum": 0.10063483014657885, "throughput/batch_count_per_second_update_recent": 0.17939179829244897, "throughput/batch_count_per_second_update_cum": 0.17915589035890161, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 985661440, "throughput/token_count": 985661440, "throughput/batch_count": 470, "throughput/flop_count": 0, "throughput/total_time": 4717.434516418958, "throughput/update_time": 2623.332695179037, "throughput/token_count_per_second_total_recent": 206261.701015483, "throughput/token_count_per_second_total_cum": 208940.14247986284, "throughput/token_count_per_second_update_recent": 376227.406755904, "throughput/token_count_per_second_update_cum": 375728.7216415113, "throughput/batch_count_per_second_total_recent": 0.09835324335836554, "throughput/batch_count_per_second_total_cum": 0.09963042377465384, "throughput/batch_count_per_second_update_recent": 0.17939920747561647, "throughput/batch_count_per_second_update_cum": 0.17916141588283124, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1006632960, "throughput/token_count": 1006632960, "throughput/batch_count": 480, "throughput/flop_count": 0, "throughput/total_time": 4773.292612125981, "throughput/update_time": 2679.07335356588, "throughput/token_count_per_second_total_recent": 226681.69456754628, "throughput/token_count_per_second_total_cum": 210888.59238228324, "throughput/token_count_per_second_update_recent": 376231.59648975555, "throughput/token_count_per_second_update_cum": 375739.23037984717, "throughput/batch_count_per_second_total_recent": 0.10809025505425753, "throughput/batch_count_per_second_total_cum": 0.10055951708902514, "throughput/batch_count_per_second_update_recent": 0.17940120529639986, "throughput/batch_count_per_second_update_cum": 0.17916642683975562, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1027604480, "throughput/token_count": 1027604480, "throughput/batch_count": 490, "throughput/flop_count": 0, "throughput/total_time": 4919.765502344933, "throughput/update_time": 2734.8176154628163, "throughput/token_count_per_second_total_recent": 206272.26239718296, "throughput/token_count_per_second_total_cum": 208872.65450156265, "throughput/token_count_per_second_update_recent": 376234.6816012784, "throughput/token_count_per_second_update_cum": 375748.8156394288, "throughput/batch_count_per_second_total_recent": 0.09835827941760204, "throughput/batch_count_per_second_total_cum": 0.0995982429988683, "throughput/batch_count_per_second_update_recent": 0.17940267639221116, "throughput/batch_count_per_second_update_cum": 0.17917099744769516, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1048576000, "throughput/token_count": 1048576000, "throughput/batch_count": 500, "throughput/flop_count": 0, "throughput/total_time": 4975.63103498891, "throughput/update_time": 2790.5577287059277, "throughput/token_count_per_second_total_recent": 226784.16790750166, "throughput/token_count_per_second_total_cum": 210742.31441727816, "throughput/token_count_per_second_update_recent": 376233.9936863183, "throughput/token_count_per_second_update_cum": 375758.5765789761, "throughput/batch_count_per_second_total_recent": 0.10813911815047343, "throughput/batch_count_per_second_total_cum": 0.10048976631988438, "throughput/batch_count_per_second_update_recent": 0.17940234836879648, "throughput/batch_count_per_second_update_cum": 0.1791756518263703, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1069547520, "throughput/token_count": 1069547520, "throughput/batch_count": 510, "throughput/flop_count": 0, "throughput/total_time": 5122.565231077955, "throughput/update_time": 2846.292529964936, "throughput/token_count_per_second_total_recent": 206262.37614222307, "throughput/token_count_per_second_total_cum": 208791.39098340622, "throughput/token_count_per_second_update_recent": 376234.1221336591, "throughput/token_count_per_second_update_cum": 375768.65650319366, "throughput/batch_count_per_second_total_recent": 0.0983535652838817, "throughput/batch_count_per_second_total_cum": 0.099559493533805, "throughput/batch_count_per_second_update_recent": 0.17940240961726145, "throughput/batch_count_per_second_update_cum": 0.1791804583087891, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1090519040, "throughput/token_count": 1090519040, "throughput/batch_count": 520, "throughput/flop_count": 0, "throughput/total_time": 5178.418338532909, "throughput/update_time": 2902.025444978848, "throughput/token_count_per_second_total_recent": 226659.0509930993, "throughput/token_count_per_second_total_cum": 210589.21251791983, "throughput/token_count_per_second_update_recent": 376236.9927193287, "throughput/token_count_per_second_update_cum": 375778.59349470603, "throughput/batch_count_per_second_total_recent": 0.10807945775656667, "throughput/batch_count_per_second_total_cum": 0.10041676164527885, "throughput/batch_count_per_second_update_recent": 0.17940377841917451, "throughput/batch_count_per_second_update_cum": 0.17918519663558294, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1111490560, "throughput/token_count": 1111490560, "throughput/batch_count": 530, "throughput/flop_count": 0, "throughput/total_time": 5324.87164538994, "throughput/update_time": 2957.746671211091, "throughput/token_count_per_second_total_recent": 206256.69573082394, "throughput/token_count_per_second_total_cum": 208735.65299217755, "throughput/token_count_per_second_update_recent": 376255.9232055369, "throughput/token_count_per_second_update_cum": 375789.6410867685, "throughput/batch_count_per_second_total_recent": 0.09835085665265271, "throughput/batch_count_per_second_total_cum": 0.09953291558846357, "throughput/batch_count_per_second_update_recent": 0.17941280517842145, "throughput/batch_count_per_second_update_cum": 0.17919046453798698, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1132462080, "throughput/token_count": 1132462080, "throughput/batch_count": 540, "throughput/flop_count": 0, "throughput/total_time": 5380.720167659922, "throughput/update_time": 3013.4729040842503, "throughput/token_count_per_second_total_recent": 226693.96913719093, "throughput/token_count_per_second_total_cum": 210466.63731121115, "throughput/token_count_per_second_update_recent": 376267.7791772063, "throughput/token_count_per_second_update_cum": 375799.6557610125, "throughput/batch_count_per_second_total_recent": 0.10809610802516505, "throughput/batch_count_per_second_total_cum": 0.10035831323204572, "throughput/batch_count_per_second_update_recent": 0.17941845854626, "throughput/batch_count_per_second_update_cum": 0.17919523990679384, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1153433600, "throughput/token_count": 1153433600, "throughput/batch_count": 550, "throughput/flop_count": 0, "throughput/total_time": 5527.108590850956, "throughput/update_time": 3069.2099333773367, "throughput/token_count_per_second_total_recent": 206300.11262917487, "throughput/token_count_per_second_total_cum": 208686.61815497585, "throughput/token_count_per_second_update_recent": 376267.40470161964, "throughput/token_count_per_second_update_cum": 375807.9848030369, "throughput/batch_count_per_second_total_recent": 0.09837155944308036, "throughput/batch_count_per_second_total_cum": 0.09950953395603936, "throughput/batch_count_per_second_update_recent": 0.17941827998238546, "throughput/batch_count_per_second_update_cum": 0.1791992115035233, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1174405120, "throughput/token_count": 1174405120, "throughput/batch_count": 560, "throughput/flop_count": 0, "throughput/total_time": 5582.958667394938, "throughput/update_time": 3124.937779762433, "throughput/token_count_per_second_total_recent": 226710.9507140944, "throughput/token_count_per_second_total_cum": 210355.3312795684, "throughput/token_count_per_second_update_recent": 376279.0774395511, "throughput/token_count_per_second_update_cum": 375817.1210977781, "throughput/batch_count_per_second_total_recent": 0.1081042054720375, "throughput/batch_count_per_second_total_cum": 0.10030523838022633, "throughput/batch_count_per_second_update_recent": 0.17942384597756916, "throughput/batch_count_per_second_update_cum": 0.1792035680283442, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1195376640, "throughput/token_count": 1195376640, "throughput/batch_count": 570, "throughput/flop_count": 0, "throughput/total_time": 5729.297242018976, "throughput/update_time": 3180.655704040313, "throughput/token_count_per_second_total_recent": 206324.77602170964, "throughput/token_count_per_second_total_cum": 208642.80373394542, "throughput/token_count_per_second_update_recent": 376291.63057076675, "throughput/token_count_per_second_update_cum": 375827.1096370289, "throughput/batch_count_per_second_total_recent": 0.09838331986508829, "throughput/batch_count_per_second_total_cum": 0.09948864161202689, "throughput/batch_count_per_second_update_recent": 0.17942983177698457, "throughput/batch_count_per_second_update_cum": 0.17920833093501515, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1216348160, "throughput/token_count": 1216348160, "throughput/batch_count": 580, "throughput/flop_count": 0, "throughput/total_time": 5785.135669226991, "throughput/update_time": 3236.387389887357, "throughput/token_count_per_second_total_recent": 226752.42873429647, "throughput/token_count_per_second_total_cum": 210254.04235032023, "throughput/token_count_per_second_update_recent": 376304.23121849954, "throughput/token_count_per_second_update_cum": 375835.15613757697, "throughput/batch_count_per_second_total_recent": 0.10812398373331855, "throughput/batch_count_per_second_total_cum": 0.10025694005504619, "throughput/batch_count_per_second_update_recent": 0.179435840234041, "throughput/batch_count_per_second_update_cum": 0.17921216780546997, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1237319680, "throughput/token_count": 1237319680, "throughput/batch_count": 590, "throughput/flop_count": 0, "throughput/total_time": 5931.5355288069695, "throughput/update_time": 3292.107882911223, "throughput/token_count_per_second_total_recent": 206343.675752233, "throughput/token_count_per_second_total_cum": 208600.23074815274, "throughput/token_count_per_second_update_recent": 376313.9763711322, "throughput/token_count_per_second_update_cum": 375844.2080293656, "throughput/batch_count_per_second_total_recent": 0.09839233195888185, "throughput/batch_count_per_second_total_cum": 0.09946834123046529, "throughput/batch_count_per_second_update_recent": 0.1794404870849286, "throughput/batch_count_per_second_update_cum": 0.17921648408382682, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1258291200, "throughput/token_count": 1258291200, "throughput/batch_count": 600, "throughput/flop_count": 0, "throughput/total_time": 5987.366702231928, "throughput/update_time": 3347.8448207870824, "throughput/token_count_per_second_total_recent": 226890.20023063064, "throughput/token_count_per_second_total_cum": 210157.6974617144, "throughput/token_count_per_second_update_recent": 376317.2482639011, "throughput/token_count_per_second_update_cum": 375851.112389425, "throughput/batch_count_per_second_total_recent": 0.10818967830211193, "throughput/batch_count_per_second_total_cum": 0.10021099923215598, "throughput/batch_count_per_second_update_recent": 0.17944204724497848, "throughput/batch_count_per_second_update_cum": 0.17921977633925676, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1279262720, "throughput/token_count": 1279262720, "throughput/batch_count": 610, "throughput/flop_count": 0, "throughput/total_time": 6134.471666511963, "throughput/update_time": 3403.5747455290984, "throughput/token_count_per_second_total_recent": 206316.28786559863, "throughput/token_count_per_second_total_cum": 208536.73951800706, "throughput/token_count_per_second_update_recent": 376319.65958069457, "throughput/token_count_per_second_update_cum": 375858.56508085405, "throughput/batch_count_per_second_total_recent": 0.09837927239684993, "throughput/batch_count_per_second_total_cum": 0.0994380662527118, "throughput/batch_count_per_second_update_recent": 0.1794431970504258, "throughput/batch_count_per_second_update_cum": 0.17922333005945876, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1300234240, "throughput/token_count": 1300234240, "throughput/batch_count": 620, "throughput/flop_count": 0, "throughput/total_time": 6190.319736288977, "throughput/update_time": 3459.312256404897, "throughput/token_count_per_second_total_recent": 226735.70689425577, "throughput/token_count_per_second_total_cum": 210043.14726713535, "throughput/token_count_per_second_update_recent": 376316.17279504624, "throughput/token_count_per_second_update_cum": 375864.95338564005, "throughput/batch_count_per_second_total_recent": 0.10811601013863362, "throughput/batch_count_per_second_total_cum": 0.1001563774429013, "throughput/batch_count_per_second_update_recent": 0.179441534421466, "throughput/batch_count_per_second_update_cum": 0.17922637624055865, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1321205760, "throughput/token_count": 1321205760, "throughput/batch_count": 630, "throughput/flop_count": 0, "throughput/total_time": 6336.646818120964, "throughput/update_time": 3515.0587793228915, "throughput/token_count_per_second_total_recent": 206342.43910214253, "throughput/token_count_per_second_total_cum": 208502.35115230604, "throughput/token_count_per_second_update_recent": 376298.4891565321, "throughput/token_count_per_second_update_cum": 375870.17542122153, "throughput/batch_count_per_second_total_recent": 0.09839174227816702, "throughput/batch_count_per_second_total_cum": 0.09942166860213568, "throughput/batch_count_per_second_update_recent": 0.17943310220553021, "throughput/batch_count_per_second_update_cum": 0.17922886630116536, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1342177280, "throughput/token_count": 1342177280, "throughput/batch_count": 640, "throughput/flop_count": 0, "throughput/total_time": 6392.506873650011, "throughput/update_time": 3570.8129316339036, "throughput/token_count_per_second_total_recent": 226742.79528067404, "throughput/token_count_per_second_total_cum": 209961.02257354948, "throughput/token_count_per_second_update_recent": 376278.4049630039, "throughput/token_count_per_second_update_cum": 375874.43131215987, "throughput/batch_count_per_second_total_recent": 0.10811939014466955, "throughput/batch_count_per_second_total_cum": 0.10011721733739351, "throughput/batch_count_per_second_update_recent": 0.17942352531576342, "throughput/batch_count_per_second_update_cum": 0.17923089566810602, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1363148800, "throughput/token_count": 1363148800, "throughput/batch_count": 650, "throughput/flop_count": 0, "throughput/total_time": 6538.855644405005, "throughput/update_time": 3626.554348448757, "throughput/token_count_per_second_total_recent": 206349.26093980935, "throughput/token_count_per_second_total_cum": 208469.01570099397, "throughput/token_count_per_second_update_recent": 376277.55451698625, "throughput/token_count_per_second_update_cum": 375879.87633029156, "throughput/batch_count_per_second_total_recent": 0.09839499518385379, "throughput/batch_count_per_second_total_cum": 0.09940577302026461, "throughput/batch_count_per_second_update_recent": 0.17942311979150116, "throughput/batch_count_per_second_update_cum": 0.1792334920550783, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1384120320, "throughput/token_count": 1384120320, "throughput/batch_count": 660, "throughput/flop_count": 0, "throughput/total_time": 6594.719191052951, "throughput/update_time": 3682.304503756459, "throughput/token_count_per_second_total_recent": 226741.37856242442, "throughput/token_count_per_second_total_cum": 209883.13223068463, "throughput/token_count_per_second_update_recent": 376260.8199351674, "throughput/token_count_per_second_update_cum": 375884.2644838324, "throughput/batch_count_per_second_total_recent": 0.10811871460076543, "throughput/batch_count_per_second_total_cum": 0.10008007632765037, "throughput/batch_count_per_second_update_recent": 0.17941514012106294, "throughput/batch_count_per_second_update_cum": 0.17923558448974247, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1405091840, "throughput/token_count": 1405091840, "throughput/batch_count": 670, "throughput/flop_count": 0, "throughput/total_time": 6741.161298968946, "throughput/update_time": 3738.0453335597413, "throughput/token_count_per_second_total_recent": 206324.36168415638, "throughput/token_count_per_second_total_cum": 208434.6862038307, "throughput/token_count_per_second_update_recent": 376245.76847444964, "throughput/token_count_per_second_update_cum": 375889.4594951129, "throughput/batch_count_per_second_total_recent": 0.09838312229354686, "throughput/batch_count_per_second_total_cum": 0.09938940344039474, "throughput/batch_count_per_second_update_recent": 0.17940796302530748, "throughput/batch_count_per_second_update_cum": 0.1792380616641583, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1426063360, "throughput/token_count": 1426063360, "throughput/batch_count": 680, "throughput/flop_count": 0, "throughput/total_time": 6797.037191045936, "throughput/update_time": 3793.7997314956738, "throughput/token_count_per_second_total_recent": 226722.26725367134, "throughput/token_count_per_second_total_cum": 209806.61425225416, "throughput/token_count_per_second_update_recent": 376229.6975581697, "throughput/token_count_per_second_update_cum": 375893.1575014336, "throughput/batch_count_per_second_total_recent": 0.10810960161861007, "throughput/batch_count_per_second_total_cum": 0.10004358971226414, "throughput/batch_count_per_second_update_recent": 0.17940029981525885, "throughput/batch_count_per_second_update_cum": 0.1792398250109833, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1447034880, "throughput/token_count": 1447034880, "throughput/batch_count": 690, "throughput/flop_count": 0, "throughput/total_time": 6943.39617751597, "throughput/update_time": 3849.5505883715814, "throughput/token_count_per_second_total_recent": 206324.07110078738, "throughput/token_count_per_second_total_cum": 208404.48146769626, "throughput/token_count_per_second_update_recent": 376209.69594974106, "throughput/token_count_per_second_update_cum": 375897.094162391, "throughput/batch_count_per_second_total_recent": 0.09838298373259896, "throughput/batch_count_per_second_total_cum": 0.09937500069985211, "throughput/batch_count_per_second_update_recent": 0.17939076230513623, "throughput/batch_count_per_second_update_cum": 0.179241702157207, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1468006400, "throughput/token_count": 1468006400, "throughput/batch_count": 700, "throughput/flop_count": 0, "throughput/total_time": 6999.246100232936, "throughput/update_time": 3905.29758988251, "throughput/token_count_per_second_total_recent": 226900.2310344972, "throughput/token_count_per_second_total_cum": 209737.78875286932, "throughput/token_count_per_second_update_recent": 376203.57581390534, "throughput/token_count_per_second_update_cum": 375901.28952097724, "throughput/batch_count_per_second_total_recent": 0.10819446136212216, "throughput/batch_count_per_second_total_cum": 0.10001077115672556, "throughput/batch_count_per_second_update_recent": 0.1793878439969565, "throughput/batch_count_per_second_update_cum": 0.17924370266007292, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1488977920, "throughput/token_count": 1488977920, "throughput/batch_count": 710, "throughput/flop_count": 0, "throughput/total_time": 7145.973933565896, "throughput/update_time": 3961.0490365702426, "throughput/token_count_per_second_total_recent": 206397.82743840673, "throughput/token_count_per_second_total_cum": 208365.9881553736, "throughput/token_count_per_second_update_recent": 376188.48331037763, "throughput/token_count_per_second_update_cum": 375904.94494086417, "throughput/batch_count_per_second_total_recent": 0.09841815349502885, "throughput/batch_count_per_second_total_cum": 0.09935664565819435, "throughput/batch_count_per_second_update_recent": 0.1793806473304642, "throughput/batch_count_per_second_update_cum": 0.17924544570010384, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1509949440, "throughput/token_count": 1509949440, "throughput/batch_count": 720, "throughput/flop_count": 0, "throughput/total_time": 7201.8562647239305, "throughput/update_time": 4016.812845747103, "throughput/token_count_per_second_total_recent": 226785.687487652, "throughput/token_count_per_second_total_cum": 209661.1463069627, "throughput/token_count_per_second_update_recent": 376171.06123817933, "throughput/token_count_per_second_update_cum": 375907.3419611011, "throughput/batch_count_per_second_total_recent": 0.10813984274275398, "throughput/batch_count_per_second_total_cum": 0.09997422519062171, "throughput/batch_count_per_second_update_recent": 0.17937233983906714, "throughput/batch_count_per_second_update_cum": 0.17924658868842178, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1530920960, "throughput/token_count": 1530920960, "throughput/batch_count": 730, "throughput/flop_count": 0, "throughput/total_time": 7348.130815029959, "throughput/update_time": 4072.5528116449714, "throughput/token_count_per_second_total_recent": 206403.18955312343, "throughput/token_count_per_second_total_cum": 208341.54950924867, "throughput/token_count_per_second_update_recent": 376175.74431565945, "throughput/token_count_per_second_update_cum": 375911.8741499231, "throughput/batch_count_per_second_total_recent": 0.09842071035057232, "throughput/batch_count_per_second_total_cum": 0.0993449924036258, "throughput/batch_count_per_second_update_recent": 0.1793745729044244, "throughput/batch_count_per_second_update_cum": 0.1792487498044601, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1551892480, "throughput/token_count": 1551892480, "throughput/batch_count": 740, "throughput/flop_count": 0, "throughput/total_time": 7404.001054160995, "throughput/update_time": 4128.298564241966, "throughput/token_count_per_second_total_recent": 226803.23910230718, "throughput/token_count_per_second_total_cum": 209601.8718322369, "throughput/token_count_per_second_update_recent": 376181.84410051623, "throughput/token_count_per_second_update_cum": 375915.75702445756, "throughput/batch_count_per_second_total_recent": 0.10814821200480804, "throughput/batch_count_per_second_total_cum": 0.09994596091853948, "throughput/batch_count_per_second_update_recent": 0.17937748150850116, "throughput/batch_count_per_second_update_cum": 0.17925060130331877, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1572864000, "throughput/token_count": 1572864000, "throughput/batch_count": 750, "throughput/flop_count": 0, "throughput/total_time": 7550.323625019984, "throughput/update_time": 4184.042193662957, "throughput/token_count_per_second_total_recent": 206405.7131060023, "throughput/token_count_per_second_total_cum": 208317.42824743316, "throughput/token_count_per_second_update_recent": 376178.7727247623, "throughput/token_count_per_second_update_cum": 375919.7271916185, "throughput/batch_count_per_second_total_recent": 0.09842191367435565, "throughput/batch_count_per_second_total_cum": 0.09933349048968942, "throughput/batch_count_per_second_update_recent": 0.1793760169624149, "throughput/batch_count_per_second_update_cum": 0.1792524944265454, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1593835520, "throughput/token_count": 1593835520, "throughput/batch_count": 760, "throughput/flop_count": 0, "throughput/total_time": 7606.181935134926, "throughput/update_time": 4239.792019490269, "throughput/token_count_per_second_total_recent": 226835.75905519634, "throughput/token_count_per_second_total_cum": 209544.75367433173, "throughput/token_count_per_second_update_recent": 376179.12280448695, "throughput/token_count_per_second_update_cum": 375923.0435533533, "throughput/batch_count_per_second_total_recent": 0.1081637187267286, "throughput/batch_count_per_second_total_cum": 0.09991872485844218, "throughput/batch_count_per_second_update_recent": 0.17937618389343593, "throughput/batch_count_per_second_update_cum": 0.17925407579105057, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1614807040, "throughput/token_count": 1614807040, "throughput/batch_count": 770, "throughput/flop_count": 0, "throughput/total_time": 7752.467774236924, "throughput/update_time": 4295.5320715362905, "throughput/token_count_per_second_total_recent": 206438.31809117348, "throughput/token_count_per_second_total_cum": 208295.87262088884, "throughput/token_count_per_second_update_recent": 376181.01603459753, "throughput/token_count_per_second_update_cum": 375927.12919088197, "throughput/batch_count_per_second_total_recent": 0.09843746094282793, "throughput/batch_count_per_second_total_cum": 0.09932321196598475, "throughput/batch_count_per_second_update_recent": 0.17937708665590169, "throughput/batch_count_per_second_update_cum": 0.1792560239748392, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1635778560, "throughput/token_count": 1635778560, "throughput/batch_count": 780, "throughput/flop_count": 0, "throughput/total_time": 7808.319425186957, "throughput/update_time": 4351.274368522107, "throughput/token_count_per_second_total_recent": 226855.89290841733, "throughput/token_count_per_second_total_cum": 209491.7575635469, "throughput/token_count_per_second_update_recent": 376189.0577994389, "throughput/token_count_per_second_update_cum": 375930.9162008981, "throughput/batch_count_per_second_total_recent": 0.10817331929608218, "throughput/batch_count_per_second_total_cum": 0.09989345434357973, "throughput/batch_count_per_second_update_recent": 0.17938092126819558, "throughput/batch_count_per_second_update_cum": 0.17925782976193338, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1656750080, "throughput/token_count": 1656750080, "throughput/batch_count": 790, "throughput/flop_count": 0, "throughput/total_time": 7954.606591358897, "throughput/update_time": 4407.000014613499, "throughput/token_count_per_second_total_recent": 206459.16798252938, "throughput/token_count_per_second_total_cum": 208275.55215612176, "throughput/token_count_per_second_update_recent": 376207.088258499, "throughput/token_count_per_second_update_cum": 375936.02779810736, "throughput/batch_count_per_second_total_recent": 0.09844740294577092, "throughput/batch_count_per_second_total_cum": 0.09931352241331184, "throughput/batch_count_per_second_update_recent": 0.1793895188610549, "throughput/batch_count_per_second_update_cum": 0.17926026716142052, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1677721600, "throughput/token_count": 1677721600, "throughput/batch_count": 800, "throughput/flop_count": 0, "throughput/total_time": 8010.43765124795, "throughput/update_time": 4462.722712960327, "throughput/token_count_per_second_total_recent": 226972.55676837225, "throughput/token_count_per_second_total_cum": 209441.939759512, "throughput/token_count_per_second_update_recent": 376222.2851104471, "throughput/token_count_per_second_update_cum": 375941.2600580534, "throughput/batch_count_per_second_total_recent": 0.10822894895952809, "throughput/batch_count_per_second_total_cum": 0.0998696993634758, "throughput/batch_count_per_second_update_recent": 0.17939676528475146, "throughput/batch_count_per_second_update_cum": 0.17926276209738418, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1698693120, "throughput/token_count": 1698693120, "throughput/batch_count": 810, "throughput/flop_count": 0, "throughput/total_time": 8157.16301853396, "throughput/update_time": 4518.44748669432, "throughput/token_count_per_second_total_recent": 206463.68557568113, "throughput/token_count_per_second_total_cum": 208245.57706403377, "throughput/token_count_per_second_update_recent": 376240.66110605845, "throughput/token_count_per_second_update_cum": 375946.1905891835, "throughput/batch_count_per_second_total_recent": 0.09844955710205132, "throughput/batch_count_per_second_total_cum": 0.09929922917558373, "throughput/batch_count_per_second_update_recent": 0.17940552764227793, "throughput/batch_count_per_second_update_cum": 0.17926511315783666, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1719664640, "throughput/token_count": 1719664640, "throughput/batch_count": 820, "throughput/flop_count": 0, "throughput/total_time": 8212.996373801958, "throughput/update_time": 4574.1805559834465, "throughput/token_count_per_second_total_recent": 226873.61283407378, "throughput/token_count_per_second_total_cum": 209383.34339041397, "throughput/token_count_per_second_update_recent": 376261.1837742265, "throughput/token_count_per_second_update_cum": 375950.31917804846, "throughput/batch_count_per_second_total_recent": 0.10818176881507577, "throughput/batch_count_per_second_total_cum": 0.09984175843735407, "throughput/batch_count_per_second_update_recent": 0.17941531361304594, "throughput/batch_count_per_second_update_cum": 0.17926708182241843, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1740636160, "throughput/token_count": 1740636160, "throughput/batch_count": 830, "throughput/flop_count": 0, "throughput/total_time": 8359.302277855924, "throughput/update_time": 4629.922305631335, "throughput/token_count_per_second_total_recent": 206466.98347613928, "throughput/token_count_per_second_total_cum": 208227.44556217382, "throughput/token_count_per_second_update_recent": 376259.9252157951, "throughput/token_count_per_second_update_cum": 375953.64351641043, "throughput/batch_count_per_second_total_recent": 0.09845112966353382, "throughput/batch_count_per_second_total_cum": 0.09929058340176287, "throughput/batch_count_per_second_update_recent": 0.17941471348562008, "throughput/batch_count_per_second_update_cum": 0.17926866699047586, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1761607680, "throughput/token_count": 1761607680, "throughput/batch_count": 840, "throughput/flop_count": 0, "throughput/total_time": 8415.15657795791, "throughput/update_time": 4685.668182700407, "throughput/token_count_per_second_total_recent": 226881.55480728505, "throughput/token_count_per_second_total_cum": 209337.48097025734, "throughput/token_count_per_second_update_recent": 376259.0087638217, "throughput/token_count_per_second_update_cum": 375956.5575949009, "throughput/batch_count_per_second_total_recent": 0.10818555584301236, "throughput/batch_count_per_second_total_cum": 0.09981988953125827, "throughput/batch_count_per_second_update_recent": 0.17941427648726543, "throughput/batch_count_per_second_update_cum": 0.17927005653138203, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1782579200, "throughput/token_count": 1782579200, "throughput/batch_count": 850, "throughput/flop_count": 0, "throughput/total_time": 8561.475588510977, "throughput/update_time": 4741.406085017603, "throughput/token_count_per_second_total_recent": 206470.79602397332, "throughput/token_count_per_second_total_cum": 208209.34213631606, "throughput/token_count_per_second_update_recent": 376264.0846583086, "throughput/token_count_per_second_update_cum": 375960.0354909027, "throughput/batch_count_per_second_total_recent": 0.09845294762800852, "throughput/batch_count_per_second_total_cum": 0.09928195101562312, "throughput/batch_count_per_second_update_recent": 0.1794166968623679, "throughput/batch_count_per_second_update_cum": 0.17927171492142807, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1803550720, "throughput/token_count": 1803550720, "throughput/batch_count": 860, "throughput/flop_count": 0, "throughput/total_time": 8617.329412178951, "throughput/update_time": 4797.150553135551, "throughput/token_count_per_second_total_recent": 226877.19676979736, "throughput/token_count_per_second_total_cum": 209293.46363979368, "throughput/token_count_per_second_update_recent": 376267.15195650654, "throughput/token_count_per_second_update_cum": 375962.91799121234, "throughput/batch_count_per_second_total_recent": 0.10818347776880138, "throughput/batch_count_per_second_total_cum": 0.09979890043248829, "throughput/batch_count_per_second_update_recent": 0.17941815946412398, "throughput/batch_count_per_second_update_cum": 0.17927308940468423, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1824522240, "throughput/token_count": 1824522240, "throughput/batch_count": 870, "throughput/flop_count": 0, "throughput/total_time": 8763.651462373906, "throughput/update_time": 4852.897546026274, "throughput/token_count_per_second_total_recent": 206464.18959566348, "throughput/token_count_per_second_total_cum": 208192.01309333812, "throughput/token_count_per_second_update_recent": 376262.1906566753, "throughput/token_count_per_second_update_cum": 375965.53866956953, "throughput/batch_count_per_second_total_recent": 0.09844979743750738, "throughput/batch_count_per_second_total_cum": 0.09927368788401514, "throughput/batch_count_per_second_update_recent": 0.17941579373201147, "throughput/batch_count_per_second_update_cum": 0.17927433904150464, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1845493760, "throughput/token_count": 1845493760, "throughput/batch_count": 880, "throughput/flop_count": 0, "throughput/total_time": 8819.515494428924, "throughput/update_time": 4908.649304769933, "throughput/token_count_per_second_total_recent": 226870.06222666817, "throughput/token_count_per_second_total_cum": 209251.1500394499, "throughput/token_count_per_second_update_recent": 376255.0857170101, "throughput/token_count_per_second_update_cum": 375967.73479145457, "throughput/batch_count_per_second_total_recent": 0.10818007575353059, "throughput/batch_count_per_second_total_cum": 0.09977872373554701, "throughput/batch_count_per_second_update_recent": 0.1794124058327723, "throughput/batch_count_per_second_update_cum": 0.17927538623402337, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1866465280, "throughput/token_count": 1866465280, "throughput/batch_count": 890, "throughput/flop_count": 0, "throughput/total_time": 8966.25939859997, "throughput/update_time": 4964.410062348121, "throughput/token_count_per_second_total_recent": 206367.16804007607, "throughput/token_count_per_second_total_cum": 208165.4341041525, "throughput/token_count_per_second_update_recent": 376230.21470404294, "throughput/token_count_per_second_update_cum": 375969.2000779603, "throughput/batch_count_per_second_total_recent": 0.09840353395465663, "throughput/batch_count_per_second_total_cum": 0.09926101403434395, "throughput/batch_count_per_second_update_recent": 0.1794005464096274, "throughput/batch_count_per_second_update_cum": 0.17927608493707672, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1887436800, "throughput/token_count": 1887436800, "throughput/batch_count": 900, "throughput/flop_count": 0, "throughput/total_time": 9022.126939511974, "throughput/update_time": 5020.160163923283, "throughput/token_count_per_second_total_recent": 226851.7948138724, "throughput/token_count_per_second_total_cum": 209200.86944621237, "throughput/token_count_per_second_update_recent": 376211.6317829525, "throughput/token_count_per_second_update_cum": 375971.4308646594, "throughput/batch_count_per_second_total_recent": 0.10817136517232533, "throughput/batch_count_per_second_total_cum": 0.09975474808035487, "throughput/batch_count_per_second_update_recent": 0.17939168538234354, "throughput/batch_count_per_second_update_cum": 0.17927714865906688, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1908408320, "throughput/token_count": 1908408320, "throughput/batch_count": 910, "throughput/flop_count": 0, "throughput/total_time": 9169.132314599934, "throughput/update_time": 5075.909138886142, "throughput/token_count_per_second_total_recent": 206302.74172667554, "throughput/token_count_per_second_total_cum": 208134.01470510542, "throughput/token_count_per_second_update_recent": 376196.74805601686, "throughput/token_count_per_second_update_cum": 375973.69609708607, "throughput/batch_count_per_second_total_recent": 0.09837281309446122, "throughput/batch_count_per_second_total_cum": 0.0992460320973899, "throughput/batch_count_per_second_update_recent": 0.17938458826828807, "throughput/batch_count_per_second_update_cum": 0.17927822880605987, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1929379840, "throughput/token_count": 1929379840, "throughput/batch_count": 920, "throughput/flop_count": 0, "throughput/total_time": 9224.990330599947, "throughput/update_time": 5131.661126127117, "throughput/token_count_per_second_total_recent": 226670.8916768228, "throughput/token_count_per_second_total_cum": 209147.08534708273, "throughput/token_count_per_second_update_recent": 376185.23009396135, "throughput/token_count_per_second_update_cum": 375975.6914143919, "throughput/batch_count_per_second_total_recent": 0.10808510383454456, "throughput/batch_count_per_second_total_cum": 0.09972910182336937, "throughput/batch_count_per_second_update_recent": 0.17937909607599323, "throughput/batch_count_per_second_update_cum": 0.1792791802474937, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1950351360, "throughput/token_count": 1950351360, "throughput/batch_count": 930, "throughput/flop_count": 0, "throughput/total_time": 9371.526648397907, "throughput/update_time": 5187.4173517230665, "throughput/token_count_per_second_total_recent": 206250.52564625544, "throughput/token_count_per_second_total_cum": 208114.58294614346, "throughput/token_count_per_second_update_recent": 376174.1636389404, "throughput/token_count_per_second_update_cum": 375977.3366513813, "throughput/batch_count_per_second_total_recent": 0.0983479145270612, "throughput/batch_count_per_second_total_cum": 0.09923676631266759, "throughput/batch_count_per_second_update_recent": 0.17937381917902964, "throughput/batch_count_per_second_update_cum": 0.17927996475762428, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1971322880, "throughput/token_count": 1971322880, "throughput/batch_count": 940, "throughput/flop_count": 0, "throughput/total_time": 9427.379897051957, "throughput/update_time": 5243.171735763899, "throughput/token_count_per_second_total_recent": 226618.39494838653, "throughput/token_count_per_second_total_cum": 209106.12508746507, "throughput/token_count_per_second_update_recent": 376168.38491395547, "throughput/token_count_per_second_update_cum": 375979.07895206293, "throughput/batch_count_per_second_total_recent": 0.10806007144374205, "throughput/batch_count_per_second_total_cum": 0.09970957044957403, "throughput/batch_count_per_second_update_recent": 0.1793710636682298, "throughput/batch_count_per_second_update_cum": 0.17928079555133006, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1992294400, "throughput/token_count": 1992294400, "throughput/batch_count": 950, "throughput/flop_count": 0, "throughput/total_time": 9573.927157147904, "throughput/update_time": 5298.920672355802, "throughput/token_count_per_second_total_recent": 206203.64636819714, "throughput/token_count_per_second_total_cum": 208095.83855174322, "throughput/token_count_per_second_update_recent": 376160.4174898452, "throughput/token_count_per_second_update_cum": 375981.17110786313, "throughput/batch_count_per_second_total_recent": 0.09832556074533326, "throughput/batch_count_per_second_total_cum": 0.09922782828890954, "throughput/batch_count_per_second_update_recent": 0.17936726450435886, "throughput/batch_count_per_second_update_cum": 0.17928179316895634, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 2013265920, "throughput/token_count": 2013265920, "throughput/batch_count": 960, "throughput/flop_count": 0, "throughput/total_time": 9629.798734208918, "throughput/update_time": 5354.673169916612, "throughput/token_count_per_second_total_recent": 226555.96491027475, "throughput/token_count_per_second_total_cum": 209066.25107833976, "throughput/token_count_per_second_update_recent": 376155.3104339124, "throughput/token_count_per_second_update_cum": 375982.969662993, "throughput/batch_count_per_second_total_recent": 0.10803030248178232, "throughput/batch_count_per_second_total_cum": 0.09969055704037655, "throughput/batch_count_per_second_update_recent": 0.17936482927032107, "throughput/batch_count_per_second_update_cum": 0.17928265078687333, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 2034237440, "throughput/token_count": 2034237440, "throughput/batch_count": 970, "throughput/flop_count": 0, "throughput/total_time": 9776.485576101928, "throughput/update_time": 5410.419541045674, "throughput/token_count_per_second_total_recent": 206126.02456977172, "throughput/token_count_per_second_total_cum": 208074.50940986193, "throughput/token_count_per_second_update_recent": 376157.2709419477, "throughput/token_count_per_second_update_cum": 375985.15689355246, "throughput/batch_count_per_second_total_recent": 0.09828854778755747, "throughput/batch_count_per_second_total_cum": 0.09921765776150795, "throughput/batch_count_per_second_update_recent": 0.17936576411340127, "throughput/batch_count_per_second_update_cum": 0.17928369373967765, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 2055208960, "throughput/token_count": 2055208960, "throughput/batch_count": 980, "throughput/flop_count": 0, "throughput/total_time": 9832.331430819933, "throughput/update_time": 5466.160342909512, "throughput/token_count_per_second_total_recent": 226569.3367528444, "throughput/token_count_per_second_total_cum": 209025.59829887803, "throughput/token_count_per_second_update_recent": 376164.06115470035, "throughput/token_count_per_second_update_cum": 375987.68259074877, "throughput/batch_count_per_second_total_recent": 0.10803667867319316, "throughput/batch_count_per_second_total_cum": 0.09967117228454496, "throughput/batch_count_per_second_update_recent": 0.17936900193915384, "throughput/batch_count_per_second_update_cum": 0.17928489808595122, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} diff --git a/metrics/jsonlines/train.jsonl b/metrics/jsonlines/train.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2268e69da76cc954efc1289713e87fd3c25ee47b --- /dev/null +++ b/metrics/jsonlines/train.jsonl @@ -0,0 +1,98 @@ +{"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 59.51673407689668, "train/update_time": 59.329752838937566, "train/lr": 0.0009000000000000001, "train/loss": 9.761818885803223, "train/global_grad_norm": 1.2346482276916504} +{"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 115.38901872490533, "train/update_time": 115.08477980294265, "train/lr": 0.0009997960964140947, "train/loss": 8.126626968383789, "train/global_grad_norm": 0.9628385305404663} +{"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 262.1596394549124, "train/update_time": 170.83132880181074, "train/lr": 0.0009990914580222257, "train/loss": 7.51987886428833, "train/global_grad_norm": 0.570928692817688} +{"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 318.01136298198253, "train/update_time": 226.56994250579737, "train/lr": 0.0009978842768382998, "train/loss": 7.193209648132324, "train/global_grad_norm": 0.4207130968570709} +{"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 465.39904637495056, "train/update_time": 282.29461103421636, "train/lr": 0.0009961757683914405, "train/loss": 6.9471588134765625, "train/global_grad_norm": 0.2690547704696655} +{"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 521.2494050179375, "train/update_time": 338.0269747101702, "train/lr": 0.00099396765300483, "train/loss": 6.683192729949951, "train/global_grad_norm": 0.39732715487480164} +{"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 667.8293691409053, "train/update_time": 393.75072771217674, "train/lr": 0.0009912621540634887, "train/loss": 6.482468128204346, "train/global_grad_norm": 0.30048173666000366} +{"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 723.6805678269593, "train/update_time": 449.4893446461065, "train/lr": 0.000988061995775515, "train/loss": 6.281726837158203, "train/global_grad_norm": 0.3598792552947998} +{"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 871.0406558898976, "train/update_time": 505.2065281631658, "train/lr": 0.0009843704004290394, "train/loss": 6.091277122497559, "train/global_grad_norm": 0.38550785183906555} +{"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 926.8863875919487, "train/update_time": 560.9345708230976, "train/lr": 0.0009801910851476522, "train/loss": 5.968027114868164, "train/global_grad_norm": 0.38172265887260437} +{"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 1073.8704686219571, "train/update_time": 616.6548248290783, "train/lr": 0.0009755282581475768, "train/loss": 5.854724407196045, "train/global_grad_norm": 0.6797294020652771} +{"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 1129.7145008929074, "train/update_time": 672.3896088181064, "train/lr": 0.0009703866145003512, "train/loss": 5.713388442993164, "train/global_grad_norm": 0.4913008511066437} +{"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 1276.169682028005, "train/update_time": 728.117079011281, "train/lr": 0.0009647713314052896, "train/loss": 5.644865036010742, "train/global_grad_norm": 0.3544484078884125} +{"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 1332.0184532779967, "train/update_time": 783.8552743501496, "train/lr": 0.0009586880629764817, "train/loss": 5.568542957305908, "train/global_grad_norm": 0.49100813269615173} +{"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 1478.8752941149287, "train/update_time": 839.5983157731825, "train/lr": 0.0009521429345495787, "train/loss": 5.447315216064453, "train/global_grad_norm": 0.36811110377311707} +{"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 1534.7253116948996, "train/update_time": 895.3389847053913, "train/lr": 0.0009451425365140996, "train/loss": 5.412559986114502, "train/global_grad_norm": 0.8786026239395142} +{"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 1681.113392906962, "train/update_time": 951.0781835562084, "train/lr": 0.000937693917677468, "train/loss": 5.300525188446045, "train/global_grad_norm": 0.37146928906440735} +{"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 1736.965916060959, "train/update_time": 1006.818993799272, "train/lr": 0.0009298045781674596, "train/loss": 5.27311372756958, "train/global_grad_norm": 0.6399803161621094} +{"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 1883.3845960129984, "train/update_time": 1062.5558878729353, "train/lr": 0.0009214824618802108, "train/loss": 5.24492073059082, "train/global_grad_norm": 0.5269308090209961} +{"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 1939.2464721049182, "train/update_time": 1118.2928310850402, "train/lr": 0.000912735948481387, "train/loss": 5.150586128234863, "train/global_grad_norm": 0.5451287031173706} +{"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 2086.0689364429563, "train/update_time": 1174.0314252841054, "train/lr": 0.0009035738449685707, "train/loss": 5.106629848480225, "train/global_grad_norm": 0.4432642161846161} +{"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 2141.922117186943, "train/update_time": 1229.7717256471515, "train/lr": 0.0008940053768033609, "train/loss": 5.072513103485107, "train/global_grad_norm": 0.4362945556640625} +{"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 2288.3046315449756, "train/update_time": 1285.5049123089993, "train/lr": 0.0008840401786221159, "train/loss": 5.0194478034973145, "train/global_grad_norm": 0.5716229677200317} +{"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 2344.147989144898, "train/update_time": 1341.2354510270525, "train/lr": 0.0008736882845346905, "train/loss": 4.964856147766113, "train/global_grad_norm": 0.4435354769229889} +{"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 2491.0224517869065, "train/update_time": 1396.97127348301, "train/lr": 0.0008629601180209381, "train/loss": 4.966379165649414, "train/global_grad_norm": 0.7064673900604248} +{"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 2546.883552026935, "train/update_time": 1452.7241989910835, "train/lr": 0.0008518664814351503, "train/loss": 4.9135613441467285, "train/global_grad_norm": 0.5128141045570374} +{"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 2693.2529326969525, "train/update_time": 1508.4631838970818, "train/lr": 0.0008404185451290017, "train/loss": 4.897538661956787, "train/global_grad_norm": 0.376214861869812} +{"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 2749.1005825489992, "train/update_time": 1564.2029820160242, "train/lr": 0.0008286278362039527, "train/loss": 4.849531173706055, "train/global_grad_norm": 0.5354404449462891} +{"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 2895.4641819539247, "train/update_time": 1619.943224380957, "train/lr": 0.0008165062269044352, "train/loss": 4.818095684051514, "train/global_grad_norm": 0.47904765605926514} +{"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 2951.315687590977, "train/update_time": 1675.687991371029, "train/lr": 0.0008040659226635089, "train/loss": 4.798238277435303, "train/global_grad_norm": 0.6568677425384521} +{"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 3098.1222598329186, "train/update_time": 1731.4270116091939, "train/lr": 0.0007913194498130252, "train/loss": 4.810272216796875, "train/global_grad_norm": 0.5013900399208069} +{"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 3153.9731954459567, "train/update_time": 1787.1771466861246, "train/lr": 0.000778279642970672, "train/loss": 4.744162559509277, "train/global_grad_norm": 0.5148316621780396} +{"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 3300.597545653931, "train/update_time": 1842.9211868423736, "train/lr": 0.0007649596321166025, "train/loss": 4.759054660797119, "train/global_grad_norm": 0.4511776864528656} +{"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 3356.450910591986, "train/update_time": 1898.6706604874926, "train/lr": 0.0007513728293726579, "train/loss": 4.724034309387207, "train/global_grad_norm": 0.5304577946662903} +{"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 3502.8775023539783, "train/update_time": 1954.4126074366504, "train/lr": 0.0007375329154974975, "train/loss": 4.703649044036865, "train/global_grad_norm": 0.4927815794944763} +{"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 3558.7265775619308, "train/update_time": 2010.1547237539198, "train/lr": 0.0007234538261112341, "train/loss": 4.634231090545654, "train/global_grad_norm": 0.5631891489028931} +{"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 3705.2644805479795, "train/update_time": 2065.915084464941, "train/lr": 0.0007091497376634464, "train/loss": 4.656914234161377, "train/global_grad_norm": 0.5161179304122925} +{"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 3761.109741097898, "train/update_time": 2121.6608164007775, "train/lr": 0.0006946350531586958, "train/loss": 4.634555339813232, "train/global_grad_norm": 0.5455919504165649} +{"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 3907.6489149519475, "train/update_time": 2177.411766545032, "train/lr": 0.0006799243876539214, "train/loss": 4.638850688934326, "train/global_grad_norm": 0.5190625190734863} +{"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 3963.4901722619543, "train/update_time": 2233.1504810712067, "train/lr": 0.0006650325535423166, "train/loss": 4.546455383300781, "train/global_grad_norm": 0.47769442200660706} +{"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 4110.402928694966, "train/update_time": 2288.885604837211, "train/lr": 0.0006499745456385053, "train/loss": 4.568233489990234, "train/global_grad_norm": 0.5308087468147278} +{"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 4166.242571576964, "train/update_time": 2344.6231456701644, "train/lr": 0.0006347655260800339, "train/loss": 4.565241813659668, "train/global_grad_norm": 0.5333735942840576} +{"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 4312.675729307928, "train/update_time": 2400.370607822202, "train/lr": 0.0006194208090603844, "train/loss": 4.559530258178711, "train/global_grad_norm": 0.4546603262424469} +{"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 4368.526084997924, "train/update_time": 2456.1160258102464, "train/lr": 0.0006039558454088796, "train/loss": 4.5818610191345215, "train/global_grad_norm": 0.5177769660949707} +{"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 4515.121015235898, "train/update_time": 2511.850276188343, "train/lr": 0.0005883862070330078, "train/loss": 4.528136253356934, "train/global_grad_norm": 0.4562700092792511} +{"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 4570.982028090977, "train/update_time": 2567.596293253242, "train/lr": 0.0005727275712388317, "train/loss": 4.493772506713867, "train/global_grad_norm": 0.4743252992630005} +{"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 4717.434516418958, "train/update_time": 2623.332695179037, "train/lr": 0.0005569957049452703, "train/loss": 4.519767761230469, "train/global_grad_norm": 0.6404874324798584} +{"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 4773.292612125981, "train/update_time": 2679.07335356588, "train/lr": 0.0005412064488081482, "train/loss": 4.494389533996582, "train/global_grad_norm": 0.44306617975234985} +{"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 4919.765502344933, "train/update_time": 2734.8176154628163, "train/lr": 0.0005253757012699972, "train/loss": 4.489535331726074, "train/global_grad_norm": 0.45535698533058167} +{"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 4975.63103498891, "train/update_time": 2790.5577287059277, "train/lr": 0.0005095194025516734, "train/loss": 4.462835311889648, "train/global_grad_norm": 0.46494054794311523} +{"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 5122.565231077955, "train/update_time": 2846.292529964936, "train/lr": 0.0004936535186019053, "train/loss": 4.4626970291137695, "train/global_grad_norm": 0.42456820607185364} +{"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 5178.418338532909, "train/update_time": 2902.025444978848, "train/lr": 0.00047779402502093696, "train/loss": 4.453117370605469, "train/global_grad_norm": 0.47793683409690857} +{"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 5324.87164538994, "train/update_time": 2957.746671211091, "train/lr": 0.0004619568909744525, "train/loss": 4.413782119750977, "train/global_grad_norm": 0.5748726725578308} +{"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 5380.720167659922, "train/update_time": 3013.4729040842503, "train/lr": 0.00044615806311398067, "train/loss": 4.423956871032715, "train/global_grad_norm": 0.5001558661460876} +{"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 5527.108590850956, "train/update_time": 3069.2099333773367, "train/lr": 0.0004304134495199673, "train/loss": 4.370279788970947, "train/global_grad_norm": 0.5166635513305664} +{"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 5582.958667394938, "train/update_time": 3124.937779762433, "train/lr": 0.0004147389036836882, "train/loss": 4.411366939544678, "train/global_grad_norm": 0.4276511073112488} +{"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 5729.297242018976, "train/update_time": 3180.655704040313, "train/lr": 0.0003991502085441259, "train/loss": 4.3608832359313965, "train/global_grad_norm": 0.44203898310661316} +{"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 5785.135669226991, "train/update_time": 3236.387389887357, "train/lr": 0.0003836630605958888, "train/loss": 4.409422874450684, "train/global_grad_norm": 0.44176411628723145} +{"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 5931.5355288069695, "train/update_time": 3292.107882911223, "train/lr": 0.00036829305408417155, "train/loss": 4.389522552490234, "train/global_grad_norm": 0.354754239320755} +{"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 5987.366702231928, "train/update_time": 3347.8448207870824, "train/lr": 0.000353055665302672, "train/loss": 4.390128135681152, "train/global_grad_norm": 0.6542595028877258} +{"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 6134.471666511963, "train/update_time": 3403.5747455290984, "train/lr": 0.0003379662370102746, "train/loss": 4.354618549346924, "train/global_grad_norm": 0.4821139872074127} +{"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 6190.319736288977, "train/update_time": 3459.312256404897, "train/lr": 0.00032303996298219405, "train/loss": 4.3281426429748535, "train/global_grad_norm": 0.4146212339401245} +{"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 6336.646818120964, "train/update_time": 3515.0587793228915, "train/lr": 0.00030829187271113034, "train/loss": 4.3390960693359375, "train/global_grad_norm": 0.4168533980846405} +{"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 6392.506873650011, "train/update_time": 3570.8129316339036, "train/lr": 0.0002937368162738445, "train/loss": 4.329615592956543, "train/global_grad_norm": 0.4574236571788788} +{"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 6538.855644405005, "train/update_time": 3626.554348448757, "train/lr": 0.0002793894493783894, "train/loss": 4.302555561065674, "train/global_grad_norm": 0.41884133219718933} +{"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 6594.719191052951, "train/update_time": 3682.304503756459, "train/lr": 0.00026526421860705474, "train/loss": 4.3252763748168945, "train/global_grad_norm": 0.5183114409446716} +{"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 6741.161298968946, "train/update_time": 3738.0453335597413, "train/lr": 0.0002513753468698824, "train/loss": 4.268364429473877, "train/global_grad_norm": 0.4133932888507843} +{"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 6797.037191045936, "train/update_time": 3793.7997314956738, "train/lr": 0.00023773681908340283, "train/loss": 4.28262996673584, "train/global_grad_norm": 0.4412442445755005} +{"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 6943.39617751597, "train/update_time": 3849.5505883715814, "train/lr": 0.00022436236808900823, "train/loss": 4.283445358276367, "train/global_grad_norm": 0.3459267318248749} +{"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 6999.246100232936, "train/update_time": 3905.29758988251, "train/lr": 0.00021126546082514682, "train/loss": 4.279012203216553, "train/global_grad_norm": 0.3911832273006439} +{"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 7145.973933565896, "train/update_time": 3961.0490365702426, "train/lr": 0.00019845928476725522, "train/loss": 4.275318622589111, "train/global_grad_norm": 0.3508543074131012} +{"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 7201.8562647239305, "train/update_time": 4016.812845747103, "train/lr": 0.0001859567346490913, "train/loss": 4.250895023345947, "train/global_grad_norm": 0.38312408328056335} +{"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 7348.130815029959, "train/update_time": 4072.5528116449714, "train/lr": 0.00017377039947882782, "train/loss": 4.26820707321167, "train/global_grad_norm": 0.4197799861431122} +{"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 7404.001054160995, "train/update_time": 4128.298564241966, "train/lr": 0.00016191254986299043, "train/loss": 4.2526726722717285, "train/global_grad_norm": 0.33865150809288025} +{"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 7550.323625019984, "train/update_time": 4184.042193662957, "train/lr": 0.00015039512565099468, "train/loss": 4.2354736328125, "train/global_grad_norm": 0.3443734049797058} +{"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 7606.181935134926, "train/update_time": 4239.792019490269, "train/lr": 0.00013922972391273224, "train/loss": 4.196894645690918, "train/global_grad_norm": 0.37375685572624207} +{"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 7752.467774236924, "train/update_time": 4295.5320715362905, "train/lr": 0.00012842758726130281, "train/loss": 4.261632919311523, "train/global_grad_norm": 0.3186478614807129} +{"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 7808.319425186957, "train/update_time": 4351.274368522107, "train/lr": 0.00011799959253265679, "train/loss": 4.1832122802734375, "train/global_grad_norm": 0.3403545916080475} +{"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 7954.606591358897, "train/update_time": 4407.000014613499, "train/lr": 0.00010795623983354214, "train/loss": 4.212279319763184, "train/global_grad_norm": 0.3116688132286072} +{"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 8010.43765124795, "train/update_time": 4462.722712960327, "train/lr": 9.830764196878872e-05, "train/loss": 4.190274715423584, "train/global_grad_norm": 0.3231128454208374} +{"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 8157.16301853396, "train/update_time": 4518.44748669432, "train/lr": 8.906351425856951e-05, "train/loss": 4.165951251983643, "train/global_grad_norm": 0.29648908972740173} +{"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 8212.996373801958, "train/update_time": 4574.1805559834465, "train/lr": 8.02331647558977e-05, "train/loss": 4.1781415939331055, "train/global_grad_norm": 0.2801770269870758} +{"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 8359.302277855924, "train/update_time": 4629.922305631335, "train/lr": 7.182548487420554e-05, "train/loss": 4.2106523513793945, "train/global_grad_norm": 0.2964642345905304} +{"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 8415.15657795791, "train/update_time": 4685.668182700407, "train/lr": 6.384894043444556e-05, "train/loss": 4.159307479858398, "train/global_grad_norm": 0.2973722517490387} +{"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 8561.475588510977, "train/update_time": 4741.406085017603, "train/lr": 5.6311563140726166e-05, "train/loss": 4.228494167327881, "train/global_grad_norm": 0.2686772048473358} +{"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 8617.329412178951, "train/update_time": 4797.150553135551, "train/lr": 4.922094249306547e-05, "train/loss": 4.207626819610596, "train/global_grad_norm": 0.25773489475250244} +{"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 8763.651462373906, "train/update_time": 4852.897546026274, "train/lr": 4.2584218145409916e-05, "train/loss": 4.153430461883545, "train/global_grad_norm": 0.2575656473636627} +{"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 8819.515494428924, "train/update_time": 4908.649304769933, "train/lr": 3.6408072716606236e-05, "train/loss": 4.171026229858398, "train/global_grad_norm": 0.27688679099082947} +{"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 8966.25939859997, "train/update_time": 4964.410062348121, "train/lr": 3.069872506157217e-05, "train/loss": 4.226474285125732, "train/global_grad_norm": 0.26012977957725525} +{"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 9022.126939511974, "train/update_time": 5020.160163923283, "train/lr": 2.5461924009435368e-05, "train/loss": 4.141654014587402, "train/global_grad_norm": 0.25259116291999817} +{"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 9169.132314599934, "train/update_time": 5075.909138886142, "train/lr": 2.0702942574950812e-05, "train/loss": 4.176390647888184, "train/global_grad_norm": 0.2537875175476074} +{"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 9224.990330599947, "train/update_time": 5131.661126127117, "train/lr": 1.642657264902142e-05, "train/loss": 4.204928398132324, "train/global_grad_norm": 0.23280541598796844} +{"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 9371.526648397907, "train/update_time": 5187.4173517230665, "train/lr": 1.2637120173670358e-05, "train/loss": 4.189211845397949, "train/global_grad_norm": 0.22280284762382507} +{"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 9427.379897051957, "train/update_time": 5243.171735763899, "train/lr": 9.338400806321978e-06, "train/loss": 4.146426200866699, "train/global_grad_norm": 0.22602267563343048} +{"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 9573.927157147904, "train/update_time": 5298.920672355802, "train/lr": 6.533736077758867e-06, "train/loss": 4.1687397956848145, "train/global_grad_norm": 0.22272436320781708} +{"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 9629.798734208918, "train/update_time": 5354.673169916612, "train/lr": 4.2259500476214406e-06, "train/loss": 4.167335510253906, "train/global_grad_norm": 0.21395571529865265} +{"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 9776.485576101928, "train/update_time": 5410.419541045674, "train/lr": 2.417366460819359e-06, "train/loss": 4.191149711608887, "train/global_grad_norm": 0.21384510397911072} +{"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 9832.331430819933, "train/update_time": 5466.160342909512, "train/lr": 1.1098064077174619e-06, "train/loss": 4.1665802001953125, "train/global_grad_norm": 0.20814913511276245} diff --git a/metrics/jsonlines/train_data_info.jsonl b/metrics/jsonlines/train_data_info.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8f2d82aa96124ce343d2b13b9c030bd8eadda7c6 --- /dev/null +++ b/metrics/jsonlines/train_data_info.jsonl @@ -0,0 +1 @@ +{"step": 0, "train_data_info/vocab_size": 50277, "train_data_info/global_tokens_per_batch": 2097152, "train_data_info/local_tokens_per_batch": 2097152, "train_data_info/batch_len": 2048, "train_data_info/seq_len": 2048, "train_data_info/total_tokens": 2055208960, "train_data_info/global_batch_size": 1024, "train_data_info/local_batch_size": 1024} diff --git a/metrics/jsonlines/train_eval.jsonl b/metrics/jsonlines/train_eval.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0d077c452d4380312d176858b71bd2b10df84782 --- /dev/null +++ b/metrics/jsonlines/train_eval.jsonl @@ -0,0 +1,19 @@ +{"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 465.39904637495056, "train_eval/train_update_time": 282.29461103421636, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.262738060667653, "train_eval/perplexity_len_2048": 3876.6942059185167, "train_eval/loss_avg_len_1024": 8.26358599447005, "train_eval/perplexity_len_1024": 3879.9827800268095, "train_eval/loss_avg_len_512": 8.264393082915195, "train_eval/perplexity_len_512": 3883.115533330371} +{"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 926.8863875919487, "train_eval/train_update_time": 560.9345708230976, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.3983804440147285, "train_eval/perplexity_len_2048": 600.8711050210618, "train_eval/loss_avg_len_1024": 6.4026496673026125, "train_eval/perplexity_len_1024": 603.4418415551971, "train_eval/loss_avg_len_512": 6.409003813466989, "train_eval/perplexity_len_512": 607.2884070958403} +{"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1478.8752941149287, "train_eval/train_update_time": 839.5983157731825, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.694461489945333, "train_eval/perplexity_len_2048": 297.21669630360435, "train_eval/loss_avg_len_1024": 5.700237832467756, "train_eval/perplexity_len_1024": 298.9384897918442, "train_eval/loss_avg_len_512": 5.711870585765283, "train_eval/perplexity_len_512": 302.4362724727184} +{"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1939.2464721049182, "train_eval/train_update_time": 1118.2928310850402, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.298926106473736, "train_eval/perplexity_len_2048": 200.12178504909002, "train_eval/loss_avg_len_1024": 5.307277373590877, "train_eval/perplexity_len_1024": 201.80005361248934, "train_eval/loss_avg_len_512": 5.322380470788557, "train_eval/perplexity_len_512": 204.8709913985368} +{"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2491.0224517869065, "train_eval/train_update_time": 1396.97127348301, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.0484253960391285, "train_eval/perplexity_len_2048": 155.77698421365454, "train_eval/loss_avg_len_1024": 5.056525684503576, "train_eval/perplexity_len_1024": 157.04394717703963, "train_eval/loss_avg_len_512": 5.07371008965827, "train_eval/perplexity_len_512": 159.76597522371915} +{"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2951.315687590977, "train_eval/train_update_time": 1675.687991371029, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.881842594247683, "train_eval/perplexity_len_2048": 131.87342938039131, "train_eval/loss_avg_len_1024": 4.889910663574483, "train_eval/perplexity_len_1024": 132.94169698475255, "train_eval/loss_avg_len_512": 4.908835977258713, "train_eval/perplexity_len_512": 135.4816189020749} +{"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3502.8775023539783, "train_eval/train_update_time": 1954.4126074366504, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.751916624681217, "train_eval/perplexity_len_2048": 115.80602865192392, "train_eval/loss_avg_len_1024": 4.763219690434598, "train_eval/perplexity_len_1024": 117.12241740770119, "train_eval/loss_avg_len_512": 4.7850995411058825, "train_eval/perplexity_len_512": 119.71327888296197} +{"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3963.4901722619543, "train_eval/train_update_time": 2233.1504810712067, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.652960418538351, "train_eval/perplexity_len_2048": 104.89505965531511, "train_eval/loss_avg_len_1024": 4.663964104556035, "train_eval/perplexity_len_1024": 106.05566571832371, "train_eval/loss_avg_len_512": 4.687754835745363, "train_eval/perplexity_len_512": 108.60906068541759} +{"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4515.121015235898, "train_eval/train_update_time": 2511.850276188343, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.568486952575722, "train_eval/perplexity_len_2048": 96.39814441032826, "train_eval/loss_avg_len_1024": 4.583207558359009, "train_eval/perplexity_len_1024": 97.82767948912803, "train_eval/loss_avg_len_512": 4.6114602793406085, "train_eval/perplexity_len_512": 100.6309917533228} +{"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4975.63103498891, "train_eval/train_update_time": 2790.5577287059277, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.499369324265025, "train_eval/perplexity_len_2048": 89.96037757855736, "train_eval/loss_avg_len_1024": 4.513876748953916, "train_eval/perplexity_len_1024": 91.27498369940146, "train_eval/loss_avg_len_512": 4.5444240906891356, "train_eval/perplexity_len_512": 94.10621495483142} +{"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5527.108590850956, "train_eval/train_update_time": 3069.2099333773367, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.433416955778939, "train_eval/perplexity_len_2048": 84.21869738313693, "train_eval/loss_avg_len_1024": 4.447785850001091, "train_eval/perplexity_len_1024": 85.43756284075022, "train_eval/loss_avg_len_512": 4.478819852198358, "train_eval/perplexity_len_512": 88.1306041408694} +{"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5987.366702231928, "train_eval/train_update_time": 3347.8448207870824, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.377168502442927, "train_eval/perplexity_len_2048": 79.61229195254344, "train_eval/loss_avg_len_1024": 4.392655870016097, "train_eval/perplexity_len_1024": 80.85487410774638, "train_eval/loss_avg_len_512": 4.42795208584983, "train_eval/perplexity_len_512": 83.75970845603244} +{"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6538.855644405005, "train_eval/train_update_time": 3626.554348448757, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.331787606836443, "train_eval/perplexity_len_2048": 76.08016650881756, "train_eval/loss_avg_len_1024": 4.353677473009084, "train_eval/perplexity_len_1024": 77.76391242617439, "train_eval/loss_avg_len_512": 4.3933230580473905, "train_eval/perplexity_len_512": 80.90883751188974} +{"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6999.246100232936, "train_eval/train_update_time": 3905.29758988251, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.285216300610045, "train_eval/perplexity_len_2048": 72.61825239341289, "train_eval/loss_avg_len_1024": 4.308363595821029, "train_eval/perplexity_len_1024": 74.31877382787907, "train_eval/loss_avg_len_512": 4.353135664994989, "train_eval/perplexity_len_512": 77.72179072718198} +{"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7550.323625019984, "train_eval/train_update_time": 4184.042193662957, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.247887878154143, "train_eval/perplexity_len_2048": 69.95749743590287, "train_eval/loss_avg_len_1024": 4.2724517078520154, "train_eval/perplexity_len_1024": 71.69720090430816, "train_eval/loss_avg_len_512": 4.320460617008357, "train_eval/perplexity_len_512": 75.22326943058576} +{"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8010.43765124795, "train_eval/train_update_time": 4462.722712960327, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.220693094336503, "train_eval/perplexity_len_2048": 68.08065425680356, "train_eval/loss_avg_len_1024": 4.248167304437285, "train_eval/perplexity_len_1024": 69.97704813075109, "train_eval/loss_avg_len_512": 4.298565637756401, "train_eval/perplexity_len_512": 73.59415727671168} +{"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8561.475588510977, "train_eval/train_update_time": 4741.406085017603, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.193676541369477, "train_eval/perplexity_len_2048": 66.2659732436429, "train_eval/loss_avg_len_1024": 4.216801415602568, "train_eval/perplexity_len_1024": 67.81622110062538, "train_eval/loss_avg_len_512": 4.269361495983503, "train_eval/perplexity_len_512": 71.47598334374155} +{"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9022.126939511974, "train_eval/train_update_time": 5020.160163923283, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.187023427749736, "train_eval/perplexity_len_2048": 65.82656154335335, "train_eval/loss_avg_len_1024": 4.213506009909706, "train_eval/perplexity_len_1024": 67.5931069672473, "train_eval/loss_avg_len_512": 4.267473418653644, "train_eval/perplexity_len_512": 71.34115847989767} +{"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9573.927157147904, "train_eval/train_update_time": 5298.920672355802, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.17461025829327, "train_eval/perplexity_len_2048": 65.01449586199058, "train_eval/loss_avg_len_1024": 4.2044485659670325, "train_eval/perplexity_len_1024": 66.9836504157636, "train_eval/loss_avg_len_512": 4.257914591990775, "train_eval/perplexity_len_512": 70.6624696143801} diff --git a/metrics/jsonlines/val.jsonl b/metrics/jsonlines/val.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..16e1efbb9dfbecc707fe0bd622de6014918e4130 --- /dev/null +++ b/metrics/jsonlines/val.jsonl @@ -0,0 +1,49 @@ +{"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 115.38901872490533, "val/train_update_time": 115.08477980294265, "val/loss": 8.017322596772551, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.8957867800491, "val/val_tokens_per_second": 450625.9470432396, "val/loss_avg_len_2048": 8.017322596772551, "val/perplexity_len_2048": 3033.045765536961, "val/loss_avg_len_1024": 8.01611629340169, "val/perplexity_len_1024": 3029.389198114233, "val/loss_avg_len_512": 8.016581488862169, "val/perplexity_len_512": 3030.798784058288} +{"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 318.01136298198253, "val/train_update_time": 226.56994250579737, "val/loss": 7.168800498851901, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 91.53357388603035, "val/val_tokens_per_second": 447486.0781793556, "val/loss_avg_len_2048": 7.168800498851901, "val/perplexity_len_2048": 1298.286372446997, "val/loss_avg_len_1024": 7.16922726986399, "val/perplexity_len_1024": 1298.8405616836865, "val/loss_avg_len_512": 7.17253666183427, "val/perplexity_len_512": 1303.1460545614693} +{"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 521.2494050179375, "val/train_update_time": 338.0269747101702, "val/loss": 6.681955375412503, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.73258669499774, "val/val_tokens_per_second": 451436.48486170854, "val/loss_avg_len_2048": 6.681955375412503, "val/perplexity_len_2048": 797.8777381810718, "val/loss_avg_len_1024": 6.683395165921888, "val/perplexity_len_1024": 799.0273423721623, "val/loss_avg_len_512": 6.689393609371596, "val/perplexity_len_512": 803.834666516683} +{"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 723.6805678269593, "val/train_update_time": 449.4893446461065, "val/loss": 6.252993275996973, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 91.52609027700964, "val/val_tokens_per_second": 447522.66677219473, "val/loss_avg_len_2048": 6.252993275996973, "val/perplexity_len_2048": 519.5657029597602, "val/loss_avg_len_1024": 6.255989201308694, "val/perplexity_len_1024": 521.124617029408, "val/loss_avg_len_512": 6.264971070884261, "val/perplexity_len_512": 525.82637404726} +{"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 926.8863875919487, "val/train_update_time": 560.9345708230976, "val/loss": 5.955217036969051, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.68943495198619, "val/val_tokens_per_second": 451651.2868526031, "val/loss_avg_len_2048": 5.955217036969051, "val/perplexity_len_2048": 385.76062591006854, "val/loss_avg_len_1024": 5.959280439781304, "val/perplexity_len_1024": 387.3313157338021, "val/loss_avg_len_512": 5.970304897817318, "val/perplexity_len_512": 391.6250581739251} +{"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 1129.7145008929074, "val/train_update_time": 672.3896088181064, "val/loss": 5.727587048024056, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.61158224998508, "val/val_tokens_per_second": 452039.34180287144, "val/loss_avg_len_2048": 5.727587048024056, "val/perplexity_len_2048": 307.227049152543, "val/loss_avg_len_1024": 5.732679790063948, "val/perplexity_len_1024": 308.79566815721904, "val/loss_avg_len_512": 5.74529507358959, "val/perplexity_len_512": 312.7158884172708} +{"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 1332.0184532779967, "val/train_update_time": 783.8552743501496, "val/loss": 5.5462320905009985, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.99034959904384, "val/val_tokens_per_second": 450157.6285891138, "val/loss_avg_len_2048": 5.5462320905009985, "val/perplexity_len_2048": 256.2701318031649, "val/loss_avg_len_1024": 5.552394144184422, "val/perplexity_len_1024": 257.8541575263273, "val/loss_avg_len_512": 5.566559319449496, "val/perplexity_len_512": 261.5326989437206} +{"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 1534.7253116948996, "val/train_update_time": 895.3389847053913, "val/loss": 5.397562808819162, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.53677860402968, "val/val_tokens_per_second": 452412.82748905895, "val/loss_avg_len_2048": 5.397562808819162, "val/perplexity_len_2048": 220.86746347143392, "val/loss_avg_len_1024": 5.404754845878948, "val/perplexity_len_1024": 222.4616764014897, "val/loss_avg_len_512": 5.420072977858782, "val/perplexity_len_512": 225.8956072782415} +{"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 1736.965916060959, "val/train_update_time": 1006.818993799272, "val/loss": 5.266776608066797, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.56331127206795, "val/val_tokens_per_second": 452280.28243080724, "val/loss_avg_len_2048": 5.266776608066797, "val/perplexity_len_2048": 193.79029253177575, "val/loss_avg_len_1024": 5.274747739335196, "val/perplexity_len_1024": 195.34119339740653, "val/loss_avg_len_512": 5.2913418367584235, "val/perplexity_len_512": 198.6097485495165} +{"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 1939.2464721049182, "val/train_update_time": 1118.2928310850402, "val/loss": 5.159342473128369, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.51000242400914, "val/val_tokens_per_second": 452546.6678049137, "val/loss_avg_len_2048": 5.159342473128369, "val/perplexity_len_2048": 174.04997543648554, "val/loss_avg_len_1024": 5.168076667023404, "val/perplexity_len_1024": 175.57681984049975, "val/loss_avg_len_512": 5.186009379204084, "val/perplexity_len_512": 178.75378911651651} +{"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 2141.922117186943, "val/train_update_time": 1229.7717256471515, "val/loss": 5.0615160507458965, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.53271648404188, "val/val_tokens_per_second": 452433.12683785404, "val/loss_avg_len_2048": 5.0615160507458965, "val/perplexity_len_2048": 157.82961273853934, "val/loss_avg_len_1024": 5.071083297332656, "val/perplexity_len_1024": 159.34685389944232, "val/loss_avg_len_512": 5.090461814133264, "val/perplexity_len_512": 162.46487330617947} +{"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 2344.147989144898, "val/train_update_time": 1341.2354510270525, "val/loss": 4.984231026970013, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 91.01652396703139, "val/val_tokens_per_second": 450028.1730692858, "val/loss_avg_len_2048": 4.984231026970013, "val/perplexity_len_2048": 146.09119163551048, "val/loss_avg_len_1024": 4.994521487932326, "val/perplexity_len_1024": 147.60229900095914, "val/loss_avg_len_512": 5.01494721182771, "val/perplexity_len_512": 150.64818411371755} +{"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 2546.883552026935, "val/train_update_time": 1452.7241989910835, "val/loss": 4.916793563892343, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.51194408303127, "val/val_tokens_per_second": 452536.9597898072, "val/loss_avg_len_2048": 4.916793563892343, "val/perplexity_len_2048": 136.56402658642736, "val/loss_avg_len_1024": 4.927482071626792, "val/perplexity_len_1024": 138.03152093214322, "val/loss_avg_len_512": 4.948590329375863, "val/perplexity_len_512": 140.97609392550117} +{"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 2749.1005825489992, "val/train_update_time": 1564.2029820160242, "val/loss": 4.860561983227101, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.51377508195583, "val/val_tokens_per_second": 452527.805440804, "val/loss_avg_len_2048": 4.860561983227101, "val/perplexity_len_2048": 129.09673192321375, "val/loss_avg_len_1024": 4.871906842135312, "val/perplexity_len_1024": 130.5696553890599, "val/loss_avg_len_512": 4.894014172679093, "val/perplexity_len_512": 133.4883452697447} +{"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 2951.315687590977, "val/train_update_time": 1675.687991371029, "val/loss": 4.811751918288158, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.51711130701005, "val/val_tokens_per_second": 452511.12644408783, "val/loss_avg_len_2048": 4.811751918288158, "val/perplexity_len_2048": 122.94682173759111, "val/loss_avg_len_1024": 4.823616650856426, "val/perplexity_len_1024": 124.4142409516568, "val/loss_avg_len_512": 4.846837456004229, "val/perplexity_len_512": 127.33704337975502} +{"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 3153.9731954459567, "val/train_update_time": 1787.1771466861246, "val/loss": 4.759777876929636, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.7631790220039, "val/val_tokens_per_second": 451284.32522256614, "val/loss_avg_len_2048": 4.759777876929636, "val/perplexity_len_2048": 116.71999681530721, "val/loss_avg_len_1024": 4.771922723843483, "val/perplexity_len_1024": 118.14618620443298, "val/loss_avg_len_512": 4.795630532579589, "val/perplexity_len_512": 120.98063997389404} +{"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 3356.450910591986, "val/train_update_time": 1898.6706604874926, "val/loss": 4.716705958361551, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.57427461701445, "val/val_tokens_per_second": 452225.53725321946, "val/loss_avg_len_2048": 4.716705958361551, "val/perplexity_len_2048": 111.79937365796825, "val/loss_avg_len_1024": 4.729562304343749, "val/perplexity_len_1024": 113.24598422400375, "val/loss_avg_len_512": 4.754220814034343, "val/perplexity_len_512": 116.07317533024096} +{"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 3558.7265775619308, "val/train_update_time": 2010.1547237539198, "val/loss": 4.67779298429757, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.67017155699432, "val/val_tokens_per_second": 451747.24274402607, "val/loss_avg_len_2048": 4.67779298429757, "val/perplexity_len_2048": 107.53248460560826, "val/loss_avg_len_1024": 4.691213681914146, "val/perplexity_len_1024": 108.98537314715132, "val/loss_avg_len_512": 4.716703379452508, "val/perplexity_len_512": 111.79908533792432} +{"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 3761.109741097898, "val/train_update_time": 2121.6608164007775, "val/loss": 4.640750338151609, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.67889700899832, "val/val_tokens_per_second": 451703.7739876283, "val/loss_avg_len_2048": 4.640750338151609, "val/perplexity_len_2048": 103.62207001306183, "val/loss_avg_len_1024": 4.654702061567177, "val/perplexity_len_1024": 105.07790858728595, "val/loss_avg_len_512": 4.681117743289098, "val/perplexity_len_512": 107.89059919314577} +{"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 3963.4901722619543, "val/train_update_time": 2233.1504810712067, "val/loss": 4.607613671057416, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.60668137599714, "val/val_tokens_per_second": 452063.79240428534, "val/loss_avg_len_2048": 4.607613671057416, "val/perplexity_len_2048": 100.24464728119722, "val/loss_avg_len_1024": 4.6223377861871855, "val/perplexity_len_1024": 101.73158103702862, "val/loss_avg_len_512": 4.650039090201073, "val/perplexity_len_512": 104.58907390513586} +{"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 4166.242571576964, "val/train_update_time": 2344.6231456701644, "val/loss": 4.57813637526834, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.58338582294527, "val/val_tokens_per_second": 452180.0507662698, "val/loss_avg_len_2048": 4.57813637526834, "val/perplexity_len_2048": 97.33283320365324, "val/loss_avg_len_1024": 4.593541098429355, "val/perplexity_len_1024": 98.84382689289926, "val/loss_avg_len_512": 4.622042558031157, "val/perplexity_len_512": 101.70155144295813} +{"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 4368.526084997924, "val/train_update_time": 2456.1160258102464, "val/loss": 4.545171438023285, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.74309363192879, "val/val_tokens_per_second": 451384.2140553587, "val/loss_avg_len_2048": 4.545171438023285, "val/perplexity_len_2048": 94.17657127073271, "val/loss_avg_len_1024": 4.560983082159796, "val/perplexity_len_1024": 95.67749244735029, "val/loss_avg_len_512": 4.5904487324811525, "val/perplexity_len_512": 98.53863772991662} +{"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 4570.982028090977, "val/train_update_time": 2567.596293253242, "val/loss": 4.517566748579941, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.59338419197593, "val/val_tokens_per_second": 452130.14576430764, "val/loss_avg_len_2048": 4.517566748579941, "val/perplexity_len_2048": 91.61241052467754, "val/loss_avg_len_1024": 4.5339546799753325, "val/perplexity_len_1024": 93.12611781174976, "val/loss_avg_len_512": 4.564446135131549, "val/perplexity_len_512": 96.00940305219359} +{"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 4773.292612125981, "val/train_update_time": 2679.07335356588, "val/loss": 4.490669789551548, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.61795308801811, "val/val_tokens_per_second": 452007.5614620775, "val/loss_avg_len_2048": 4.490669789551548, "val/perplexity_len_2048": 89.18115848713532, "val/loss_avg_len_1024": 4.507707643355197, "val/perplexity_len_1024": 90.71363198575763, "val/loss_avg_len_512": 4.539340242150892, "val/perplexity_len_512": 93.62900726510776} +{"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 4975.63103498891, "val/train_update_time": 2790.5577287059277, "val/loss": 4.4664931430107915, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.63011032703798, "val/val_tokens_per_second": 451946.92858914315, "val/loss_avg_len_2048": 4.4664931430107915, "val/perplexity_len_2048": 87.05091200992092, "val/loss_avg_len_1024": 4.484059858988878, "val/perplexity_len_1024": 88.59362115171373, "val/loss_avg_len_512": 4.5166467366272585, "val/perplexity_len_512": 91.52816477147304} +{"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 5178.418338532909, "val/train_update_time": 2902.025444978848, "val/loss": 4.441390738584008, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.61769351002295, "val/val_tokens_per_second": 452008.8562557547, "val/loss_avg_len_2048": 4.441390738584008, "val/perplexity_len_2048": 84.8929234780783, "val/loss_avg_len_1024": 4.459845923438808, "val/perplexity_len_1024": 86.47418442487788, "val/loss_avg_len_512": 4.493771929252893, "val/perplexity_len_512": 89.45824045076729} +{"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 5380.720167659922, "val/train_update_time": 3013.4729040842503, "val/loss": 4.420545830946578, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.53602080291603, "val/val_tokens_per_second": 452416.6142574795, "val/loss_avg_len_2048": 4.420545830946578, "val/perplexity_len_2048": 83.14165426318776, "val/loss_avg_len_1024": 4.439822294117836, "val/perplexity_len_1024": 84.75987800652325, "val/loss_avg_len_512": 4.474853580821212, "val/perplexity_len_512": 87.78174653751046} +{"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 5582.958667394938, "val/train_update_time": 3124.937779762433, "val/loss": 4.39603335383758, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.5017854050966, "val/val_tokens_per_second": 452587.75632611266, "val/loss_avg_len_2048": 4.39603335383758, "val/perplexity_len_2048": 81.12842182839486, "val/loss_avg_len_1024": 4.415969875032036, "val/perplexity_len_1024": 82.762070859431, "val/loss_avg_len_512": 4.452327949493192, "val/perplexity_len_512": 85.82651140438335} +{"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 5785.135669226991, "val/train_update_time": 3236.387389887357, "val/loss": 4.3746475970230305, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.56642184499651, "val/val_tokens_per_second": 452264.74851907714, "val/loss_avg_len_2048": 4.3746475970230305, "val/perplexity_len_2048": 79.41184964846552, "val/loss_avg_len_1024": 4.395640490190779, "val/perplexity_len_1024": 81.09655568067153, "val/loss_avg_len_512": 4.4339910840201195, "val/perplexity_len_512": 84.26706359861659} +{"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 5987.366702231928, "val/train_update_time": 3347.8448207870824, "val/loss": 4.355698767514504, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.80764411704149, "val/val_tokens_per_second": 451063.34822657524, "val/loss_avg_len_2048": 4.355698767514504, "val/perplexity_len_2048": 77.9212551595122, "val/loss_avg_len_1024": 4.377580918695685, "val/perplexity_len_1024": 79.64513212710933, "val/loss_avg_len_512": 4.417377031392325, "val/perplexity_len_512": 82.87861201041235} +{"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 6190.319736288977, "val/train_update_time": 3459.312256404897, "val/loss": 4.333859468727117, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.45447246590629, "val/val_tokens_per_second": 452824.4859914303, "val/loss_avg_len_2048": 4.333859468727117, "val/perplexity_len_2048": 76.23795751058033, "val/loss_avg_len_1024": 4.35671562793199, "val/perplexity_len_1024": 78.00053049871111, "val/loss_avg_len_512": 4.398111169292033, "val/perplexity_len_512": 81.2971669669832} +{"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 6392.506873650011, "val/train_update_time": 3570.8129316339036, "val/loss": 4.315572682284401, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.48178494896274, "val/val_tokens_per_second": 452687.7981364309, "val/loss_avg_len_2048": 4.315572682284401, "val/perplexity_len_2048": 74.85648015118835, "val/loss_avg_len_1024": 4.339295653387905, "val/perplexity_len_1024": 76.65352966583826, "val/loss_avg_len_512": 4.382252874559444, "val/perplexity_len_512": 80.01810123837882} +{"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 6594.719191052951, "val/train_update_time": 3682.304503756459, "val/loss": 4.299363855137489, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.58356982597616, "val/val_tokens_per_second": 452179.1322498103, "val/loss_avg_len_2048": 4.299363855137489, "val/perplexity_len_2048": 73.65292486373085, "val/loss_avg_len_1024": 4.323835089938761, "val/perplexity_len_1024": 75.47753708539467, "val/loss_avg_len_512": 4.368103061350993, "val/perplexity_len_512": 78.89383290411739} +{"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 6797.037191045936, "val/train_update_time": 3793.7997314956738, "val/loss": 4.2817527375663165, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.48214890307281, "val/val_tokens_per_second": 452685.97725146404, "val/loss_avg_len_2048": 4.2817527375663165, "val/perplexity_len_2048": 72.36716956972212, "val/loss_avg_len_1024": 4.30717062690584, "val/perplexity_len_1024": 74.23016670416823, "val/loss_avg_len_512": 4.353055857041944, "val/perplexity_len_512": 77.71558815766758} +{"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 6999.246100232936, "val/train_update_time": 3905.29758988251, "val/loss": 4.266338244345109, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.41607927205041, "val/val_tokens_per_second": 453016.76792196004, "val/loss_avg_len_2048": 4.266338244345109, "val/perplexity_len_2048": 71.26021977810917, "val/loss_avg_len_1024": 4.292485382087575, "val/perplexity_len_1024": 73.14804360172089, "val/loss_avg_len_512": 4.339678274843562, "val/perplexity_len_512": 76.68286456266289} +{"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 7201.8562647239305, "val/train_update_time": 4016.812845747103, "val/loss": 4.252442864475725, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.41256658593193, "val/val_tokens_per_second": 453034.36841459293, "val/loss_avg_len_2048": 4.252442864475725, "val/perplexity_len_2048": 70.27687971857087, "val/loss_avg_len_1024": 4.279593376897369, "val/perplexity_len_1024": 72.2110713468667, "val/loss_avg_len_512": 4.328346583297849, "val/perplexity_len_512": 75.81882276789578} +{"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 7404.001054160995, "val/train_update_time": 4128.298564241966, "val/loss": 4.239966365041072, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.46125932002906, "val/val_tokens_per_second": 452790.512843668, "val/loss_avg_len_2048": 4.239966365041072, "val/perplexity_len_2048": 69.40551734776977, "val/loss_avg_len_1024": 4.2677443491777405, "val/perplexity_len_1024": 71.36048959593073, "val/loss_avg_len_512": 4.317693537768815, "val/perplexity_len_512": 75.015408400013} +{"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 7606.181935134926, "val/train_update_time": 4239.792019490269, "val/loss": 4.228949285643083, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.43556012096815, "val/val_tokens_per_second": 452919.18295426277, "val/loss_avg_len_2048": 4.228949285643083, "val/perplexity_len_2048": 68.64506791008688, "val/loss_avg_len_1024": 4.257730329607474, "val/perplexity_len_1024": 70.64945037883355, "val/loss_avg_len_512": 4.309273659745045, "val/perplexity_len_512": 74.38643944816678} +{"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 7808.319425186957, "val/train_update_time": 4351.274368522107, "val/loss": 4.218897107815556, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.45598102302756, "val/val_tokens_per_second": 452816.93412371184, "val/loss_avg_len_2048": 4.218897107815556, "val/perplexity_len_2048": 67.95849205309356, "val/loss_avg_len_1024": 4.248032534746873, "val/perplexity_len_1024": 69.96761798110202, "val/loss_avg_len_512": 4.300288976043649, "val/perplexity_len_512": 73.72109425192328} +{"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 8010.43765124795, "val/train_update_time": 4462.722712960327, "val/loss": 4.210539670823771, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.42771953193005, "val/val_tokens_per_second": 452958.453580564, "val/loss_avg_len_2048": 4.210539670823771, "val/perplexity_len_2048": 67.39289997982114, "val/loss_avg_len_1024": 4.2402994022696285, "val/perplexity_len_1024": 69.42863581835583, "val/loss_avg_len_512": 4.293656023380068, "val/perplexity_len_512": 73.23372386266715} +{"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 8212.996373801958, "val/train_update_time": 4574.1805559834465, "val/loss": 4.2029221291968835, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.4471801869804, "val/val_tokens_per_second": 452860.99484056735, "val/loss_avg_len_2048": 4.2029221291968835, "val/perplexity_len_2048": 66.88148210534082, "val/loss_avg_len_1024": 4.232967852277168, "val/perplexity_len_1024": 68.92147770319531, "val/loss_avg_len_512": 4.28687209741557, "val/perplexity_len_512": 72.7385930661049} +{"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 8415.15657795791, "val/train_update_time": 4685.668182700407, "val/loss": 4.196753646510444, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.46505224099383, "val/val_tokens_per_second": 452771.5287322762, "val/loss_avg_len_2048": 4.196753646510444, "val/perplexity_len_2048": 66.4701946548187, "val/loss_avg_len_1024": 4.227167845540075, "val/perplexity_len_1024": 68.5228896921381, "val/loss_avg_len_512": 4.281677430549357, "val/perplexity_len_512": 72.36172001925331} +{"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 8617.329412178951, "val/train_update_time": 4797.150553135551, "val/loss": 4.191815680753463, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.45923761592712, "val/val_tokens_per_second": 452800.63241200906, "val/loss_avg_len_2048": 4.191815680753463, "val/perplexity_len_2048": 66.1427761656977, "val/loss_avg_len_1024": 4.222529412544612, "val/perplexity_len_1024": 68.20578685830964, "val/loss_avg_len_512": 4.277603877550364, "val/perplexity_len_512": 72.06755028244689} +{"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 8819.515494428924, "val/train_update_time": 4908.649304769933, "val/loss": 4.187887623292348, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.85943013709038, "val/val_tokens_per_second": 450806.2612565234, "val/loss_avg_len_2048": 4.187887623292348, "val/perplexity_len_2048": 65.88347315226201, "val/loss_avg_len_1024": 4.218744200249576, "val/perplexity_len_1024": 67.94810147990495, "val/loss_avg_len_512": 4.274144274820015, "val/perplexity_len_512": 71.81865597468175} +{"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 9022.126939511974, "val/train_update_time": 5020.160163923283, "val/loss": 4.1849501522684704, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.70509503502399, "val/val_tokens_per_second": 451573.3100128951, "val/loss_avg_len_2048": 4.1849501522684704, "val/perplexity_len_2048": 65.69022632635689, "val/loss_avg_len_1024": 4.216048343973933, "val/perplexity_len_1024": 67.7651698535888, "val/loss_avg_len_512": 4.271848099075444, "val/perplexity_len_512": 71.6539369031657} +{"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 9224.990330599947, "val/train_update_time": 5131.661126127117, "val/loss": 4.182838360282803, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.65441882296, "val/val_tokens_per_second": 451825.74144555745, "val/loss_avg_len_2048": 4.182838360282803, "val/perplexity_len_2048": 65.55164860802255, "val/loss_avg_len_1024": 4.214113749123505, "val/perplexity_len_1024": 67.63419843411694, "val/loss_avg_len_512": 4.270228833553196, "val/perplexity_len_512": 71.53800404197908} +{"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 9427.379897051957, "val/train_update_time": 5243.171735763899, "val/loss": 4.181507043302083, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.67323047306854, "val/val_tokens_per_second": 451732.00277854665, "val/loss_avg_len_2048": 4.181507043302083, "val/perplexity_len_2048": 65.46443665137711, "val/loss_avg_len_1024": 4.212834281807114, "val/perplexity_len_1024": 67.54771802397694, "val/loss_avg_len_512": 4.269042033359408, "val/perplexity_len_512": 71.45315308544407} +{"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 9629.798734208918, "val/train_update_time": 5354.673169916612, "val/loss": 4.1807237462811635, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.82715408504009, "val/val_tokens_per_second": 450966.45835285966, "val/loss_avg_len_2048": 4.1807237462811635, "val/perplexity_len_2048": 65.4131786309201, "val/loss_avg_len_1024": 4.212053486323543, "val/perplexity_len_1024": 67.49499765540955, "val/loss_avg_len_512": 4.268303510677349, "val/perplexity_len_512": 71.40040279222738} +{"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 9832.331430819933, "val/train_update_time": 5466.160342909512, "val/loss": 4.180392267811508, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.65315544593614, "val/val_tokens_per_second": 451832.0382617877, "val/loss_avg_len_2048": 4.180392267811508, "val/perplexity_len_2048": 65.39149916390902, "val/loss_avg_len_1024": 4.211774798628548, "val/perplexity_len_1024": 67.47619025090711, "val/loss_avg_len_512": 4.26806066927705, "val/perplexity_len_512": 71.38306592357135} diff --git a/metrics/jsonlines/val_data_info.jsonl b/metrics/jsonlines/val_data_info.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8d2a41d2d94f5b2005b74e9163cd291dacf51e5d --- /dev/null +++ b/metrics/jsonlines/val_data_info.jsonl @@ -0,0 +1 @@ +{"step": 0, "val_data_info/vocab_size": 50277, "val_data_info/global_tokens_per_batch": 2048, "val_data_info/local_tokens_per_batch": 2048, "val_data_info/batch_len": 2048, "val_data_info/seq_len": 2048, "val_data_info/total_tokens": 2147483648, "val_data_info/global_batch_size": 1, "val_data_info/local_batch_size": 1} diff --git a/metrics/npz/train_eval/step-000000104857600.npz b/metrics/npz/train_eval/step-000000104857600.npz new file mode 100644 index 0000000000000000000000000000000000000000..482628f2641c3775cbeef42634c6a8e3a3b4b21c --- /dev/null +++ b/metrics/npz/train_eval/step-000000104857600.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5edbc3e504381f9fc3799b6feeff24af725eb43cc535c19ea5f9d9c54b3ac325 +size 20540 diff --git a/metrics/npz/train_eval/step-000000209715200.npz b/metrics/npz/train_eval/step-000000209715200.npz new file mode 100644 index 0000000000000000000000000000000000000000..1964c8a4a610b919ab609c98c8544a873948d526 --- /dev/null +++ b/metrics/npz/train_eval/step-000000209715200.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73d9de1b64eb9c0715c80d1997ae4bd0e5e8c7af00a76ff6215ba7ff8bf5ac66 +size 20540 diff --git a/metrics/npz/train_eval/step-000000314572800.npz b/metrics/npz/train_eval/step-000000314572800.npz new file mode 100644 index 0000000000000000000000000000000000000000..e645278739d64699f61d11f926f2d74e95330d24 --- /dev/null +++ b/metrics/npz/train_eval/step-000000314572800.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70180d7b7541c0cf7a687ecc7d6a84011589df1275efc744d4d677a11eace5a5 +size 20540 diff --git a/metrics/npz/train_eval/step-000000419430400.npz b/metrics/npz/train_eval/step-000000419430400.npz new file mode 100644 index 0000000000000000000000000000000000000000..f566358077f22f706e8595a4e84147f79b67751c --- /dev/null +++ b/metrics/npz/train_eval/step-000000419430400.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f893e858cb0c3bd798d509e0174d4b84ad746c7b75a0fd6aedbc7bab5da33ade +size 20540 diff --git a/metrics/npz/train_eval/step-000000524288000.npz b/metrics/npz/train_eval/step-000000524288000.npz new file mode 100644 index 0000000000000000000000000000000000000000..f523d841ae757a19643328102bd68dd201dbc0b2 --- /dev/null +++ b/metrics/npz/train_eval/step-000000524288000.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60e6709458b20220f5936f11cd683644dbe6097e02b830d0e348a286e4de0fce +size 20540 diff --git a/metrics/npz/train_eval/step-000000629145600.npz b/metrics/npz/train_eval/step-000000629145600.npz new file mode 100644 index 0000000000000000000000000000000000000000..f382685a164a9419e5eccfae2c5b953755cac33f --- /dev/null +++ b/metrics/npz/train_eval/step-000000629145600.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea451aeb881837b428d9f86d60ef9be59f4ead3dc6ea7860bba27235d172fa34 +size 20540 diff --git a/metrics/npz/train_eval/step-000000734003200.npz b/metrics/npz/train_eval/step-000000734003200.npz new file mode 100644 index 0000000000000000000000000000000000000000..aab125767ca42d71c5ea0d3f6c5080d9d913267c --- /dev/null +++ b/metrics/npz/train_eval/step-000000734003200.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:051593b9ecff5ee5272e4905f5b7e9dfde584d63d1bd980dd0016e653453c521 +size 20540 diff --git a/metrics/npz/train_eval/step-000000838860800.npz b/metrics/npz/train_eval/step-000000838860800.npz new file mode 100644 index 0000000000000000000000000000000000000000..9ad6aec3eeef7f50654c048dd5adf1ccf6e02ee9 --- /dev/null +++ b/metrics/npz/train_eval/step-000000838860800.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34a7a8a1591c14db6ad8e58c2f55bb3d145e095b2cb3ee9f32b0d775d5cb8af8 +size 20540 diff --git a/metrics/npz/train_eval/step-000000943718400.npz b/metrics/npz/train_eval/step-000000943718400.npz new file mode 100644 index 0000000000000000000000000000000000000000..b7e9c00702b34e9acad98eeb167a38d9cf47fc29 --- /dev/null +++ b/metrics/npz/train_eval/step-000000943718400.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:225d3328c97269ad52002d6696e55b7f38b9d0a3bf5156893baf3f58527a7717 +size 20540 diff --git a/metrics/npz/train_eval/step-000001048576000.npz b/metrics/npz/train_eval/step-000001048576000.npz new file mode 100644 index 0000000000000000000000000000000000000000..ddb4974086b6864cce82c4a4ada0b1ad913bf58c --- /dev/null +++ b/metrics/npz/train_eval/step-000001048576000.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b2053fdb1eae5e3c9dab60edc3cc5e528d7d053177005b9d579eae510debac5 +size 20540 diff --git a/metrics/npz/train_eval/step-000001153433600.npz b/metrics/npz/train_eval/step-000001153433600.npz new file mode 100644 index 0000000000000000000000000000000000000000..774a9f9eb36bd3c33cd4b748737035e377c47b04 --- /dev/null +++ b/metrics/npz/train_eval/step-000001153433600.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:755a20cdf5f7844d6962857b3db8ef0ffc8b890190cdbd8dd8127a73280a64c9 +size 20540 diff --git a/metrics/npz/train_eval/step-000001258291200.npz b/metrics/npz/train_eval/step-000001258291200.npz new file mode 100644 index 0000000000000000000000000000000000000000..8fd8beb4abc7702c9f799f22b6af8b1f9a90e278 --- /dev/null +++ b/metrics/npz/train_eval/step-000001258291200.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7b974770a4fd5fdc590d66a2ca782c4737d62b5401bbd2d04c3c62c488fb154 +size 20540 diff --git a/metrics/npz/train_eval/step-000001363148800.npz b/metrics/npz/train_eval/step-000001363148800.npz new file mode 100644 index 0000000000000000000000000000000000000000..9ab5fd76e3224906613aad38b1ab0e64cad109f2 --- /dev/null +++ b/metrics/npz/train_eval/step-000001363148800.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcb28c86ebbb0d4f571e7f9959c74e7661bb281bca3c14cd55ec76c187dc0747 +size 20540 diff --git a/metrics/npz/train_eval/step-000001468006400.npz b/metrics/npz/train_eval/step-000001468006400.npz new file mode 100644 index 0000000000000000000000000000000000000000..d11e53d93283ef344342cacc122402b7a5557fc1 --- /dev/null +++ b/metrics/npz/train_eval/step-000001468006400.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:603d927206f3f46097fe560058ae29f9920e32393ee6fc44bbeb6d6044d40f8f +size 20540 diff --git a/metrics/npz/train_eval/step-000001572864000.npz b/metrics/npz/train_eval/step-000001572864000.npz new file mode 100644 index 0000000000000000000000000000000000000000..4c122eb308641f080e01b5dd4a7b9ae6363fecda --- /dev/null +++ b/metrics/npz/train_eval/step-000001572864000.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:977ccda41e55d7909257133321a4487d933c853ab4987e2b772725fe784e361b +size 20540 diff --git a/metrics/npz/train_eval/step-000001677721600.npz b/metrics/npz/train_eval/step-000001677721600.npz new file mode 100644 index 0000000000000000000000000000000000000000..eb195caf5991252ea871bffbc12a16ab67426884 --- /dev/null +++ b/metrics/npz/train_eval/step-000001677721600.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8fb5bdcbaef7f1dbd2897f70fa5bb6ec5878ac7250de920bf1b96af428f9a75 +size 20540 diff --git a/metrics/npz/train_eval/step-000001782579200.npz b/metrics/npz/train_eval/step-000001782579200.npz new file mode 100644 index 0000000000000000000000000000000000000000..de7cd0ed854bb9dcb6fbd98985c28b36691fd564 --- /dev/null +++ b/metrics/npz/train_eval/step-000001782579200.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5603085ded395eae51caa3cd21baa751c1e203334b84876e690a13243c0f70c3 +size 20540 diff --git a/metrics/npz/train_eval/step-000001887436800.npz b/metrics/npz/train_eval/step-000001887436800.npz new file mode 100644 index 0000000000000000000000000000000000000000..cd1434d63db5c2f2877c95b874d2b67f64a565e3 --- /dev/null +++ b/metrics/npz/train_eval/step-000001887436800.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:103ed4f6e1c205fabe560b534e496a0007b8ab2244f0bf8696f0d2ba5fb81571 +size 20540 diff --git a/metrics/npz/train_eval/step-000001992294400.npz b/metrics/npz/train_eval/step-000001992294400.npz new file mode 100644 index 0000000000000000000000000000000000000000..4242fce0dc2b7f6c310ed725642b459d939f91ec --- /dev/null +++ b/metrics/npz/train_eval/step-000001992294400.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb5c2f599eb410262d4b2b8d060dc2854147206566f5a8d7a284864bde0050f3 +size 20540 diff --git a/metrics/npz/val/step-000000041943040.npz b/metrics/npz/val/step-000000041943040.npz new file mode 100644 index 0000000000000000000000000000000000000000..1ba84ade9007c3ba2405b12236ac8b42a8e142a8 --- /dev/null +++ b/metrics/npz/val/step-000000041943040.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b31705945b206887579a2f32f502994e5eec2cbedc32712d55845fccc796317 +size 21142 diff --git a/metrics/npz/val/step-000000083886080.npz b/metrics/npz/val/step-000000083886080.npz new file mode 100644 index 0000000000000000000000000000000000000000..124b637ee3a14cbfc06e412937875449a499ba37 --- /dev/null +++ b/metrics/npz/val/step-000000083886080.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e8d258157d92292d26fed1e94d60fe52be847043cc500b7dd4f235d405530b3 +size 21142 diff --git a/metrics/npz/val/step-000000125829120.npz b/metrics/npz/val/step-000000125829120.npz new file mode 100644 index 0000000000000000000000000000000000000000..36cc6006a9aca5d5974a8bb20888ec42fa0d4a24 --- /dev/null +++ b/metrics/npz/val/step-000000125829120.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff04544ccc6b5a6cce8d1e8c7df20578f07fd17d73f4f2ebeb21a5b91227dae1 +size 21142 diff --git a/metrics/npz/val/step-000000167772160.npz b/metrics/npz/val/step-000000167772160.npz new file mode 100644 index 0000000000000000000000000000000000000000..53b76e808564ce0f522d41bed12cffa1357545bd --- /dev/null +++ b/metrics/npz/val/step-000000167772160.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c2f69500bb0250577fa0888291e7cfb87e24c85abd01e95c184c273e7a331ab +size 21142 diff --git a/metrics/npz/val/step-000000209715200.npz b/metrics/npz/val/step-000000209715200.npz new file mode 100644 index 0000000000000000000000000000000000000000..f2c643d3a52366224d9f57297edfa490053a93e2 --- /dev/null +++ b/metrics/npz/val/step-000000209715200.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4eaa1557a638ae0f39bf544e5a81d4d61215261fd11f3dcb4e88a028ef2e07c0 +size 21142 diff --git a/metrics/npz/val/step-000000251658240.npz b/metrics/npz/val/step-000000251658240.npz new file mode 100644 index 0000000000000000000000000000000000000000..e49e02b159a95f6e789f03fc7bfe4d79f7bd8fe9 --- /dev/null +++ b/metrics/npz/val/step-000000251658240.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f4cc7054fd8f9cbbe676d0c86e696057946aa909163af1e47d82ba4c6693834 +size 21142 diff --git a/metrics/npz/val/step-000000293601280.npz b/metrics/npz/val/step-000000293601280.npz new file mode 100644 index 0000000000000000000000000000000000000000..0ee75bfbc74fcb314640f51c5925abbf2fd05b53 --- /dev/null +++ b/metrics/npz/val/step-000000293601280.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab5c1faec2d34145cd3ff966023f647f5416f10f4af93386c26c7398f20cdae9 +size 21142 diff --git a/metrics/npz/val/step-000000335544320.npz b/metrics/npz/val/step-000000335544320.npz new file mode 100644 index 0000000000000000000000000000000000000000..80de774d09b250b433532290708ec41a8e0af942 --- /dev/null +++ b/metrics/npz/val/step-000000335544320.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d3238aad3ad904343ec69a56748aa4006e8bd74cd1362c7933bb09430f61e41 +size 21142 diff --git a/metrics/npz/val/step-000000377487360.npz b/metrics/npz/val/step-000000377487360.npz new file mode 100644 index 0000000000000000000000000000000000000000..5fe01e889db309a3bf81b36ce89b595e1e48ddfb --- /dev/null +++ b/metrics/npz/val/step-000000377487360.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80b62e66d6a272f81e4c853d1752f69f04dd570cc910f50651f4787982fbc2d1 +size 21142 diff --git a/metrics/npz/val/step-000000419430400.npz b/metrics/npz/val/step-000000419430400.npz new file mode 100644 index 0000000000000000000000000000000000000000..2369c46b9ae372fa8458c8788eb20df0b3828562 --- /dev/null +++ b/metrics/npz/val/step-000000419430400.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8077ee538c294547151c22e87f61273bd09d67d7d5e6477122065f23afcb3ca2 +size 21142 diff --git a/metrics/npz/val/step-000000461373440.npz b/metrics/npz/val/step-000000461373440.npz new file mode 100644 index 0000000000000000000000000000000000000000..aeccfa256f255090a2093e46782356ad55ed9df0 --- /dev/null +++ b/metrics/npz/val/step-000000461373440.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d245da26d075971f32066e30188be264e75c3caaef5bf77a63a02bdfb74b6edb +size 21142 diff --git a/metrics/npz/val/step-000000503316480.npz b/metrics/npz/val/step-000000503316480.npz new file mode 100644 index 0000000000000000000000000000000000000000..bdd4f89dad9e9b0cf081a6e8e4946aeef883ee39 --- /dev/null +++ b/metrics/npz/val/step-000000503316480.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd40c6b10511911c0bc2c716ac1850ce0c4ed40a467e6a75c8fa6967b01f038c +size 21142 diff --git a/metrics/npz/val/step-000000545259520.npz b/metrics/npz/val/step-000000545259520.npz new file mode 100644 index 0000000000000000000000000000000000000000..0a8d9618de8b02d5f3ba776527762ee4cd8ee34c --- /dev/null +++ b/metrics/npz/val/step-000000545259520.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b03755cdb02635f0c9374fa03ad78afcf1a36104e8953673d2218d81e35603e3 +size 21142 diff --git a/metrics/npz/val/step-000000587202560.npz b/metrics/npz/val/step-000000587202560.npz new file mode 100644 index 0000000000000000000000000000000000000000..de5db9f3cdea468636af3a604ff4fbadffac435c --- /dev/null +++ b/metrics/npz/val/step-000000587202560.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5811e74db8a8f034ce837d3fb4be289484b055d200905001ad6de432dcd4ffa8 +size 21142 diff --git a/metrics/npz/val/step-000000629145600.npz b/metrics/npz/val/step-000000629145600.npz new file mode 100644 index 0000000000000000000000000000000000000000..289dfa4522c264b73a037339f55ce4619f6352d4 --- /dev/null +++ b/metrics/npz/val/step-000000629145600.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d85ffdd4e9f80b7a2ef66f64ef2f19c09ec2de7a94f6878d16a6803030c3d348 +size 21142 diff --git a/metrics/npz/val/step-000000671088640.npz b/metrics/npz/val/step-000000671088640.npz new file mode 100644 index 0000000000000000000000000000000000000000..5f88e5aacf8645973dbf7caa63f0484571abf67b --- /dev/null +++ b/metrics/npz/val/step-000000671088640.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39fde6eb409465daaa8182d17200c741fc3575b687296a4746f0e8c29ec99b68 +size 21142 diff --git a/metrics/npz/val/step-000000713031680.npz b/metrics/npz/val/step-000000713031680.npz new file mode 100644 index 0000000000000000000000000000000000000000..68c1a0de3ea0dac9cd3372a93251617762e5536f --- /dev/null +++ b/metrics/npz/val/step-000000713031680.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:197a34a9bde84eb99feb800070f55935148edb5c11eac8a5c48941d234d78d12 +size 21142 diff --git a/metrics/npz/val/step-000000754974720.npz b/metrics/npz/val/step-000000754974720.npz new file mode 100644 index 0000000000000000000000000000000000000000..feaadebe6c561fb1cd090dc97f9a4a2af82c3ec4 --- /dev/null +++ b/metrics/npz/val/step-000000754974720.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c9318378d8cd38a06bfdfcf3220efa63ed6f7167d0b0bc760ea6c2c93c65c3f +size 21142 diff --git a/metrics/npz/val/step-000000796917760.npz b/metrics/npz/val/step-000000796917760.npz new file mode 100644 index 0000000000000000000000000000000000000000..e71f7ab949bc4902c3e362b25bcca9de0fc8d888 --- /dev/null +++ b/metrics/npz/val/step-000000796917760.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cefa0a5c00a2f8ab2b0f7af3772e6f044334001bdac3d60d669a19a34df06ccb +size 21142 diff --git a/metrics/npz/val/step-000000838860800.npz b/metrics/npz/val/step-000000838860800.npz new file mode 100644 index 0000000000000000000000000000000000000000..a086e2bb6333a4c7ef3ce0c0f7daa47c4b4bd40b --- /dev/null +++ b/metrics/npz/val/step-000000838860800.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a8594fabd68c7420cc26d1e6e397881c79988d1752500a314e35561a5c5bf98 +size 21142 diff --git a/metrics/npz/val/step-000000880803840.npz b/metrics/npz/val/step-000000880803840.npz new file mode 100644 index 0000000000000000000000000000000000000000..6daf785615ce338d17ac17ae0b9b3f67ebdd2161 --- /dev/null +++ b/metrics/npz/val/step-000000880803840.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2726a3f9b014688187bf6b0fbb895ff22175981c2609cd77c3eeae4a8466d8e +size 21142 diff --git a/metrics/npz/val/step-000000922746880.npz b/metrics/npz/val/step-000000922746880.npz new file mode 100644 index 0000000000000000000000000000000000000000..77b1439b73bcbd240839e25c1123ac0a5f2a40a4 --- /dev/null +++ b/metrics/npz/val/step-000000922746880.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:995ec445ae694d3dc987a403bccf692adcff0546a6bcbbeb2e0f6fa1c956ae5c +size 21142 diff --git a/metrics/npz/val/step-000000964689920.npz b/metrics/npz/val/step-000000964689920.npz new file mode 100644 index 0000000000000000000000000000000000000000..19a9445ae993f3049c2d43ad72ccb82b814c6128 --- /dev/null +++ b/metrics/npz/val/step-000000964689920.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b879766e7c6199c7e54ee66544c3bc8fa58361871ead7f28645984339e197f60 +size 21142 diff --git a/metrics/npz/val/step-000001006632960.npz b/metrics/npz/val/step-000001006632960.npz new file mode 100644 index 0000000000000000000000000000000000000000..6d8085effdba85303e75a6fd18bed4146d656221 --- /dev/null +++ b/metrics/npz/val/step-000001006632960.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9604915acd8367a458060d16b904062edb825ad6c21c4cf3ee607637a3d917bb +size 21142 diff --git a/metrics/npz/val/step-000001048576000.npz b/metrics/npz/val/step-000001048576000.npz new file mode 100644 index 0000000000000000000000000000000000000000..fc9c36a8a97f5bd2e111a181f541c6b87f2ac644 --- /dev/null +++ b/metrics/npz/val/step-000001048576000.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57742b6c77c1d9a625665299687292d83226e077a223ed68718661113260c4f8 +size 21142 diff --git a/metrics/npz/val/step-000001090519040.npz b/metrics/npz/val/step-000001090519040.npz new file mode 100644 index 0000000000000000000000000000000000000000..01b647f74ea85f063aa7125208f61a8179059b86 --- /dev/null +++ b/metrics/npz/val/step-000001090519040.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44551e75c7e23831cd4028092c4b5d2927546128956a66c25a6f13763aee3672 +size 21142 diff --git a/metrics/npz/val/step-000001132462080.npz b/metrics/npz/val/step-000001132462080.npz new file mode 100644 index 0000000000000000000000000000000000000000..4bca45d9dd6bb36a23c4366fc38a066075f79ec4 --- /dev/null +++ b/metrics/npz/val/step-000001132462080.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cafad3452ba3f03bd20f351bbfa7f8f2a45347387dfa806cdee643cbe99a1d8 +size 21142 diff --git a/metrics/npz/val/step-000001174405120.npz b/metrics/npz/val/step-000001174405120.npz new file mode 100644 index 0000000000000000000000000000000000000000..ff6672394c1acaec28855943b90d827e99140d99 --- /dev/null +++ b/metrics/npz/val/step-000001174405120.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:558b1e7d7b592ea4158314927922e5b3986957363c724665458d8c6b05b3ddff +size 21142 diff --git a/metrics/npz/val/step-000001216348160.npz b/metrics/npz/val/step-000001216348160.npz new file mode 100644 index 0000000000000000000000000000000000000000..c1a2c7752964691bd0f048f0ab602d90c7bb9db9 --- /dev/null +++ b/metrics/npz/val/step-000001216348160.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52cbf30de30afa313d12ce5d591055ab85ca2d260220675772d50bbb79088370 +size 21142 diff --git a/metrics/npz/val/step-000001258291200.npz b/metrics/npz/val/step-000001258291200.npz new file mode 100644 index 0000000000000000000000000000000000000000..98dd228ce0394b2a916dc771d5a2cf16d86c08e1 --- /dev/null +++ b/metrics/npz/val/step-000001258291200.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92864bf158d5a40171dadcc7527c2d29e5a7328e2c0f89dc39635ca1aff4e7c8 +size 21142 diff --git a/metrics/npz/val/step-000001300234240.npz b/metrics/npz/val/step-000001300234240.npz new file mode 100644 index 0000000000000000000000000000000000000000..f7486d92062f345658f0bb641368568fb4a3e6b5 --- /dev/null +++ b/metrics/npz/val/step-000001300234240.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e51f55aa7034b8e4825a9ed454e00d81709850b0395179f160cd06e02de64a7a +size 21142 diff --git a/metrics/npz/val/step-000001342177280.npz b/metrics/npz/val/step-000001342177280.npz new file mode 100644 index 0000000000000000000000000000000000000000..1768123c2d050c79961c7e2379df820564981c86 --- /dev/null +++ b/metrics/npz/val/step-000001342177280.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef848a7db4b517b430ceb6acd99a7c6b5009bc3da71c7859748d13f93d05fa16 +size 21142 diff --git a/metrics/npz/val/step-000001384120320.npz b/metrics/npz/val/step-000001384120320.npz new file mode 100644 index 0000000000000000000000000000000000000000..1570e5c503d48621282c3c60dea68bcf55f15055 --- /dev/null +++ b/metrics/npz/val/step-000001384120320.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:468b40988f9942747141bf1ba560c9cbd1e86a1e2568ddf86d3aa1d1181b727a +size 21142 diff --git a/metrics/npz/val/step-000001426063360.npz b/metrics/npz/val/step-000001426063360.npz new file mode 100644 index 0000000000000000000000000000000000000000..b715a859e6c0a45afe749ac5ba50d08fb897da5e --- /dev/null +++ b/metrics/npz/val/step-000001426063360.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:351c929186f9ea2690337bca3477d9394f3b775bb2605ad46c76ba546efc8b3b +size 21142 diff --git a/metrics/npz/val/step-000001468006400.npz b/metrics/npz/val/step-000001468006400.npz new file mode 100644 index 0000000000000000000000000000000000000000..146ee917870bd6ea0b7ca90bb82721003a08cb4e --- /dev/null +++ b/metrics/npz/val/step-000001468006400.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a768c9bdf6259f83e99f51abf58f0ca35b26a98634c1b3392fbb285fcb360821 +size 21142 diff --git a/metrics/npz/val/step-000001509949440.npz b/metrics/npz/val/step-000001509949440.npz new file mode 100644 index 0000000000000000000000000000000000000000..1c250f3d719bae5b71984f192f24a685acf18db3 --- /dev/null +++ b/metrics/npz/val/step-000001509949440.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfdf529a3d2d31aca56cee18474c40c09b1abcb116e8536888b64c927f7bf27c +size 21142 diff --git a/metrics/npz/val/step-000001551892480.npz b/metrics/npz/val/step-000001551892480.npz new file mode 100644 index 0000000000000000000000000000000000000000..221efb926aed614456cd356bb6dc6cb77e23ad87 --- /dev/null +++ b/metrics/npz/val/step-000001551892480.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d9c57698c688e4020e2688cd25a9c5826942a05162993241a429ade032e31b6 +size 21142 diff --git a/metrics/npz/val/step-000001593835520.npz b/metrics/npz/val/step-000001593835520.npz new file mode 100644 index 0000000000000000000000000000000000000000..74292455ea3ae60bc9499664f42308f17af55864 --- /dev/null +++ b/metrics/npz/val/step-000001593835520.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:065cebfb00d468e2622dc72814b7cfff38891608b03bf65b0f06f628096f9a89 +size 21142 diff --git a/metrics/npz/val/step-000001635778560.npz b/metrics/npz/val/step-000001635778560.npz new file mode 100644 index 0000000000000000000000000000000000000000..7c96ab009b4c1a82c98db4b599943bff7572f06f --- /dev/null +++ b/metrics/npz/val/step-000001635778560.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2634329ec6c251f274c0f7799651a6e78e91a59e1a14930372daeaf3dedf31f7 +size 21142 diff --git a/metrics/npz/val/step-000001677721600.npz b/metrics/npz/val/step-000001677721600.npz new file mode 100644 index 0000000000000000000000000000000000000000..49b79a1b8e0783c3a519ead8a5b2fb6e953685b2 --- /dev/null +++ b/metrics/npz/val/step-000001677721600.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d35581bcb2fe7ad5aa3b2272b62e8360075f87807a0a9fa3bc1e94380061f73 +size 21142 diff --git a/metrics/npz/val/step-000001719664640.npz b/metrics/npz/val/step-000001719664640.npz new file mode 100644 index 0000000000000000000000000000000000000000..2146668c538cc804f762b7a51b03d5a79702a83c --- /dev/null +++ b/metrics/npz/val/step-000001719664640.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6c32a78fa662a8564c92876cebaa175c5e0917db94037819b3ae228d2652049 +size 21142 diff --git a/metrics/npz/val/step-000001761607680.npz b/metrics/npz/val/step-000001761607680.npz new file mode 100644 index 0000000000000000000000000000000000000000..c0e1c85b4b7f5bae1e190d27431d9dce0bb12f5a --- /dev/null +++ b/metrics/npz/val/step-000001761607680.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebbfe70159355e2b345f86f889591817b73061f83fc8c0a120412f2843343158 +size 21142 diff --git a/metrics/npz/val/step-000001803550720.npz b/metrics/npz/val/step-000001803550720.npz new file mode 100644 index 0000000000000000000000000000000000000000..3f00249c328599670d4659dd16741c9429ee3d0d --- /dev/null +++ b/metrics/npz/val/step-000001803550720.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac986eaddce2e4d2424283dac5c8a281ba93c20be63c982f73206740fb3261fb +size 21142 diff --git a/metrics/npz/val/step-000001845493760.npz b/metrics/npz/val/step-000001845493760.npz new file mode 100644 index 0000000000000000000000000000000000000000..49bcddf5e1514e1ae6ac1ba827d2b82e094c4ed7 --- /dev/null +++ b/metrics/npz/val/step-000001845493760.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82d76a621ee366f5d31a831ef56f0bec345e010246d1558e4fcc612854e5aba2 +size 21142 diff --git a/metrics/npz/val/step-000001887436800.npz b/metrics/npz/val/step-000001887436800.npz new file mode 100644 index 0000000000000000000000000000000000000000..519b52afca074d5df78e78204ce4173e60d1677b --- /dev/null +++ b/metrics/npz/val/step-000001887436800.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54064fb5f77655fb5269564468c918aab81bc428f3c433b6c62d234ccbca259b +size 21142 diff --git a/metrics/npz/val/step-000001929379840.npz b/metrics/npz/val/step-000001929379840.npz new file mode 100644 index 0000000000000000000000000000000000000000..6cd3f85dd7b53721df6025ee7155d802341a3973 --- /dev/null +++ b/metrics/npz/val/step-000001929379840.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fe840b94b6a8f5d39ca0ce7796f1bdf24402786b7240212403653050d288ed6 +size 21142 diff --git a/metrics/npz/val/step-000001971322880.npz b/metrics/npz/val/step-000001971322880.npz new file mode 100644 index 0000000000000000000000000000000000000000..90ea9ebea65ea4e47af301a2ebdafb65ed14c100 --- /dev/null +++ b/metrics/npz/val/step-000001971322880.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdbcd019a044c84ea4d618e785a5c2676b972e840c8c9ec282aab8167634f4c1 +size 21142 diff --git a/metrics/npz/val/step-000002013265920.npz b/metrics/npz/val/step-000002013265920.npz new file mode 100644 index 0000000000000000000000000000000000000000..1e0a4dc53a8dfc3c68954d12ef7a9bba106982ab --- /dev/null +++ b/metrics/npz/val/step-000002013265920.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa449756eb452358b1e352e304e54c59345d69603ab5710832b370fdeeee3aa2 +size 21142 diff --git a/metrics/npz/val/step-000002055208960.npz b/metrics/npz/val/step-000002055208960.npz new file mode 100644 index 0000000000000000000000000000000000000000..aae8b3f7a828b95f4fcd7e8cc37d683881c51af9 --- /dev/null +++ b/metrics/npz/val/step-000002055208960.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc379200b3dedb0c28d1fb159a024edde96ceedc380177c4fa083f790ab2fc2f +size 21142 diff --git a/metrics/wandb/wandb_run_id.txt b/metrics/wandb/wandb_run_id.txt new file mode 100644 index 0000000000000000000000000000000000000000..51cd485e9217d198cca484e788f1bf2f0236a3fc --- /dev/null +++ b/metrics/wandb/wandb_run_id.txt @@ -0,0 +1 @@ +lo4di2up \ No newline at end of file diff --git a/model.txt b/model.txt new file mode 100644 index 0000000000000000000000000000000000000000..d5a7180d0a8413695e4ff6cade7069e0e5926476 --- /dev/null +++ b/model.txt @@ -0,0 +1,24 @@ +AlibiForCausalLM( + (model): AlibiModel( + (embeddings): Embedding(50277, 256) + (layers): ModuleList( + (0-1): 2 x TransformerBlock( + (attn_norm): RMSNorm(256, eps=1e-06) + (attn): Attention( + (q_proj): Linear(in_features=256, out_features=256, bias=False) + (k_proj): Linear(in_features=256, out_features=256, bias=False) + (v_proj): Linear(in_features=256, out_features=256, bias=False) + (o_proj): Linear(in_features=256, out_features=256, bias=False) + ) + (mlp_norm): RMSNorm(256, eps=1e-06) + (mlp): TransformerMLP( + (gate_proj): Linear(in_features=256, out_features=1536, bias=False) + (down_proj): Linear(in_features=768, out_features=256, bias=False) + (act_fn): SiLU() + ) + ) + ) + (norm): RMSNorm(256, eps=1e-06) + ) + (lm_head): Linear(in_features=256, out_features=50277, bias=False) +) diff --git a/modeling_transformer.py b/modeling_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..b70f526e9824d2cf247dcac4ce13cf288351bddb --- /dev/null +++ b/modeling_transformer.py @@ -0,0 +1,573 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import functional as F +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_outputs import (BaseModelOutputWithPast, + CausalLMOutputWithPast) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import logging + +# from fla.layers.attn import Attention +from fla.modules import FusedCrossEntropyLoss, RMSNorm +from fla.modules.activations import swiglu_linear + +from fla.modules import RotaryEmbedding +try: + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import (index_first_axis, pad_input, + unpad_input) +except ImportError: + warnings.warn("Flash Attention is not installed. Please install it via `pip install flash-attn --no-build-isolation`") + flash_attn_func = None +from einops import rearrange + +from forgetting_transformer.model.transformer.configuration_transformer import TransformerConfig + +from functools import partial + +logger = logging.get_logger(__name__) + +class Attention(nn.Module): + + def __init__( + self, + hidden_size: int = 2048, + num_heads: int = 32, + num_kv_heads: Optional[int] = None, + window_size: Optional[int] = None, + max_position_embeddings: Optional[int] = None, + rope_base: float = 500000.0, + use_rope: bool = True, + layer_idx: int = None, + ): + super().__init__() + + self.num_heads = num_heads + if num_kv_heads is None: + self.num_kv_heads = self.num_heads + else: + self.num_kv_heads = num_kv_heads + self.num_kv_groups = num_heads // self.num_kv_heads + self.hidden_size = hidden_size + self.head_dim = self.hidden_size // self.num_heads + self.kv_dim = self.num_kv_heads * self.head_dim + self.kv_dim = self.num_kv_heads * self.head_dim + self.window_size = window_size + self.max_position_embeddings = max_position_embeddings + self.layer_idx = layer_idx + + self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.kv_dim, bias=False) + self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False) + + if use_rope: + self.rotary = RotaryEmbedding(self.head_dim, base=rope_base) + else: + self.rotary = None + + + self.apply(self._initialize_weights) + + def _initialize_weights(self, module: nn.Module): + pass + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + batch_size, q_len, _ = hidden_states.size() + q = rearrange(self.q_proj(hidden_states), '... (h d) -> ... h d', h=self.num_heads) + k = rearrange(self.k_proj(hidden_states), '... (h d) -> ... h d', h=self.num_kv_heads) + v = rearrange(self.v_proj(hidden_states), 'b t (h d) -> b h t d', h=self.num_kv_heads) + + seqlen_offset, max_seqlen = 0, q.shape[1] + if past_key_values is not None: + seqlen_offset = past_key_values.get_seq_length(self.layer_idx) + max_seqlen = q.shape[1] + seqlen_offset + + if attention_mask is not None: + # to deliminate the offsets of padding tokens + seqlen_offset = (seqlen_offset + attention_mask.sum(-1) - attention_mask.shape[-1]) + max_seqlen = q.shape[1] + max(seqlen_offset) + + if self.max_position_embeddings is not None: + max_seqlen = max(max_seqlen, self.max_position_embeddings) + if self.rotary is not None: + q, k = self.rotary(q, k, seqlen_offset, max_seqlen) + + k = rearrange(k, 'b t h d -> b h t d') + if past_key_values is not None: + k, v = past_key_values.update(k, v, self.layer_idx) + k, v = rearrange(k, 'b h t d -> b t h d'), rearrange(v, 'b h t d -> b t h d') + if self.num_kv_groups > 1: + k = rearrange(k.unsqueeze(-2).repeat(1, 1, 1, self.num_kv_groups, 1), 'b t h g d -> b t (h g) d') + v = rearrange(v.unsqueeze(-2).repeat(1, 1, 1, self.num_kv_groups, 1), 'b t h g d -> b t (h g) d') + + if flash_attn_func is None: + raise ImportError("Please install Flash Attention via `pip install flash-attn --no-build-isolation` first") + + # Contains at least one padding token in the sequence + if attention_mask is not None: + q, k, v, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(q, k, v, attention_mask, q_len) + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_q, max_seqlen_k = max_seq_lens + o = flash_attn_varlen_func( + q, k, v, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_q, + max_seqlen_k=max_seqlen_k, + causal=True, + window_size=(-1, -1) if self.window_size is None else (self.window_size-1, 0) + ) + o = pad_input(o, indices_q, batch_size, q_len) + else: + o = flash_attn_func( + q, k, v, + causal=True, + window_size=(-1, -1) if self.window_size is None else (self.window_size-1, 0) + ) + o = o.reshape(batch_size, q_len, self.hidden_size) + o = self.o_proj(o) + + if not output_attentions: + attentions = None + + return o, attentions, past_key_values + + def _upad_input(self, q, k, v, attention_mask, q_len): + seqlens = attention_mask.sum(-1, dtype=torch.int32) + indices_k = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_k = seqlens.max().item() + cu_seqlens_k = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0)) + batch_size, seq_len, num_key_value_heads, head_dim = k.shape + + k = index_first_axis(k.reshape(batch_size * seq_len, num_key_value_heads, head_dim), indices_k) + v = index_first_axis(v.reshape(batch_size * seq_len, num_key_value_heads, head_dim), indices_k) + if q_len == seq_len: + q = index_first_axis(q.reshape(batch_size * seq_len, self.num_heads, head_dim), indices_k) + cu_seqlens_q = cu_seqlens_k + max_seqlen_q = max_seqlen_k + indices_q = indices_k + elif q_len == 1: + max_seqlen_q = 1 + # There is a memcpy here, that is very bad. + cu_seqlens_q = torch.arange(batch_size + 1, dtype=torch.int32, device=q.device) + indices_q = cu_seqlens_q[:-1] + q = q.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -q_len:] + q, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, attention_mask) + + return q, k, v, indices_q, (cu_seqlens_q, cu_seqlens_k), (max_seqlen_q, max_seqlen_k) + + +class TransformerMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + hidden_ratio: Optional[int] = None, + intermediate_size: Optional[int] = None, + hidden_act: str = 'swish' + ) -> TransformerMLP: + super().__init__() + + self.hidden_size = hidden_size + # the final number of params is `hidden_ratio * hidden_size^2` + # `intermediate_size` is chosen to be a multiple of 256 closest to `2/3 * hidden_size * hidden_ratio` + if hidden_ratio is None: + hidden_ratio = 4 + if intermediate_size is None: + intermediate_size = int(hidden_size * hidden_ratio * 2 / 3) + intermediate_size = 256 * ((intermediate_size + 256 - 1) // 256) + self.hidden_ratio = hidden_ratio + self.intermediate_size = intermediate_size + + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[hidden_act] + + def forward(self, x): + y = self.gate_proj(x) + gate, y = y.chunk(2, -1) + # TODO: maybe wrap swiglu_linear in custom_fwd/custom_bwd + return swiglu_linear( + gate, y, + self.down_proj.weight.to(y.dtype), + self.down_proj.bias.to(y.dtype) if self.down_proj.bias is not None else self.down_proj.bias + ) + + +class TransformerBlock(nn.Module): + def __init__(self, config: TransformerConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.attn_norm = RMSNorm(hidden_size=config.hidden_size, eps=config.norm_eps) + self.attn = Attention( + hidden_size=config.hidden_size, + num_heads=config.num_heads, + num_kv_heads=config.num_kv_heads, + window_size=config.window_size, + max_position_embeddings=config.max_position_embeddings, + rope_base=config.rope_base, + use_rope=config.use_rope, + layer_idx=layer_idx + ) + self.mlp_norm = RMSNorm(hidden_size=config.hidden_size, eps=config.norm_eps) + self.mlp = TransformerMLP( + hidden_size=config.hidden_size, + hidden_ratio=config.hidden_ratio, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act + ) + + def forward_attn( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ): + # reisual handled outside + # residual = hidden_states + hidden_states = self.attn_norm(hidden_states) + hidden_states, attentions, past_key_values = self.attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions + ) + return hidden_states, attentions, past_key_values + + def forward_mlp( + self, + hidden_states: torch.Tensor, + residual: torch.Tensor, + ): + hidden_states, residual = self.mlp_norm(hidden_states, residual, True) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + gradient_checkpointing: bool = False + # **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + + residual = hidden_states + + + if gradient_checkpointing: + forward_attn = partial(torch.utils.checkpoint.checkpoint, self.forward_attn, use_reentrant=False) + forward_mlp = partial(torch.utils.checkpoint.checkpoint, self.forward_mlp, use_reentrant=False) + else: + forward_attn = self.forward_attn + forward_mlp = self.forward_mlp + + hidden_states, attentions, past_key_values = forward_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions + ) + + hidden_states = forward_mlp( + hidden_states, + residual, + ) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attentions,) + + if use_cache: + outputs += (past_key_values,) + + return outputs + + + +class TransformerPreTrainedModel(PreTrainedModel): + + config_class = TransformerConfig + supports_gradient_checkpointing = True + _no_split_modules = ['TransformerBlock'] + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + def _init_weights( + self, + module: nn.Module, + ): + if isinstance(module, (nn.Linear, nn.Conv1d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +class TransformerModel(TransformerPreTrainedModel): + + def __init__(self, config: TransformerConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList([TransformerBlock(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]) + self.norm = RMSNorm(config.hidden_size, eps=config.norm_eps) + + self.gradient_checkpointing = False + + self.post_init() + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, value): + self.embeddings = value + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None + ) -> Union[Tuple, CausalLMOutputWithPast]: + if output_attentions: + warnings.warn( + "`TransformerModel` does not support output attention weights now, so `output_attentions` is set to `False`." + ) + output_attentions = False + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids) + + # embed positions + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + all_hidden_states = () if output_hidden_states else None + all_attns = () if output_attentions else None + next_decoder_cache = None + + for layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = layer( + hidden_states, + attention_mask=attention_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + gradient_checkpointing=self.gradient_checkpointing and self.training + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_attns] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_attns + ) + + +class TransformerForCausalLM(TransformerPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = TransformerModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embeddings + + def set_input_embeddings(self, value): + self.model.embeddings = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + def prepare_inputs_for_generation( + self, + input_ids: torch.LongTensor = None, + past_key_values: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs + ): + # only last token for `inputs_ids` if the `past_key_values` is passed along. + if past_key_values is not None: + input_ids = input_ids[:, -1:] + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {'inputs_embeds': inputs_embeds} + else: + # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise + # recompiles graphs as the stride of the inputs is a guard. + # Ref: https://github.com/huggingface/transformers/pull/29114 + # TODO: use `next_tokens` directly instead. + model_inputs = {'input_ids': input_ids.contiguous()} + + model_inputs.update({ + 'past_key_values': past_key_values, + 'use_cache': kwargs.get('use_cache'), + 'attention_mask': attention_mask, + }) + return model_inputs + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict + ) + + hidden_states = outputs[0] + + loss = None + if labels is not None: + if self.config.fuse_cross_entropy: + loss_fct = FusedCrossEntropyLoss(inplace_backward=True, reduction='none') + else: + loss_fct = nn.CrossEntropyLoss(reduction='none') + logits = self.lm_head(hidden_states) + # Enable model parallelism + labels = labels.to(logits.device) + # labels = torch.cat((labels[..., 1:], torch.full_like(labels[:, :1], loss_fct.ignore_index)), 1) + loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1)) + loss = loss.view(*labels.size()) + del logits + logits = None + else: + logits = self.lm_head(hidden_states) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/no_decay_params.txt b/no_decay_params.txt new file mode 100644 index 0000000000000000000000000000000000000000..7acf4201eb540c492bd36f923aa1466d9d07cb09 --- /dev/null +++ b/no_decay_params.txt @@ -0,0 +1,5 @@ +_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight +_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight +_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight +_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight +_forward_module._fsdp_wrapped_module.model.norm.weight